77*/
88#include "tomcrypt_private.h"
99
10+ #if defined(LTC_GCM_MODE ) || defined(LTC_LRW_MODE )
11+ #if defined(LTC_GCM_PCLMUL )
12+ #if defined(_MSC_VER )
13+ #include <intrin.h>
14+ #else
15+ #include <cpuid.h>
16+ #endif
17+ #include <wmmintrin.h>
18+ #include <smmintrin.h>
19+ #include <emmintrin.h>
20+
21+ static LTC_INLINE int s_pclmul_is_supported (void )
22+ {
23+ static int initialized = 0 , is_supported = 0 ;
24+
25+ if (initialized == 0 ) {
26+ /* Test CPUID.1.0.ECX[1]
27+ * EAX = 1, ECX = 0 */
28+ #if defined(_MSC_VER )
29+ int cpuInfo [4 ];
30+ __cpuid (cpuInfo , 1 );
31+ is_supported = ((cpuInfo [2 ] >> 1 ) & 1 );
32+ #else
33+ int a = 1 , b , c = 0 , d ;
34+
35+ asm volatile ("cpuid"
36+ :"=a" (a ), "=b" (b ), "=c" (c ), "=d" (d )
37+ :"a" (a ), "c" (c )
38+ );
39+
40+ is_supported = ((c >> 1 ) & 1 );
41+ initialized = 1 ;
42+ #endif
43+ }
44+
45+ return is_supported ;
46+ }
47+
48+ /*
49+ * 128x128-bit binary polynomial multiplication for Intel x86 and x86_64
50+ * Based on "Intel Carry-Less Multiplication Instruction and its Usage for
51+ * Computing the GCM Mode", Shay Gueron, Michael E. Kounavis
52+ * https://cdrdv2-public.intel.com/836172/clmul-wp-rev-2-02-2014-04-20.pdf
53+ */
54+ LTC_GCM_PCLMUL_TARGET
55+ static void s_gfmul_pclmul (__m128i a , __m128i b , __m128i * res ){
56+ /* Page 25. Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
57+ __m128i /*tmp0, tmp1,*/ tmp2 , tmp3 , tmp4 , tmp5 , tmp6 , tmp7 , tmp8 , tmp9 ;
58+ tmp3 = _mm_clmulepi64_si128 (a , b , 0x00 );
59+ tmp4 = _mm_clmulepi64_si128 (a , b , 0x10 );
60+ tmp5 = _mm_clmulepi64_si128 (a , b , 0x01 );
61+ tmp6 = _mm_clmulepi64_si128 (a , b , 0x11 );
62+ tmp4 = _mm_xor_si128 (tmp4 , tmp5 );
63+ tmp5 = _mm_slli_si128 (tmp4 , 8 );
64+ tmp4 = _mm_srli_si128 (tmp4 , 8 );
65+ tmp3 = _mm_xor_si128 (tmp3 , tmp5 );
66+ tmp6 = _mm_xor_si128 (tmp6 , tmp4 );
67+ tmp7 = _mm_srli_epi32 (tmp3 , 31 );
68+ tmp8 = _mm_srli_epi32 (tmp6 , 31 );
69+ tmp3 = _mm_slli_epi32 (tmp3 , 1 );
70+ tmp6 = _mm_slli_epi32 (tmp6 , 1 );
71+ tmp9 = _mm_srli_si128 (tmp7 , 12 );
72+ tmp8 = _mm_slli_si128 (tmp8 , 4 );
73+ tmp7 = _mm_slli_si128 (tmp7 , 4 );
74+ tmp3 = _mm_or_si128 (tmp3 , tmp7 );
75+ tmp6 = _mm_or_si128 (tmp6 , tmp8 );
76+ tmp6 = _mm_or_si128 (tmp6 , tmp9 );
77+ tmp7 = _mm_slli_epi32 (tmp3 , 31 );
78+ tmp8 = _mm_slli_epi32 (tmp3 , 30 );
79+ tmp9 = _mm_slli_epi32 (tmp3 , 25 );
80+ tmp7 = _mm_xor_si128 (tmp7 , tmp8 );
81+ tmp7 = _mm_xor_si128 (tmp7 , tmp9 );
82+ tmp8 = _mm_srli_si128 (tmp7 , 4 );
83+ tmp7 = _mm_slli_si128 (tmp7 , 12 );
84+ tmp3 = _mm_xor_si128 (tmp3 , tmp7 );
85+ tmp2 = _mm_srli_epi32 (tmp3 , 1 );
86+ tmp4 = _mm_srli_epi32 (tmp3 , 2 );
87+ tmp5 = _mm_srli_epi32 (tmp3 , 7 );
88+ tmp2 = _mm_xor_si128 (tmp2 , tmp4 );
89+ tmp2 = _mm_xor_si128 (tmp2 , tmp5 );
90+ tmp2 = _mm_xor_si128 (tmp2 , tmp8 );
91+ tmp3 = _mm_xor_si128 (tmp3 , tmp2 );
92+ tmp6 = _mm_xor_si128 (tmp6 , tmp3 );
93+ * res = tmp6 ;
94+ }
95+
96+ LTC_GCM_PCLMUL_TARGET
97+ static void s_gcm_gf_mult_pclmul (const unsigned char * a , const unsigned char * b , unsigned char * c )
98+ {
99+ __m128i ci ;
100+ __m128i BSWAP_MASK = _mm_set_epi8 (0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 );
101+ __m128i ai = _mm_loadu_si128 ((const __m128i * ) a );
102+ __m128i bi = _mm_loadu_si128 ((const __m128i * ) b );
103+
104+ ai = _mm_shuffle_epi8 (ai , BSWAP_MASK );
105+ bi = _mm_shuffle_epi8 (bi , BSWAP_MASK );
106+
107+ s_gfmul_pclmul (ai , bi , & ci );
108+
109+ ci = _mm_shuffle_epi8 (ci , BSWAP_MASK );
110+
111+ XMEMCPY (c , & ci , sizeof (ci ));
112+ }
113+ #endif /* defined(LTC_GCM_PCLMUL) */
114+
115+ #if defined(LTC_GCM_PMULL )
116+ #if defined(__GNUC__ )
117+ #pragma GCC diagnostic push
118+ #pragma GCC diagnostic ignored "-Wbad-function-cast"
119+ #pragma GCC diagnostic ignored "-Wunused-parameter"
120+ #pragma GCC diagnostic ignored "-Wmissing-braces"
121+ #pragma GCC diagnostic ignored "-Wsign-compare"
122+ #pragma GCC diagnostic ignored "-Wshadow"
123+ #endif
124+ #include <arm_neon.h>
125+ #if defined(__GNUC__ )
126+ #pragma GCC diagnostic pop
127+ #endif
128+
129+ #if defined(__APPLE__ )
130+ #include <sys/sysctl.h>
131+ #elif defined(_WIN32 )
132+ #include <windows.h>
133+ #else
134+ #include <sys/auxv.h>
135+ #include <asm/hwcap.h>
136+ #endif
137+
138+ static LTC_INLINE int s_pmull_is_supported (void )
139+ {
140+ static int initialized = 0 , is_supported = 0 ;
141+
142+ if (initialized == 0 ) {
143+ #if defined(__APPLE__ )
144+ int val = 0 ;
145+ size_t len = sizeof (val );
146+ if (sysctlbyname ("hw.optional.arm.FEAT_PMULL" , & val , & len , NULL , 0 ) == 0 ) {
147+ is_supported = (val != 0 );
148+ }
149+ #elif defined (_WIN32 )
150+ is_supported = IsProcessorFeaturePresent (PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE );
151+ #else
152+ unsigned long hwcaps = getauxval (AT_HWCAP );
153+ is_supported = (hwcaps & HWCAP_PMULL );
154+ #endif
155+ initialized = 1 ;
156+ }
157+
158+ return is_supported ;
159+ }
160+
161+ /*
162+ * 128x128-bit binary polynomial multiplication for AArch64 using PMULL/PMULL2
163+ * Based on "Implementing GCM on ARMv8", Conrado P. L. Gouvea and Julio Lopez
164+ * https://conradoplg.modp.net/files/2010/12/gcm14.pdf
165+ */
166+ #if defined(_MSC_VER )
167+ #define GET_LOW_P64 (x ) vreinterpret_p64_u64(vcreate_u64((uint64_t)vgetq_lane_p64((x), 0)))
168+ #else
169+ #define GET_LOW_P64 (x ) vgetq_lane_p64((x), 0)
170+ #endif
171+
172+ LTC_GCM_PMULL_TARGET
173+ static void s_gfmul_pmull (uint8x16_t a , uint8x16_t b , uint8x16_t * res ) {
174+ uint8x16_t r0 , r1 , t0 , t1 , z , p ;
175+ poly64x2_t pa , pb , pt0 , pr1 , pp ;
176+
177+ z = vdupq_n_u8 (0 );
178+
179+ pa = vreinterpretq_p64_u8 (a );
180+ pb = vreinterpretq_p64_u8 (b );
181+
182+ /* Page 7. Algorithm 3 128 x 128-bit binary polynomial multiplier for ARMv8 AArch64 (PMULL) */
183+ r0 = vreinterpretq_u8_p128 (vmull_p64 (GET_LOW_P64 (pa ), GET_LOW_P64 (pb )));
184+ r1 = vreinterpretq_u8_p128 (vmull_high_p64 (pa , pb ));
185+ t0 = vextq_u8 (b , b , 8 );
186+ pt0 = vreinterpretq_p64_u8 (t0 );
187+
188+ t1 = vreinterpretq_u8_p128 (vmull_p64 (GET_LOW_P64 (pa ), GET_LOW_P64 (pt0 )));
189+ t0 = vreinterpretq_u8_p128 (vmull_high_p64 (pa , pt0 ));
190+ t0 = veorq_u8 (t0 , t1 );
191+ t1 = vextq_u8 (z , t0 , 8 );
192+ r0 = veorq_u8 (r0 , t1 );
193+ t1 = vextq_u8 (t0 , z , 8 );
194+ r1 = veorq_u8 (r1 , t1 );
195+
196+ /* Page 8. Algorithm 5 256-bit to 128-bit GCM polynomial reduction for ARMv8 AAarch64 using PMULL */
197+ p = vreinterpretq_u8_u64 (vdupq_n_u64 (0x0000000000000087ULL ));
198+ pp = vreinterpretq_p64_u8 (p );
199+ pr1 = vreinterpretq_p64_u8 (r1 );
200+ t0 = vreinterpretq_u8_p128 (vmull_high_p64 (pr1 , pp ));
201+ t1 = vextq_u8 (t0 , z , 8 );
202+ r1 = veorq_u8 (r1 , t1 );
203+ t1 = vextq_u8 (z , t0 , 8 );
204+ r0 = veorq_u8 (r0 , t1 );
205+ pr1 = vreinterpretq_p64_u8 (r1 );
206+
207+ t0 = vreinterpretq_u8_p128 (vmull_p64 (GET_LOW_P64 (pr1 ), GET_LOW_P64 (pp )));
208+ a = veorq_u8 (r0 , t0 );
209+
210+ * res = a ;
211+ }
212+
213+ LTC_GCM_PMULL_TARGET
214+ static void s_gcm_gf_mult_pmull (const unsigned char * a , const unsigned char * b , unsigned char * c )
215+ {
216+ uint8x16_t va , vb , vc ;
217+
218+ va = vld1q_u8 (a );
219+ vb = vld1q_u8 (b );
220+ va = vrbitq_u8 (va );
221+ vb = vrbitq_u8 (vb );
222+
223+ s_gfmul_pmull (va , vb , & vc );
224+
225+ vc = vrbitq_u8 (vc );
226+
227+ XMEMCPY (c , & vc , sizeof (vc ));
228+ }
229+
230+ #endif /* defined(LTC_GCM_PMULL) */
231+ #endif /* defined(LTC_GCM_MODE) || defined(LTC_LRW_MODE) */
232+
10233#if defined(LTC_GCM_TABLES ) || defined(LTC_LRW_TABLES ) || (defined(LTC_GCM_MODE ) && defined(LTC_FAST ))
11234
12235/* this is x*2^128 mod p(x) ... the results are 16 bytes each stored in a packed format. Since only the
@@ -50,6 +273,7 @@ const unsigned char gcm_shift_table[256*2] = {
50273
51274#if defined(LTC_GCM_MODE ) || defined(LTC_LRW_MODE )
52275
276+
53277#ifndef LTC_FAST
54278/* right shift */
55279static void s_gcm_rightshift (unsigned char * a )
@@ -72,7 +296,7 @@ static const unsigned char poly[] = { 0x00, 0xE1 };
72296 @param b Second value
73297 @param c Destination for a * b
74298 */
75- void gcm_gf_mult (const unsigned char * a , const unsigned char * b , unsigned char * c )
299+ static void s_gcm_gf_mult_sw (const unsigned char * a , const unsigned char * b , unsigned char * c )
76300{
77301 unsigned char Z [16 ], V [16 ];
78302 unsigned char x , y , z ;
@@ -106,7 +330,7 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *
106330 @param b Second value
107331 @param c Destination for a * b
108332 */
109- void gcm_gf_mult (const unsigned char * a , const unsigned char * b , unsigned char * c )
333+ static void s_gcm_gf_mult_sw (const unsigned char * a , const unsigned char * b , unsigned char * c )
110334{
111335 int i , j , k , u ;
112336 LTC_FAST_TYPE B [16 ][WPV ], tmp [32 / sizeof (LTC_FAST_TYPE )], pB [16 / sizeof (LTC_FAST_TYPE )], zz , z ;
@@ -209,5 +433,29 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *
209433
210434#endif
211435
436+ /**
437+ GCM GF multiplier (internal use only)
438+ @param a First value
439+ @param b Second value
440+ @param c Destination for a * b
441+ */
442+ void gcm_gf_mult (const unsigned char * a , const unsigned char * b , unsigned char * c )
443+ {
444+ #if defined(LTC_GCM_PCLMUL )
445+ if (s_pclmul_is_supported ()) {
446+ s_gcm_gf_mult_pclmul (a , b , c );
447+ return ;
448+ }
449+ #endif
450+ #if defined(LTC_GCM_PMULL )
451+ if (s_pmull_is_supported ()) {
452+ s_gcm_gf_mult_pmull (a , b , c );
453+ return ;
454+ }
455+ #endif
456+ s_gcm_gf_mult_sw (a , b , c );
457+ }
458+
459+
212460#endif
213461
0 commit comments