Skip to content

Commit ecc3686

Browse files
authored
Merge pull request #714 from sjlombardo/develop
Optimize gcm_gf_mult using PCLMULQDQ and PMULL
2 parents 6ad2ce9 + 93eed8d commit ecc3686

3 files changed

Lines changed: 277 additions & 5 deletions

File tree

.github/workflows/main.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ jobs:
139139
os: [ ubuntu-22.04, ubuntu-24.04, ubuntu-22.04-arm, ubuntu-24.04-arm ]
140140
build_type: [ '', -DCMAKE_BUILD_TYPE=Debug, -DCMAKE_BUILD_TYPE=Release, -DCMAKE_BUILD_TYPE=RelWithDebInfo, -DCMAKE_BUILD_TYPE=MinSizeRel ]
141141
cc: [ clang, gcc ]
142+
cflags: [ '', '-DLTC_CFLAGS="-DLTC_NO_ASM"']
142143
config:
143144
# Static library build
144145
- { CMAKEOPTIONS: '-DBUILD_SHARED_LIBS=Off' }
@@ -161,12 +162,12 @@ jobs:
161162
run: |
162163
mkdir build
163164
cd build
164-
CC=${{ matrix.cc }} cmake ${{ matrix.config.CMAKEOPTIONS }} ${{ matrix.build_type }} ..
165+
CC=${{ matrix.cc }} cmake ${{ matrix.config.CMAKEOPTIONS }} ${{ matrix.build_type }} ${{ matrix.cflags }} ..
165166
make -j$(nproc)
166167
- name: test
167168
run: |
168169
cd build
169-
CC=${{ matrix.cc }} cmake ${{ matrix.config.CMAKEOPTIONS }} ${{ matrix.build_type }} -DBUILD_TESTING=On ..
170+
CC=${{ matrix.cc }} cmake ${{ matrix.config.CMAKEOPTIONS }} ${{ matrix.build_type }} ${{ matrix.cflags }} -DBUILD_TESTING=On ..
170171
make -j$(nproc)
171172
ctest
172173
- name: error logs

src/encauth/gcm/gcm_gf_mult.c

Lines changed: 250 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,229 @@
77
*/
88
#include "tomcrypt_private.h"
99

10+
#if defined(LTC_GCM_MODE) || defined(LTC_LRW_MODE)
11+
#if defined(LTC_GCM_PCLMUL)
12+
#if defined(_MSC_VER)
13+
#include <intrin.h>
14+
#else
15+
#include <cpuid.h>
16+
#endif
17+
#include <wmmintrin.h>
18+
#include <smmintrin.h>
19+
#include <emmintrin.h>
20+
21+
static LTC_INLINE int s_pclmul_is_supported(void)
22+
{
23+
static int initialized = 0, is_supported = 0;
24+
25+
if (initialized == 0) {
26+
/* Test CPUID.1.0.ECX[1]
27+
* EAX = 1, ECX = 0 */
28+
#if defined(_MSC_VER)
29+
int cpuInfo[4];
30+
__cpuid(cpuInfo, 1);
31+
is_supported = ((cpuInfo[2] >> 1) & 1);
32+
#else
33+
int a = 1 , b, c = 0, d;
34+
35+
asm volatile ("cpuid"
36+
:"=a"(a), "=b"(b), "=c"(c), "=d"(d)
37+
:"a"(a), "c"(c)
38+
);
39+
40+
is_supported = ((c >> 1) & 1);
41+
initialized = 1;
42+
#endif
43+
}
44+
45+
return is_supported;
46+
}
47+
48+
/*
49+
* 128x128-bit binary polynomial multiplication for Intel x86 and x86_64
50+
* Based on "Intel Carry-Less Multiplication Instruction and its Usage for
51+
* Computing the GCM Mode", Shay Gueron, Michael E. Kounavis
52+
* https://cdrdv2-public.intel.com/836172/clmul-wp-rev-2-02-2014-04-20.pdf
53+
*/
54+
LTC_GCM_PCLMUL_TARGET
55+
static void s_gfmul_pclmul(__m128i a, __m128i b, __m128i *res){
56+
/* Page 25. Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
57+
__m128i /*tmp0, tmp1,*/ tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
58+
tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
59+
tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
60+
tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
61+
tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
62+
tmp4 = _mm_xor_si128(tmp4, tmp5);
63+
tmp5 = _mm_slli_si128(tmp4, 8);
64+
tmp4 = _mm_srli_si128(tmp4, 8);
65+
tmp3 = _mm_xor_si128(tmp3, tmp5);
66+
tmp6 = _mm_xor_si128(tmp6, tmp4);
67+
tmp7 = _mm_srli_epi32(tmp3, 31);
68+
tmp8 = _mm_srli_epi32(tmp6, 31);
69+
tmp3 = _mm_slli_epi32(tmp3, 1);
70+
tmp6 = _mm_slli_epi32(tmp6, 1);
71+
tmp9 = _mm_srli_si128(tmp7, 12);
72+
tmp8 = _mm_slli_si128(tmp8, 4);
73+
tmp7 = _mm_slli_si128(tmp7, 4);
74+
tmp3 = _mm_or_si128(tmp3, tmp7);
75+
tmp6 = _mm_or_si128(tmp6, tmp8);
76+
tmp6 = _mm_or_si128(tmp6, tmp9);
77+
tmp7 = _mm_slli_epi32(tmp3, 31);
78+
tmp8 = _mm_slli_epi32(tmp3, 30);
79+
tmp9 = _mm_slli_epi32(tmp3, 25);
80+
tmp7 = _mm_xor_si128(tmp7, tmp8);
81+
tmp7 = _mm_xor_si128(tmp7, tmp9);
82+
tmp8 = _mm_srli_si128(tmp7, 4);
83+
tmp7 = _mm_slli_si128(tmp7, 12);
84+
tmp3 = _mm_xor_si128(tmp3, tmp7);
85+
tmp2 = _mm_srli_epi32(tmp3, 1);
86+
tmp4 = _mm_srli_epi32(tmp3, 2);
87+
tmp5 = _mm_srli_epi32(tmp3, 7);
88+
tmp2 = _mm_xor_si128(tmp2, tmp4);
89+
tmp2 = _mm_xor_si128(tmp2, tmp5);
90+
tmp2 = _mm_xor_si128(tmp2, tmp8);
91+
tmp3 = _mm_xor_si128(tmp3, tmp2);
92+
tmp6 = _mm_xor_si128(tmp6, tmp3);
93+
*res = tmp6;
94+
}
95+
96+
LTC_GCM_PCLMUL_TARGET
97+
static void s_gcm_gf_mult_pclmul(const unsigned char *a, const unsigned char *b, unsigned char *c)
98+
{
99+
__m128i ci;
100+
__m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
101+
__m128i ai = _mm_loadu_si128((const __m128i *) a);
102+
__m128i bi = _mm_loadu_si128((const __m128i *) b);
103+
104+
ai = _mm_shuffle_epi8(ai, BSWAP_MASK);
105+
bi = _mm_shuffle_epi8(bi, BSWAP_MASK);
106+
107+
s_gfmul_pclmul(ai, bi, &ci);
108+
109+
ci = _mm_shuffle_epi8(ci, BSWAP_MASK);
110+
111+
XMEMCPY(c, &ci, sizeof(ci));
112+
}
113+
#endif /* defined(LTC_GCM_PCLMUL) */
114+
115+
#if defined(LTC_GCM_PMULL)
116+
#if defined(__GNUC__)
117+
#pragma GCC diagnostic push
118+
#pragma GCC diagnostic ignored "-Wbad-function-cast"
119+
#pragma GCC diagnostic ignored "-Wunused-parameter"
120+
#pragma GCC diagnostic ignored "-Wmissing-braces"
121+
#pragma GCC diagnostic ignored "-Wsign-compare"
122+
#pragma GCC diagnostic ignored "-Wshadow"
123+
#endif
124+
#include <arm_neon.h>
125+
#if defined(__GNUC__)
126+
#pragma GCC diagnostic pop
127+
#endif
128+
129+
#if defined(__APPLE__)
130+
#include <sys/sysctl.h>
131+
#elif defined(_WIN32)
132+
#include <windows.h>
133+
#else
134+
#include <sys/auxv.h>
135+
#include <asm/hwcap.h>
136+
#endif
137+
138+
static LTC_INLINE int s_pmull_is_supported(void)
139+
{
140+
static int initialized = 0, is_supported = 0;
141+
142+
if (initialized == 0) {
143+
#if defined(__APPLE__)
144+
int val = 0;
145+
size_t len = sizeof(val);
146+
if (sysctlbyname("hw.optional.arm.FEAT_PMULL", &val, &len, NULL, 0) == 0) {
147+
is_supported = (val != 0);
148+
}
149+
#elif defined (_WIN32)
150+
is_supported = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
151+
#else
152+
unsigned long hwcaps = getauxval(AT_HWCAP);
153+
is_supported = (hwcaps & HWCAP_PMULL);
154+
#endif
155+
initialized = 1;
156+
}
157+
158+
return is_supported;
159+
}
160+
161+
/*
162+
* 128x128-bit binary polynomial multiplication for AArch64 using PMULL/PMULL2
163+
* Based on "Implementing GCM on ARMv8", Conrado P. L. Gouvea and Julio Lopez
164+
* https://conradoplg.modp.net/files/2010/12/gcm14.pdf
165+
*/
166+
#if defined(_MSC_VER)
167+
#define GET_LOW_P64(x) vreinterpret_p64_u64(vcreate_u64((uint64_t)vgetq_lane_p64((x), 0)))
168+
#else
169+
#define GET_LOW_P64(x) vgetq_lane_p64((x), 0)
170+
#endif
171+
172+
LTC_GCM_PMULL_TARGET
173+
static void s_gfmul_pmull(uint8x16_t a, uint8x16_t b, uint8x16_t *res) {
174+
uint8x16_t r0, r1, t0, t1, z, p;
175+
poly64x2_t pa, pb, pt0, pr1, pp;
176+
177+
z = vdupq_n_u8(0);
178+
179+
pa = vreinterpretq_p64_u8(a);
180+
pb = vreinterpretq_p64_u8(b);
181+
182+
/* Page 7. Algorithm 3 128 x 128-bit binary polynomial multiplier for ARMv8 AArch64 (PMULL) */
183+
r0 = vreinterpretq_u8_p128(vmull_p64(GET_LOW_P64(pa), GET_LOW_P64(pb)));
184+
r1 = vreinterpretq_u8_p128(vmull_high_p64(pa, pb));
185+
t0 = vextq_u8(b, b, 8);
186+
pt0 = vreinterpretq_p64_u8(t0);
187+
188+
t1 = vreinterpretq_u8_p128(vmull_p64(GET_LOW_P64(pa), GET_LOW_P64(pt0)));
189+
t0 = vreinterpretq_u8_p128(vmull_high_p64(pa, pt0));
190+
t0 = veorq_u8(t0, t1);
191+
t1 = vextq_u8(z, t0, 8);
192+
r0 = veorq_u8(r0, t1);
193+
t1 = vextq_u8(t0, z, 8);
194+
r1 = veorq_u8(r1, t1);
195+
196+
/* Page 8. Algorithm 5 256-bit to 128-bit GCM polynomial reduction for ARMv8 AAarch64 using PMULL */
197+
p = vreinterpretq_u8_u64(vdupq_n_u64(0x0000000000000087ULL));
198+
pp = vreinterpretq_p64_u8(p);
199+
pr1 = vreinterpretq_p64_u8(r1);
200+
t0 = vreinterpretq_u8_p128(vmull_high_p64(pr1, pp));
201+
t1 = vextq_u8(t0, z, 8);
202+
r1 = veorq_u8(r1, t1);
203+
t1 = vextq_u8(z, t0, 8);
204+
r0 = veorq_u8(r0, t1);
205+
pr1 = vreinterpretq_p64_u8(r1);
206+
207+
t0 = vreinterpretq_u8_p128(vmull_p64(GET_LOW_P64(pr1), GET_LOW_P64(pp)));
208+
a = veorq_u8(r0, t0);
209+
210+
*res = a;
211+
}
212+
213+
LTC_GCM_PMULL_TARGET
214+
static void s_gcm_gf_mult_pmull(const unsigned char *a, const unsigned char *b, unsigned char *c)
215+
{
216+
uint8x16_t va, vb, vc;
217+
218+
va = vld1q_u8(a);
219+
vb = vld1q_u8(b);
220+
va = vrbitq_u8(va);
221+
vb = vrbitq_u8(vb);
222+
223+
s_gfmul_pmull(va, vb, &vc);
224+
225+
vc = vrbitq_u8(vc);
226+
227+
XMEMCPY(c, &vc, sizeof(vc));
228+
}
229+
230+
#endif /* defined(LTC_GCM_PMULL) */
231+
#endif /* defined(LTC_GCM_MODE) || defined(LTC_LRW_MODE) */
232+
10233
#if defined(LTC_GCM_TABLES) || defined(LTC_LRW_TABLES) || (defined(LTC_GCM_MODE) && defined(LTC_FAST))
11234

12235
/* this is x*2^128 mod p(x) ... the results are 16 bytes each stored in a packed format. Since only the
@@ -50,6 +273,7 @@ const unsigned char gcm_shift_table[256*2] = {
50273

51274
#if defined(LTC_GCM_MODE) || defined(LTC_LRW_MODE)
52275

276+
53277
#ifndef LTC_FAST
54278
/* right shift */
55279
static void s_gcm_rightshift(unsigned char *a)
@@ -72,7 +296,7 @@ static const unsigned char poly[] = { 0x00, 0xE1 };
72296
@param b Second value
73297
@param c Destination for a * b
74298
*/
75-
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
299+
static void s_gcm_gf_mult_sw(const unsigned char *a, const unsigned char *b, unsigned char *c)
76300
{
77301
unsigned char Z[16], V[16];
78302
unsigned char x, y, z;
@@ -106,7 +330,7 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *
106330
@param b Second value
107331
@param c Destination for a * b
108332
*/
109-
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
333+
static void s_gcm_gf_mult_sw(const unsigned char *a, const unsigned char *b, unsigned char *c)
110334
{
111335
int i, j, k, u;
112336
LTC_FAST_TYPE B[16][WPV], tmp[32 / sizeof(LTC_FAST_TYPE)], pB[16 / sizeof(LTC_FAST_TYPE)], zz, z;
@@ -209,5 +433,29 @@ void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *
209433

210434
#endif
211435

436+
/**
437+
GCM GF multiplier (internal use only)
438+
@param a First value
439+
@param b Second value
440+
@param c Destination for a * b
441+
*/
442+
void gcm_gf_mult(const unsigned char *a, const unsigned char *b, unsigned char *c)
443+
{
444+
#if defined(LTC_GCM_PCLMUL)
445+
if(s_pclmul_is_supported()) {
446+
s_gcm_gf_mult_pclmul(a, b, c);
447+
return;
448+
}
449+
#endif
450+
#if defined(LTC_GCM_PMULL)
451+
if(s_pmull_is_supported()) {
452+
s_gcm_gf_mult_pmull(a, b, c);
453+
return;
454+
}
455+
#endif
456+
s_gcm_gf_mult_sw(a, b, c);
457+
}
458+
459+
212460
#endif
213461

src/headers/tomcrypt_cfg.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ LTC_EXPORT void LTC_CALL XFREE(void *p);
3131

3232
LTC_EXPORT void LTC_CALL XQSORT(void *base, size_t nmemb, size_t size, int(*compar)(const void *, const void *));
3333

34-
3534
/* change the clock function too */
3635
LTC_EXPORT clock_t LTC_CALL XCLOCK(void);
3736

@@ -250,6 +249,8 @@ typedef unsigned long ltc_mp_digit;
250249
#define LTC_NO_CTZL
251250
#define LTC_NO_ROLC
252251
#define LTC_NO_ROTATE
252+
#define LTC_NO_GCM_PCLMUL
253+
#define LTC_NO_GCM_PMULL
253254
#endif
254255

255256
/* No LTC_FAST if: explicitly disabled OR non-gcc/non-clang compiler OR old gcc OR using -ansi -std=c99 */
@@ -367,4 +368,26 @@ typedef unsigned long ltc_mp_digit;
367368
# define LTC_ATTRIBUTE(x)
368369
#endif
369370

371+
#if !defined(LTC_NO_GCM_PCLMUL) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_IX86))
372+
#define LTC_GCM_PCLMUL
373+
#undef LTC_GCM_TABLES
374+
#endif
375+
376+
#if defined(__clang__) || defined(__GNUC__)
377+
#define LTC_GCM_PCLMUL_TARGET __attribute__((target("pclmul,ssse3")))
378+
#else
379+
#define LTC_GCM_PCLMUL_TARGET
380+
#endif
381+
382+
#if !defined(LTC_NO_GCM_PMULL) && (defined(__aarch64__) || defined(_M_ARM64))
383+
#define LTC_GCM_PMULL
384+
#undef LTC_GCM_TABLES
385+
#endif
386+
387+
#if defined(LTC_GCM_PMULL) && (defined(__clang__) || defined(__GNUC__))
388+
#define LTC_GCM_PMULL_TARGET __attribute__((target("+crypto")))
389+
#else
390+
#define LTC_GCM_PMULL_TARGET
391+
#endif
392+
370393
#endif /* TOMCRYPT_CFG_H */

0 commit comments

Comments
 (0)