@@ -217,7 +217,7 @@ void GetHashValue_AVX512_32f_16Elements(float GTWG[3][16], int passIdx, int32_t
217217
218218 angle_ps = _mm512_add_ps ( angle_ps, _mm512_mask_blend_ps ( _mm512_cmp_ps_mask (angle_ps, zero_ps, _CMP_LT_OQ), zero_ps, _mm512_set1_ps (pi)));
219219
220- // fast sqrt with 11 bit accuracy
220+ // fast sqrt
221221 __m512 sqrtL1_ps = _mm512_rcp14_ps ( _mm512_rsqrt14_ps ( L1_ps ));
222222 __m512 sqrtL2_ps = _mm512_rcp14_ps ( _mm512_rsqrt14_ps ( L2_ps ));
223223
@@ -227,12 +227,8 @@ void GetHashValue_AVX512_32f_16Elements(float GTWG[3][16], int passIdx, int32_t
227227
228228 __m512i angleIdx_epi32 = _mm512_cvtps_epi32 ( _mm512_floor_ps (_mm512_mul_ps (angle_ps, _mm512_set1_ps (gQAngle ))));
229229 __m512i quantAngle_lessone_epi32 = _mm512_sub_epi32 (_mm512_set1_epi32 (gQuantizationAngle ), one_epi32);
230- angleIdx_epi32 = _mm512_mask_blend_epi32 ( _mm512_cmp_epi32_mask ( angleIdx_epi32, quantAngle_lessone_epi32, _MM_CMPINT_GT),
231- _mm512_mask_blend_epi32 (_mm512_cmp_epi32_mask ( angleIdx_epi32, zero_epi32, _MM_CMPINT_LT),
232- angleIdx_epi32,
233- zero_epi32),
234- quantAngle_lessone_epi32);
235-
230+ angleIdx_epi32 = _mm512_min_epi32 ( _mm512_sub_epi32 ( _mm512_set1_epi32 (gQuantizationAngle ), _mm512_set1_epi32 (1 )),
231+ _mm512_max_epi32 (angleIdx_epi32, zero_epi32 ) );
236232
237233 // AFAIK, today QStr & QCoh are vectors of size 2. I think searchsorted can return an index of 0,1, or 2
238234 float *gQStr_data , *gQCoh_data ;
@@ -243,22 +239,20 @@ void GetHashValue_AVX512_32f_16Elements(float GTWG[3][16], int passIdx, int32_t
243239 __m512 gQCoh1_ps = _mm512_set1_ps (gQCoh_data [0 ]);
244240 __m512 gQCoh2_ps = _mm512_set1_ps (gQCoh_data [1 ]);
245241
246- __m512i strengthIdx_epi32 = _mm512_mask_blend_epi32 (_mm512_cmp_ps_mask (gQStr1_ps , strength_ps, _MM_CMPINT_LE),
247- zero_epi32,
248- _mm512_mask_blend_epi32 (_mm512_cmp_ps_mask (gQStr2_ps , strength_ps, _MM_CMPINT_LE),
249- two_epi32,
250- one_epi32));
251- __m512i coherenceIdx_epi32 = _mm512_mask_blend_epi32 (_mm512_cmp_ps_mask (gQCoh1_ps , coherence_ps, _MM_CMPINT_LE),
252- zero_epi32,
253- _mm512_mask_blend_epi32 (_mm512_cmp_ps_mask (gQCoh2_ps , coherence_ps, _MM_CMPINT_LE),
254- two_epi32,
255- one_epi32));
256-
257- const __m512i gQuantizationCoherence_epi32 = _mm512_set1_epi32 (gQuantizationCoherence );
242+ __m512i strengthIdx_epi32 =
243+ _mm512_add_epi32 (
244+ _mm512_mask_blend_epi32 (_mm512_cmp_ps_mask (gQStr1_ps , strength_ps, _MM_CMPINT_LE),zero_epi32, one_epi32),
245+ _mm512_mask_blend_epi32 (_mm512_cmp_ps_mask (gQStr2_ps , strength_ps, _MM_CMPINT_LE),zero_epi32, one_epi32));
246+ __m512i coherenceIdx_epi32 =
247+ _mm512_add_epi32 (
248+ _mm512_mask_blend_epi32 (_mm512_cmp_ps_mask (gQCoh1_ps , coherence_ps, _MM_CMPINT_LE),zero_epi32, one_epi32),
249+ _mm512_mask_blend_epi32 (_mm512_cmp_ps_mask (gQCoh2_ps , coherence_ps, _MM_CMPINT_LE),zero_epi32, one_epi32));
250+
251+ const __m512i gQuantizationCoherence_epi32 = _mm512_set1_epi32 (gQuantizationCoherence );
258252 __m512i idx_epi32 = _mm512_mullo_epi32 (gQuantizationCoherence_epi32 ,
259- _mm512_mullo_epi32 ( ( angleIdx_epi32) , _mm512_set1_epi32 (gQuantizationStrength )));
260- idx_epi32 = _mm512_add_epi32 (( coherenceIdx_epi32) ,
261- _mm512_add_epi32 (idx_epi32, _mm512_mullo_epi32 (( strengthIdx_epi32) , gQuantizationCoherence_epi32 )));
253+ _mm512_mullo_epi32 (angleIdx_epi32, _mm512_set1_epi32 (gQuantizationStrength )));
254+ idx_epi32 = _mm512_add_epi32 (coherenceIdx_epi32,
255+ _mm512_add_epi32 (idx_epi32, _mm512_mullo_epi32 (strengthIdx_epi32, gQuantizationCoherence_epi32 )));
262256
263257 _mm512_storeu_si512 ((__m512i *)idx, idx_epi32);
264258}
0 commit comments