Skip to content

Commit 6a5a941

Browse files
cabirdme authored and xiaoxial committed
bug fixes to GetHash512_16Element
1 parent 29f4826 commit 6a5a941

1 file changed

Lines changed: 16 additions & 22 deletions

File tree

Library/Raisr_AVX512.cpp

Lines changed: 16 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -217,7 +217,7 @@ void GetHashValue_AVX512_32f_16Elements(float GTWG[3][16], int passIdx, int32_t
217217

218218
angle_ps = _mm512_add_ps ( angle_ps, _mm512_mask_blend_ps( _mm512_cmp_ps_mask(angle_ps, zero_ps, _CMP_LT_OQ), zero_ps, _mm512_set1_ps(pi)));
219219

220-
// fast sqrt with 11 bit accuracy
220+
// fast sqrt
221221
__m512 sqrtL1_ps = _mm512_rcp14_ps( _mm512_rsqrt14_ps( L1_ps ));
222222
__m512 sqrtL2_ps = _mm512_rcp14_ps( _mm512_rsqrt14_ps( L2_ps ));
223223

@@ -227,12 +227,8 @@ void GetHashValue_AVX512_32f_16Elements(float GTWG[3][16], int passIdx, int32_t
227227

228228
__m512i angleIdx_epi32 = _mm512_cvtps_epi32( _mm512_floor_ps(_mm512_mul_ps (angle_ps, _mm512_set1_ps(gQAngle))));
229229
__m512i quantAngle_lessone_epi32 = _mm512_sub_epi32(_mm512_set1_epi32(gQuantizationAngle), one_epi32);
230-
angleIdx_epi32 = _mm512_mask_blend_epi32( _mm512_cmp_epi32_mask( angleIdx_epi32, quantAngle_lessone_epi32, _MM_CMPINT_GT),
231-
_mm512_mask_blend_epi32(_mm512_cmp_epi32_mask( angleIdx_epi32, zero_epi32, _MM_CMPINT_LT),
232-
angleIdx_epi32,
233-
zero_epi32),
234-
quantAngle_lessone_epi32);
235-
230+
angleIdx_epi32 = _mm512_min_epi32( _mm512_sub_epi32( _mm512_set1_epi32(gQuantizationAngle), _mm512_set1_epi32(1)),
231+
_mm512_max_epi32(angleIdx_epi32, zero_epi32 ) );
236232

237233
// AFAIK, today QStr & QCoh are vectors of size 2. I think searchsorted can return an index of 0,1, or 2
238234
float *gQStr_data, *gQCoh_data;
@@ -243,22 +239,20 @@ void GetHashValue_AVX512_32f_16Elements(float GTWG[3][16], int passIdx, int32_t
243239
__m512 gQCoh1_ps = _mm512_set1_ps(gQCoh_data[0]);
244240
__m512 gQCoh2_ps = _mm512_set1_ps(gQCoh_data[1]);
245241

246-
__m512i strengthIdx_epi32 = _mm512_mask_blend_epi32(_mm512_cmp_ps_mask(gQStr1_ps, strength_ps, _MM_CMPINT_LE),
247-
zero_epi32,
248-
_mm512_mask_blend_epi32(_mm512_cmp_ps_mask(gQStr2_ps, strength_ps, _MM_CMPINT_LE),
249-
two_epi32,
250-
one_epi32));
251-
__m512i coherenceIdx_epi32 = _mm512_mask_blend_epi32(_mm512_cmp_ps_mask(gQCoh1_ps, coherence_ps, _MM_CMPINT_LE),
252-
zero_epi32,
253-
_mm512_mask_blend_epi32(_mm512_cmp_ps_mask(gQCoh2_ps, coherence_ps, _MM_CMPINT_LE),
254-
two_epi32,
255-
one_epi32));
256-
257-
const __m512i gQuantizationCoherence_epi32 = _mm512_set1_epi32(gQuantizationCoherence);
242+
__m512i strengthIdx_epi32 =
243+
_mm512_add_epi32(
244+
_mm512_mask_blend_epi32(_mm512_cmp_ps_mask(gQStr1_ps, strength_ps, _MM_CMPINT_LE),zero_epi32, one_epi32),
245+
_mm512_mask_blend_epi32(_mm512_cmp_ps_mask(gQStr2_ps, strength_ps, _MM_CMPINT_LE),zero_epi32, one_epi32));
246+
__m512i coherenceIdx_epi32 =
247+
_mm512_add_epi32(
248+
_mm512_mask_blend_epi32(_mm512_cmp_ps_mask(gQCoh1_ps, coherence_ps, _MM_CMPINT_LE),zero_epi32, one_epi32),
249+
_mm512_mask_blend_epi32(_mm512_cmp_ps_mask(gQCoh2_ps, coherence_ps, _MM_CMPINT_LE),zero_epi32, one_epi32));
250+
251+
const __m512i gQuantizationCoherence_epi32 = _mm512_set1_epi32(gQuantizationCoherence);
258252
__m512i idx_epi32 = _mm512_mullo_epi32(gQuantizationCoherence_epi32,
259-
_mm512_mullo_epi32( (angleIdx_epi32), _mm512_set1_epi32(gQuantizationStrength)));
260-
idx_epi32 = _mm512_add_epi32((coherenceIdx_epi32),
261-
_mm512_add_epi32(idx_epi32, _mm512_mullo_epi32((strengthIdx_epi32), gQuantizationCoherence_epi32)));
253+
_mm512_mullo_epi32(angleIdx_epi32, _mm512_set1_epi32(gQuantizationStrength)));
254+
idx_epi32 = _mm512_add_epi32(coherenceIdx_epi32,
255+
_mm512_add_epi32(idx_epi32, _mm512_mullo_epi32(strengthIdx_epi32, gQuantizationCoherence_epi32)));
262256

263257
_mm512_storeu_si512((__m512i *)idx, idx_epi32);
264258
}

0 commit comments

Comments
 (0)