Skip to content

Commit c8de4fb

Browse files
cabirdme authored and xiaoxial committed
GetHashFP16: multiply (scale) the values by 100 because they are otherwise too small.
1 parent 0bc8f0a commit c8de4fb

1 file changed

Lines changed: 15 additions & 0 deletions

File tree

Library/Raisr_AVX512FP16.cpp

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -399,6 +399,12 @@ void GetHashValue_AVX512FP16_16h_8Elements(_Float16 GTWG[3][32], int passIdx, in
399399
__m128h m_b_ph = _mm_load_ph(GTWG[1]);
400400
__m128h m_d_ph = _mm_load_ph(GTWG[2]);
401401

402+
// GTWG values are very small. Let's multiply by 100 to make them bigger (scaled)
403+
__m128h c = _mm_set1_ph(100);
404+
m_a_ph = _mm_mul_ph(m_a_ph, c);
405+
m_b_ph = _mm_mul_ph(m_b_ph, c);
406+
m_d_ph = _mm_mul_ph(m_d_ph, c);
407+
402408
__m128h T_ph = _mm_add_ph(m_a_ph, m_d_ph);
403409
__m128h D_ph = _mm_sub_ph( _mm_mul_ph( m_a_ph, m_d_ph),
404410
_mm_mul_ph( m_b_ph, m_b_ph));
@@ -431,6 +437,7 @@ void GetHashValue_AVX512FP16_16h_8Elements(_Float16 GTWG[3][32], int passIdx, in
431437
__m128h coherence_ph = _mm_div_ph( _mm_sub_ph( sqrtL1_ph, sqrtL2_ph ),
432438
_mm_add_ph( _mm_add_ph(sqrtL1_ph, sqrtL2_ph), _mm_set1_ph(near_zero) ) );
433439
__m128h strength_ph = L1_ph;
440+
strength_ph = _mm_div_ph(strength_ph, c); // scale back because we scaled it up earlier
434441

435442
__m128i angleIdx_epi16 = floor_ph_128( _mm_mul_ph (angle_ph, _mm_set1_ph(gQAngle)));
436443
__m128i quantAngle_lessone_epi16 = _mm_sub_epi16(_mm_set1_epi16(gQuantizationAngle), one_epi16);
@@ -507,7 +514,14 @@ void GetHashValue_AVX512FP16_16h_32Elements(_Float16 GTWG[3][32], int passIdx, i
507514
__m512h m_b_ph = _mm512_load_ph( GTWG[1]);
508515
__m512h m_d_ph = _mm512_load_ph( GTWG[2]);
509516

517+
// GTWG values are very small. Let's multiply by 100 to make them bigger (scaled)
518+
__m512h c = _mm512_set1_ph(100);
519+
m_a_ph = _mm512_mul_ph(m_a_ph, c);
520+
m_b_ph = _mm512_mul_ph(m_b_ph, c);
521+
m_d_ph = _mm512_mul_ph(m_d_ph, c);
522+
510523
__m512h T_ph = _mm512_add_ph(m_a_ph, m_d_ph);
524+
511525
__m512h D_ph = _mm512_sub_ph( _mm512_mul_ph( m_a_ph, m_d_ph),
512526
_mm512_mul_ph( m_b_ph, m_b_ph));
513527

@@ -539,6 +553,7 @@ void GetHashValue_AVX512FP16_16h_32Elements(_Float16 GTWG[3][32], int passIdx, i
539553
__m512h coherence_ph = _mm512_div_ph( _mm512_sub_ph( sqrtL1_ph, sqrtL2_ph ),
540554
_mm512_add_ph( _mm512_add_ph(sqrtL1_ph, sqrtL2_ph), _mm512_set1_ph(near_zero) ) );
541555
__m512h strength_ph = L1_ph;
556+
strength_ph = _mm512_div_ph(strength_ph, c);
542557

543558
__m512i angleIdx_epi16 = floor_ph_512(_mm512_mul_ph (angle_ph, _mm512_set1_ph(gQAngle)));
544559

0 commit comments

Comments
 (0)