Skip to content

Commit 0bc8f0a

Browse files
cabirdmexiaoxial
authored and committed
bug fix in floor function
1 parent 2488483 commit 0bc8f0a

1 file changed

Lines changed: 9 additions & 13 deletions

File tree

Library/Raisr_AVX512FP16.cpp

Lines changed: 9 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -11,22 +11,18 @@
1111
#include <cmath>
1212
#include<string.h>
1313

14-
inline __m512h floor_ph_512(__m512h val_ph)
14+
inline __m512i floor_ph_512(__m512h val_ph)
1515
{
16-
__m512h ret_ph;
17-
#ifndef USE_ATAN2_APPROX
18-
ret_ph = _mm512_floor_ph(val_ph); // svml instruction.
19-
#else
20-
ret_ph = _mm512_cvtepi16_ph(_mm512_cvt_roundph_epi16(val_ph, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
21-
#endif
22-
return ret_ph;
16+
__m512i ret_epi16;
17+
ret_epi16 = _mm512_cvt_roundph_epi16(val_ph, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
18+
return ret_epi16;
2319
}
2420

25-
inline __m128h floor_ph_128(__m128h val_ph)
21+
inline __m128i floor_ph_128(__m128h val_ph)
2622
{
27-
__m128h ret_ph;
28-
ret_ph = _mm_cvtepi16_ph(_mm512_castph512_ph128(_mm512_cvt_roundph_epi16(_mm512_castph128_ph512(val_ph), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)));
29-
return ret_ph;
23+
__m128i ret_epi16;
24+
ret_epi16 = _mm512_castph512_ph128(_mm512_cvt_roundph_epi16(_mm512_castph128_ph512(val_ph), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
25+
return ret_epi16;
3026
}
3127

3228
inline void load3x3_ph(_Float16 *img, unsigned int width, unsigned int height, unsigned int stride, __m128h *out_8neighbors_ph, __m128h *out_center_ph)
@@ -310,7 +306,7 @@ void CTCountOfBitsChangedSegment_AVX512FP16_16f(_Float16 *LRImage, _Float16 *HRI
310306
__m512h val_ph = _mm512_add_ph( _mm512_mul_ph( weight_ph, center_LR_ph),
311307
_mm512_mul_ph(weight2_ph, center_HR_ph));
312308
val_ph = _mm512_add_ph( val_ph, _mm512_set1_ph(0.5));
313-
val_ph = floor_ph_512(val_ph);
309+
val_ph = _mm512_cvtepi16_ph(floor_ph_512(val_ph));
314310

315311
// convert (float)val to (epu8/16)val
316312
__m512i val_epu16 = _mm512_cvtph_epu16(val_ph), val_epu8, perm_epu;

0 commit comments

Comments
 (0)