|
11 | 11 | #include <cmath> |
12 | 12 | #include<string.h> |
13 | 13 |
|
14 | | -inline __m512h floor_ph_512(__m512h val_ph) |
| 14 | +inline __m512i floor_ph_512(__m512h val_ph) |
15 | 15 | { |
16 | | - __m512h ret_ph; |
17 | | -#ifndef USE_ATAN2_APPROX |
18 | | - ret_ph = _mm512_floor_ph(val_ph); // svml instruction. |
19 | | -#else |
20 | | - ret_ph = _mm512_cvtepi16_ph(_mm512_cvt_roundph_epi16(val_ph, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); |
21 | | -#endif |
22 | | - return ret_ph; |
| 16 | + __m512i ret_epi16; |
| 17 | + ret_epi16 = _mm512_cvt_roundph_epi16(val_ph, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); |
| 18 | + return ret_epi16; |
23 | 19 | } |
24 | 20 |
|
25 | | -inline __m128h floor_ph_128(__m128h val_ph) |
| 21 | +inline __m128i floor_ph_128(__m128h val_ph) |
26 | 22 | { |
27 | | - __m128h ret_ph; |
28 | | - ret_ph = _mm_cvtepi16_ph(_mm512_castph512_ph128(_mm512_cvt_roundph_epi16(_mm512_castph128_ph512(val_ph), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC))); |
29 | | - return ret_ph; |
| 23 | + __m128i ret_epi16; |
| 24 | + ret_epi16 = _mm512_castph512_ph128(_mm512_cvt_roundph_epi16(_mm512_castph128_ph512(val_ph), _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); |
| 25 | + return ret_epi16; |
30 | 26 | } |
31 | 27 |
|
32 | 28 | inline void load3x3_ph(_Float16 *img, unsigned int width, unsigned int height, unsigned int stride, __m128h *out_8neighbors_ph, __m128h *out_center_ph) |
@@ -310,7 +306,7 @@ void CTCountOfBitsChangedSegment_AVX512FP16_16f(_Float16 *LRImage, _Float16 *HRI |
310 | 306 | __m512h val_ph = _mm512_add_ph( _mm512_mul_ph( weight_ph, center_LR_ph), |
311 | 307 | _mm512_mul_ph(weight2_ph, center_HR_ph)); |
312 | 308 | val_ph = _mm512_add_ph( val_ph, _mm512_set1_ph(0.5)); |
313 | | - val_ph = floor_ph_512(val_ph); |
| 309 | + val_ph = _mm512_cvtepi16_ph(floor_ph_512(val_ph)); |
314 | 310 |
|
315 | 311 | // convert (float)val to (epu8/16)val |
316 | 312 | __m512i val_epu16 = _mm512_cvtph_epu16(val_ph), val_epu8, perm_epu; |
|
0 commit comments