Added AVX2 implementations of CTCountOfBitsChangedSegment, GetHashValue, and atan2Approximation. When building with the Intel compiler, atan2 comes from SVML; with any other compiler, the atan2 approximation is used.

cabirdme · xiaoxial · commit 252d33d4528d · 2022-12-23T10:26:26.000+08:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -30,6 +30,16 @@ set(flags_to_test
     -fPIC
 )
 
+# RAISR takes atan2 from SVML when the C++ compiler's file name contains "icp" (Intel icpc/icpx);
+# for every other compiler -DUSE_ATAN2_APPROX is appended to flags_to_test so the atan2 approximation is used.
+get_filename_component(CC_FILENAME ${CMAKE_CXX_COMPILER} NAME)
+string(FIND "${CC_FILENAME}" "icp" CC_SUBSTR_INTEL)
+if (NOT CC_SUBSTR_INTEL LESS 0)
+    message("Building with Intel Compiler, using SVML")
+else()
+    message("Building with Atan Approximation")
+    list(APPEND flags_to_test "-DUSE_ATAN2_APPROX")
+endif()
+
 foreach(cflag ${flags_to_test})
     string(REGEX REPLACE "[^A-Za-z0-9]" "_" cflag_var "${cflag}")
     set(test_cxx_flag "CXX_FLAG${cflag_var}")
diff --git a/Library/Raisr.cpp b/Library/Raisr.cpp
@@ -896,6 +896,134 @@ static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows
     }
 }
 
+// Census-transform (count-of-bits-changed) blending of one segment, AVX2 path for float input.
+// For every blended pixel each surrounding pixel contributes one bit ("neighbour < centre");
+// the number of bits that differ between LRImage and HRImage, divided by CTnumberofPixel, is the
+// weight given to LRImage and (1 - weight) goes to HRImage. The blend is rounded by adding 0.5,
+// clamped, and stored as 8 bit when gBitDepth == 8, otherwise as 16 bit.
+// LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u
+// (written through an unsigned short pointer when gBitDepth != 8).
+// rows: rows of LRImage/HRImage (not read by this function). startRow: seg start row.
+// blendingZone: zone to run blending, rows [first, second) in the same coordinates as startRow.
+// cols: stride for LRImage/HRImage in floats.
+// outImageCols: stride for outImage buffer, in output elements.
+// NOTE(review): the vector path hard-codes the 3x3 neighbourhood, i.e. it assumes CTmargin == 1
+// and 8 neighbours per pixel - confirm against the definitions of CTmargin/CTnumberofPixel.
+static void CTCountOfBitsChangedSegment_AVX256_32f(float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols, const int outImageCols)
+{
+    (void)rows; // unused; the parameter list mirrors the CTCountOfBitsChangedSegment<float> call it replaces
+
+    const int rowStartOffset = blendingZone.first - startRow;
+    const int rowEndOffset = blendingZone.second - startRow;
+
+    // (row, column) offsets of the 8 neighbours, in the order the scalar census loop visits them.
+    static const int neighborOffsets[8][2] = {
+        {-1, -1}, {-1, 0}, {-1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1}};
+
+    const __m256 one_ps = _mm256_set1_ps(1.0f);
+    const __m256 half_ps = _mm256_set1_ps(0.5f);
+    const __m256 numPixels_ps = _mm256_set1_ps((float) CTnumberofPixel);
+    const __m256i one_epi32 = _mm256_set1_epi32(1);
+
+    // Columns [CTmargin, c_limit) are blended. The vector loop may only start a group of 8 when
+    // all 8 columns lie inside that range; otherwise it would blend the right margin column and
+    // its (c+1) loads would run one float past the end of the row. The previous bound,
+    // c_limit - (c_limit % 8) + 1, overshot by one group whenever c_limit was a multiple of 8.
+    const int c_limit = (cols - CTmargin);
+    const int c_span = (c_limit > CTmargin) ? (c_limit - CTmargin) : 0;
+    const int c_limit_avx = CTmargin + (c_span / 8) * 8;
+
+    for (auto r = rowStartOffset; r < rowEndOffset; r++)
+    {
+        for (auto c = CTmargin; c < c_limit_avx; c += 8)
+        {
+            const __m256 center_LR_ps = _mm256_loadu_ps(&LRImage[r * cols + c]);
+            const __m256 center_HR_ps = _mm256_loadu_ps(&HRImage[r * cols + c]);
+
+            // Census transform: count, per lane, the neighbours whose "less than centre" bit
+            // differs between the LR and HR images.
+            __m256i hammingDistance_epi32 = _mm256_setzero_si256();
+            for (const auto &offset : neighborOffsets)
+            {
+                const int n = (r + offset[0]) * cols + (c + offset[1]);
+                const __m256 lrLess_ps = _mm256_cmp_ps(_mm256_loadu_ps(&LRImage[n]), center_LR_ps, _CMP_LT_OQ);
+                const __m256 hrLess_ps = _mm256_cmp_ps(_mm256_loadu_ps(&HRImage[n]), center_HR_ps, _CMP_LT_OQ);
+                // Compare results are all-ones or all-zeros per lane, so XOR marks the changed bits.
+                const __m256i changed_epi32 = _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_xor_ps(lrLess_ps, hrLess_ps)));
+                hammingDistance_epi32 = _mm256_add_epi32(hammingDistance_epi32, changed_epi32);
+            }
+
+            const __m256 weight_ps = _mm256_div_ps(_mm256_cvtepi32_ps(hammingDistance_epi32), numPixels_ps);
+            const __m256 weight2_ps = _mm256_sub_ps(one_ps, weight_ps);
+            __m256 val_ps = _mm256_add_ps(_mm256_mul_ps(weight_ps, center_LR_ps),
+                                          _mm256_mul_ps(weight2_ps, center_HR_ps));
+            val_ps = _mm256_add_ps(val_ps, half_ps); // to round the value
+            __m256i val_epi32 = _mm256_cvtps_epi32(_mm256_floor_ps(val_ps));
+
+            //convert 32f to 8bit/10bit
+            if (gBitDepth == 8) {
+                val_epi32 = _mm256_max_epi32(_mm256_min_epi32(val_epi32, _mm256_set1_epi32(gMax8bit)), _mm256_set1_epi32(gMin8bit));
+                // The pack instructions work per 128-bit lane: afterwards dword 0 holds pixels 0-3
+                // and dword 4 holds pixels 4-7, so the permute gathers all 8 bytes into the low 64 bits.
+                const __m256i val_epi16 = _mm256_packs_epi32(val_epi32, val_epi32);
+                const __m256i val_epu8 = _mm256_packus_epi16(val_epi16, val_epi16);
+                const __m256i perm_epu = _mm256_permutevar8x32_epi32(val_epu8, _mm256_setr_epi32(0,4,0,4,0,4,0,4));
+                _mm_storel_epi64((__m128i *) &outImage[(startRow + r) * outImageCols + c], _mm256_castsi256_si128(perm_epu));
+            }
+            else {
+                val_epi32 = _mm256_max_epi32(_mm256_min_epi32(val_epi32, _mm256_set1_epi32(gMax16bit)), _mm256_set1_epi32(gMin16bit));
+                // Per-lane pack again: qword 0 holds pixels 0-3 and qword 2 holds pixels 4-7.
+                const __m256i val_epu16 = _mm256_packus_epi32(val_epi32, val_epi32);
+                const __m256i perm_epu = _mm256_permute4x64_epi64(val_epu16, 0x88);
+                unsigned short *out = (unsigned short *)outImage;
+                _mm_storeu_si128((__m128i *) &out[(startRow + r) * outImageCols + c], _mm256_castsi256_si128(perm_epu));
+            }
+        }
+
+        for (auto c = c_limit_avx; c < c_limit; c++) // handle edge, too small for SIMD
+        {
+            int hammingDistance = 0;
+
+            // Census transform
+            for (int i = -CTmargin; i <= CTmargin; i++)
+            {
+                for (int j = -CTmargin; j <= CTmargin; j++)
+                {
+                    if (unlikely(i == 0 && j == 0))
+                        continue;
+                    hammingDistance += std::abs((LRImage[(r + i) * cols + (c + j)] < LRImage[r * cols + c] ? 1 : 0) - (HRImage[(r + i) * cols + (c + j)] < HRImage[r * cols + c] ? 1 : 0));
+                }
+            }
+            float weight = (float)hammingDistance / (float)CTnumberofPixel;
+            float val = weight * LRImage[r * cols + c] + (1 - weight) * HRImage[r * cols + c];
+            val += 0.5; // to round the value
+            //convert 32f to 8bit/10bit
+            if (gBitDepth == 8) {
+                outImage[(startRow + r) * outImageCols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
+            }
+            else {
+                unsigned short *out = (unsigned short *)outImage;
+                out[(startRow + r) * outImageCols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
+            }
+        }
+    }
+}
+
 int inline CTRandomness_C(float *inYUpscaled32f, int cols, int r, int c, int pix)
 {
     // Census transform
@@ -980,6 +1108,113 @@ inline float atan2Approximation(float y, float x)
 }
 #endif
 
+// Eight-lane AVX2/FMA polynomial approximation of atan2(y, x).
+// y_ps, x_ps: the y and x operands, one pair per lane.
+// Returns the approximate angle per lane; the value is negated in lanes where y < 0.
+inline __m256 atan2Approximation_AVX256_32f(__m256 y_ps, __m256 x_ps)
+{
+    const float ONEQTR_PI = M_PI / 4.0;
+    const float THRQTR_PI = 3.0 * M_PI / 4.0;
+    const __m256 zeros = _mm256_setzero_ps();
+    const __m256 signBit = _mm256_set1_ps(-0.0f);
+
+    // |y| plus a tiny bias, which keeps both denominators below non-zero when x and y are 0.
+    const __m256 yMag = _mm256_add_ps(_mm256_andnot_ps(signBit, y_ps), _mm256_set1_ps(1e-10f));
+    const __m256 xPlusY = _mm256_add_ps(x_ps, yMag);
+
+    // x <  0: r = (x + |y|) / (|y| - x), base angle 3*pi/4
+    // x >= 0: r = (x - |y|) / (x + |y|), base angle   pi/4
+    const __m256 xNegative = _mm256_cmp_ps(x_ps, zeros, _CMP_LT_OQ);
+    const __m256 ratioNegX = _mm256_div_ps(xPlusY, _mm256_sub_ps(yMag, x_ps));
+    const __m256 ratioPosX = _mm256_div_ps(_mm256_sub_ps(x_ps, yMag), xPlusY);
+    const __m256 ratio = _mm256_blendv_ps(ratioPosX, ratioNegX, xNegative);
+    const __m256 baseAngle = _mm256_blendv_ps(_mm256_set1_ps(ONEQTR_PI), _mm256_set1_ps(THRQTR_PI), xNegative);
+
+    // angle = base + (0.1963 * r * r - 0.9817) * r
+    const __m256 poly = _mm256_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(0.1963f), ratio),
+                                        ratio, _mm256_set1_ps(-0.9817f));
+    const __m256 angle = _mm256_fmadd_ps(poly, ratio, baseAngle);
+
+    const __m256 mirrored = _mm256_mul_ps(_mm256_set1_ps(-1), angle);
+    const __m256 yNegative = _mm256_cmp_ps(y_ps, zeros, _CMP_LT_OQ);
+    return _mm256_blendv_ps(angle, mirrored, yNegative);
+}
+
+// Computes the filter-bucket hash for 8 pixels at once (AVX2).
+// GTWG: one row per pixel; entries [0], [1] and [3] are used as a, b and d of the symmetric
+//       2x2 matrix [a b; b d] (entry [2] is not read).
+// passIdx: 0 selects the gQStr/gQCoh thresholds, any other value selects gQStr2/gQCoh2.
+// idx: receives 8 hash values through an unaligned 256-bit store, each equal to
+//      angleIdx * gQuantizationStrength * gQuantizationCoherence
+//      + strengthIdx * gQuantizationCoherence + coherenceIdx.
+void inline GetHashValue_AVX256_32f(float GTWG[8][4], int passIdx, int32_t *idx) {
+    const float pi = PI;
+    const float near_zero = 0.00000000000000001;
+    const __m256 zero_ps = _mm256_setzero_ps();
+    const __m256 one_ps = _mm256_set1_ps(1.0f);
+    const __m256i zero_epi32 = _mm256_setzero_si256();
+    const __m256i one_epi32 = _mm256_set1_epi32(1);
+    const __m256i two_epi32 = _mm256_set1_epi32(2);
+
+    // Gather a, b and d of the 8 matrices into one register each.
+    const __m256 m_a_ps = _mm256_setr_ps (GTWG[0][0], GTWG[1][0], GTWG[2][0], GTWG[3][0],
+                                         GTWG[4][0], GTWG[5][0], GTWG[6][0], GTWG[7][0]);
+    const __m256 m_b_ps = _mm256_setr_ps (GTWG[0][1], GTWG[1][1], GTWG[2][1], GTWG[3][1],
+                                         GTWG[4][1], GTWG[5][1], GTWG[6][1], GTWG[7][1]);
+    const __m256 m_d_ps = _mm256_setr_ps (GTWG[0][3], GTWG[1][3], GTWG[2][3], GTWG[3][3],
+                                         GTWG[4][3], GTWG[5][3], GTWG[6][3], GTWG[7][3]);
+
+    // Eigenvalues of [a b; b d]: L1,L2 = T/2 +- sqrt(T^2/4 - D), with T the trace and D the determinant.
+    const __m256 T_ps = _mm256_add_ps(m_a_ps, m_d_ps);
+    const __m256 D_ps = _mm256_sub_ps( _mm256_mul_ps( m_a_ps, m_d_ps),
+                                      _mm256_mul_ps( m_b_ps, m_b_ps));
+    const __m256 sqr_ps = _mm256_sqrt_ps( _mm256_sub_ps( _mm256_div_ps ( _mm256_mul_ps(T_ps, T_ps),
+                                                                         _mm256_set1_ps(4.0f)), D_ps));
+    const __m256 half_T_ps = _mm256_div_ps ( T_ps, _mm256_set1_ps(2.0f) );
+    const __m256 L1_ps = _mm256_add_ps( half_T_ps, sqr_ps);
+    const __m256 L2_ps = _mm256_sub_ps( half_T_ps, sqr_ps);
+
+    // x operand of atan2: L1 - d where b != 0, and 1 where b == 0 (those lanes evaluate atan2(0, 1)).
+    const __m256 b_nonzero_ps = _mm256_cmp_ps(m_b_ps, zero_ps, _CMP_NEQ_OQ);
+    const __m256 blend_ps = _mm256_blendv_ps( one_ps, _mm256_sub_ps(L1_ps, m_d_ps), b_nonzero_ps );
+
+    __m256 angle_ps;
+#ifdef USE_ATAN2_APPROX
+    angle_ps = atan2Approximation_AVX256_32f( m_b_ps, blend_ps);
+#else
+    angle_ps = _mm256_atan2_ps( m_b_ps, blend_ps);
+#endif
+
+    // Force the angle to exactly 0 in the b == 0 lanes. The exact atan2(0, 1) is 0, but the
+    // polynomial approximation returns a tiny negative value there, which the wrap below would
+    // turn into roughly pi, i.e. the last angle bucket instead of bucket 0.
+    angle_ps = _mm256_and_ps(angle_ps, b_nonzero_ps);
+
+    // Wrap negative angles by adding pi.
+    angle_ps = _mm256_add_ps ( angle_ps, _mm256_blendv_ps( zero_ps, _mm256_set1_ps(pi),
+                                    _mm256_cmp_ps(angle_ps, zero_ps, _CMP_LT_OQ) ) );
+
+    // coherence = (sqrt(L1) - sqrt(L2)) / (sqrt(L1) + sqrt(L2) + near_zero); strength = L1.
+    const __m256 sqrtL1_ps = _mm256_sqrt_ps( L1_ps );
+    const __m256 sqrtL2_ps = _mm256_sqrt_ps( L2_ps );
+    const __m256 coherence_ps = _mm256_div_ps( _mm256_sub_ps( sqrtL1_ps, sqrtL2_ps ),
+                                              _mm256_add_ps( _mm256_add_ps(sqrtL1_ps, sqrtL2_ps), _mm256_set1_ps(near_zero) ) );
+    const __m256 strength_ps = L1_ps;
+
+    // angleIdx = floor(angle * gQAngle), clamped to [0, gQuantizationAngle - 1].
+    __m256i angleIdx_epi32 = _mm256_cvtps_epi32( _mm256_floor_ps(_mm256_mul_ps (angle_ps, _mm256_set1_ps(gQAngle))));
+    angleIdx_epi32 = _mm256_min_epi32( _mm256_sub_epi32( _mm256_set1_epi32(gQuantizationAngle), one_epi32),
+                                       _mm256_max_epi32(angleIdx_epi32, zero_epi32 ) );
+
+    // Exactly two thresholds are read from each table, so each index is 0, 1 or 2.
+    // NOTE(review): this relies on QStr/QCoh holding (at least) 2 entries - confirm if the tables can grow.
+    const float *gQStr_data = (passIdx == 0) ? gQStr.data() : gQStr2.data();
+    const float *gQCoh_data = (passIdx == 0) ? gQCoh.data() : gQCoh2.data();
+
+    // index = 2 - (number of the two thresholds that are >= value)
+    const auto bucketOf = [&](const __m256 value_ps, const float *thresholds) {
+        const __m256 first_ps = _mm256_broadcast_ss(thresholds);
+        const __m256 second_ps = _mm256_broadcast_ss(thresholds + 1);
+        return _mm256_sub_epi32(two_epi32,
+                    _mm256_add_epi32(
+                        _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(value_ps, first_ps, _CMP_LE_OQ))),
+                        _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(value_ps, second_ps, _CMP_LE_OQ)))));
+    };
+    const __m256i strengthIdx_epi32 = bucketOf(strength_ps, gQStr_data);
+    const __m256i coherenceIdx_epi32 = bucketOf(coherence_ps, gQCoh_data);
+
+    const __m256i gQuantizationCoherence_epi32 = _mm256_set1_epi32(gQuantizationCoherence);
+    __m256i idx_epi32 = _mm256_mullo_epi32(gQuantizationCoherence_epi32,
+                                            _mm256_mullo_epi32(angleIdx_epi32, _mm256_set1_epi32(gQuantizationStrength)));
+    idx_epi32 = _mm256_add_epi32(coherenceIdx_epi32,
+                                _mm256_add_epi32(idx_epi32, _mm256_mullo_epi32(strengthIdx_epi32, gQuantizationCoherence_epi32)));
+    _mm256_storeu_si256((__m256i *)idx, idx_epi32);
+}
+
 inline int int_floor(float x)
 {
     int i = (int)x;     /* truncate */
@@ -1345,10 +1580,10 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
                     computeGTWG_Segment(pSeg32f, rows, cols, rOffset, c + 2 * pix, &GTWG[2 * pix], &pixbuf[2 * pix][0], &pixbuf[2 * pix + 1][0]);
                 }
 
-#pragma unroll(unrollSizePatchBased)
+                GetHashValue_AVX256_32f(GTWG, passIdx, hashValue);
+
                 for (pix = 0; pix < unrollSizePatchBased; pix++)
                 {
-                    hashValue[pix] = GetHashValue(GTWG[pix], passIdx);
                     if (passIdx == 0)
                         fbase[pix] = gFilterBuckets[hashValue[pix]][pixelType[pix]];
                     else
@@ -1405,9 +1640,7 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
         if (blendingMode == CountOfBitsChanged)
         {
             int segStart = gIppCtx.segZones[passIdx][threadIdx].scaleStartRow;
-            CTCountOfBitsChangedSegment<float>(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[passIdx][threadIdx].blendingStartRow, gIppCtx.segZones[passIdx][threadIdx].blendingEndRow}, outY->pData, cols, outY->step);
-            // No improve with AVX2
-            // CTCountOfBitsChangedSegment_AVX2(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[threadIdx].blendingStartRow, gIppCtx.segZones[threadIdx].blendingEndRow}, outY->pData, cols, outY->step);
+            CTCountOfBitsChangedSegment_AVX256_32f(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[passIdx][threadIdx].blendingStartRow, gIppCtx.segZones[passIdx][threadIdx].blendingEndRow}, outY->pData, cols, outY->step);
         }
 
         threadStatus[threadIdx] = 1;
diff --git a/build.sh b/build.sh
@@ -27,6 +27,17 @@ build() (
         *) break ;; 
         esac
     done
+    if $docker_flag; then
+    # docker builds don't support intel cc, so reassign compiler
+        if check_executable clang++; then
+            CXX=$(check_executable -p clang++)
+        elif check_executable g++; then
+            CXX=$(check_executable -p g++)
+        else
+            die "No suitable cpp compiler found in path" \
+            "Please either install one or set it via cxx=*"
+        fi
+    fi
     echo "Create folder: build, build type: $build_type"
     mkdir -p build > /dev/null 2>&1
     cd_safe build
@@ -74,11 +85,10 @@ check_executable() (
     done
     return 127
 )
-
-if check_executable clang++; then
+if check_executable icpx; then
+    CXX=$(check_executable -p icpx)
+elif check_executable clang++; then
     CXX=$(check_executable -p clang++)
-elif check_executable icpc "/opt/intel/bin"; then
-    CXX=$(check_executable -p icpc "/opt/intel/bin")
 elif check_executable g++; then
     CXX=$(check_executable -p g++)
 else