added DotProdPatch AVX2

cabirdme · xiaoxial · commit fa902ae6abff · 2022-12-23T10:27:00.000+08:00
diff --git a/Library/Raisr.cpp b/Library/Raisr.cpp
@@ -1165,7 +1165,16 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
                 {
                     if (likely(c + pix < cols - gLoopMargin))
                     {
-                        float curPix = DotProdPatch_AVX512_32f(pixbuf[pix], fbase[pix]);
+                        float curPix;
+                        if (gAsmType == AVX2)
+                            curPix  = DotProdPatch_AVX256_32f(pixbuf[pix], fbase[pix]);
+                        else if (gAsmType == AVX512)
+                            curPix  = DotProdPatch_AVX512_32f(pixbuf[pix], fbase[pix]);
+                        else 
+                        {
+                            std::cout << "expected avx512 or avx2, but got " << gAsmType << std::endl;
+                            return RNLErrorBadParameter;
+                        }
                         if ((gBitDepth == 8 && curPix > gMin8bit && curPix < gMax8bit) ||
                             (gBitDepth != 8 && curPix > gMin16bit && curPix < gMax16bit))
                             pRaisr32f[rOffset * cols + c + pix] = curPix;
diff --git a/Library/Raisr_AVX256.cpp b/Library/Raisr_AVX256.cpp
@@ -57,3 +57,32 @@ inline float sumitup_ps_256(__m256 acc)
     const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
     return _mm_cvtss_f32(r1);
 }
+
+// AVX2 version: dot product of buf and filter over 128 float32s, done as 8 steps of 16 floats with each step split across two 256-bit (8-float) registers. For now, gPatchSize must be <= 16 because we can work with up to 16 float32s in two AVX256 registers.
+float inline DotProdPatch_AVX256_32f(const float *buf, const float *filter)
+{
+    __m256 a1_ps = _mm256_load_ps(buf);        // aligned load: buf must be 32-byte aligned
+    __m256 b1_ps = _mm256_load_ps(filter);     // aligned load: filter must be 32-byte aligned
+    __m256 a2_ps = _mm256_load_ps(buf+8);      // floats 8..15 of the first 16-float step
+    __m256 b2_ps = _mm256_load_ps(filter+8);
+
+    __m256 sum1 = _mm256_mul_ps(a1_ps, b1_ps); // step 0 seeds both accumulators, so the loop below starts at i = 1
+    __m256 sum2 = _mm256_mul_ps(a2_ps, b2_ps);
+
+#pragma unroll
+    for (int i = 1; i < 8; i++) // remaining 7 steps; step i covers floats [16*i, 16*i + 16)
+    {
+        a1_ps = _mm256_load_ps(buf + i * 16);
+        a2_ps = _mm256_load_ps(buf + i * 16 + 8);
+        b1_ps = _mm256_load_ps(filter + i * 16);
+        b2_ps = _mm256_load_ps(filter + i * 16 + 8);
+
+        // compute dot prod using fmadd: sumN = aN * bN + sumN, lane by lane
+        sum1 = _mm256_fmadd_ps(a1_ps, b1_ps, sum1);
+        sum2 = _mm256_fmadd_ps(a2_ps, b2_ps, sum2);
+    }
+
+    // add the two accumulators lane-wise; sumitup_ps_256 then adds the 8 float values of that 256-bit (ymm) sum and returns a single float value
+    return  sumitup_ps_256(_mm256_add_ps(sum1, sum2));
+}
+
diff --git a/Library/Raisr_AVX256.h b/Library/Raisr_AVX256.h
@@ -25,8 +25,9 @@ inline void load3x3_ps(float *img, unsigned int width, unsigned int height, unsi
     *out_8neighbors_ps = _mm256_insertf128_ps(_mm256_castps128_ps256(rowlo_f), rowhi_f, 1);
 }
 
-
 inline __m256i compare3x3_ps(__m256 a, __m256 b, __m256i highbit_epi32);
 inline int sumitup_256_epi32(__m256i acc);
-int inline CTRandomness_AVX256_32f(float *inYUpscaled32f, int cols, int r, int c, int pix);
 inline float sumitup_ps_256(__m256 acc);
+
+int inline CTRandomness_AVX256_32f(float *inYUpscaled32f, int cols, int r, int c, int pix);
+float inline DotProdPatch_AVX256_32f(const float *buf, const float *filter);