@@ -58,6 +58,171 @@ inline float sumitup_ps_256(__m256 acc)
     return _mm_cvtss_f32(r1);
 }
 
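+// In-register rotate helpers: _mm256_permutevar8x32_ps shuffles across both
+// 128-bit lanes, so shiftL_AVX256 rotates the 8 floats left by one position
+// (lane i takes lane i+1, lane 7 takes lane 0) and shiftR_AVX256 rotates right.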
+inline __m256 shiftL_AVX256(__m256 r)
+{
+    return _mm256_permutevar8x32_ps(r, _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 1));
+}
+
+inline __m256 shiftR_AVX256(__m256 r)
+{
+    return _mm256_permutevar8x32_ps(r, _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7));
+}
+
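+// Vertical gradient: central difference between the row below (r3) and the
+// row above (r1) the current row.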
+inline __m256 GetGx_AVX256(__m256 r1, __m256 r3)
+{
+    return _mm256_sub_ps(r3, r1);
+}
+
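+// Horizontal gradient within one register: x[i+1] - x[i-1] via the two rotates.
+// The two boundary lanes wrap around; the Hi/Lo variants further below correct
+// this by pulling the true neighbors from the adjacent register.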
+inline __m256 GetGy_AVX256(__m256 r2)
+{
+    return _mm256_sub_ps(shiftL_AVX256(r2), shiftR_AVX256(r2));
+}
+
+inline __m128 GetFirstHalf(__m256 n)
+{
+    return _mm256_extractf128_ps(n, 0);
+}
+
+inline __m128 GetLastHalf(__m256 n)
+{
+    return _mm256_extractf128_ps(n, 1);
+}
+
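+// SetFirstVal/SetLastVal patch one float of n's low/high 128-bit half.
+// halfIndex is the _mm_insert_ps immediate: bits 7:6 select the source
+// element of halfWithValue and bits 5:4 the destination element of the half.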
+template <int halfIndex>
+inline __m256 SetFirstVal(__m256 n, __m128 halfWithValue)
+{
+    __m128 newHalf = _mm_insert_ps(_mm256_extractf128_ps(n, 0), halfWithValue, halfIndex);
+    return _mm256_insertf128_ps(n, newHalf, 0);
+}
+
+template <int halfIndex>
+inline __m256 SetLastVal(__m256 n, __m128 halfWithValue)
+{
+    __m128 newHalf = _mm_insert_ps(_mm256_extractf128_ps(n, 1), halfWithValue, halfIndex);
+    return _mm256_insertf128_ps(n, newHalf, 1);
+}
+
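+// Worked example for the two functions below, assuming a 16-float row
+// x0..x15 split as xlo = [x0..x7] and xhi = [x8..x15]: the Lo variant
+// returns lane i = x[i+1] - x[i-1] and the Hi variant returns
+// lane i = x[i+9] - x[i+7], indices taken mod 16, so the two outermost
+// lanes wrap around and do not hold true gradients.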
+inline __m256 GetGy_AVX256Hi(__m256 xlo, __m256 xhi)
+{
+    // Ideally we would use a single cross-lane permute over both registers, but
+    // no such intrinsic seems to exist. Instead we save the original boundary
+    // values, do the in-register rotates, then insert the saved values at the
+    // ends to get the correct neighbors.
+    __m128 xlohi = GetLastHalf(xlo);
+    __m128 xlolo = GetFirstHalf(xlo);
+
+    __m256 newloLeft = SetLastVal<0x30>(shiftL_AVX256(xhi), xlolo);
+    __m256 newloRight = SetFirstVal<0xC0>(shiftR_AVX256(xhi), xlohi);
+    return _mm256_sub_ps(newloLeft, newloRight);
+}
+
+inline __m256 GetGy_AVX256Lo(__m256 xlo, __m256 xhi)
+{
+    // Same trick as GetGy_AVX256Hi: save the boundary values, rotate
+    // in-register, then insert the saved values at the ends.
+    __m128 xhilo = GetFirstHalf(xhi);
+    __m128 xhihi = GetLastHalf(xhi);
+
+    __m256 newloLeft = SetLastVal<0x30>(shiftL_AVX256(xlo), xhilo);
+    __m256 newloRight = SetFirstVal<0xC0>(shiftR_AVX256(xlo), xhihi);
+    return _mm256_sub_ps(newloLeft, newloRight);
+}
+
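+// One fused accumulation step of the weighted Gram matrix: acc += (a * w) * b.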
+inline __m256 GetGTWG_AVX256(__m256 acc, __m256 a, __m256 w, __m256 b)
+{
+    return _mm256_fmadd_ps(_mm256_mul_ps(a, w), b, acc);
+}
+
+inline void computeGTWG_Segment_AVX256_32f(const float *img, const int nrows, const int ncols, const int r, const int col, float GTWG[][4], float *buf1, float *buf2)
+{
+    // offset is the starting position (top left) of the patch centered at (r, col)
+    int offset = (r - gLoopMargin) * ncols + col - gLoopMargin;
+    const float *p1 = img + offset;
+
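+    // The A accumulators build GTWG for the pixel at `col`; the B accumulators
+    // reuse the same gradients with the weights rotated one lane (see below),
+    // which amounts to the Gaussian window for the adjacent pixel.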
+    __m256 gtwg0A1 = _mm256_setzero_ps(), gtwg0A2 = _mm256_setzero_ps();
+    __m256 gtwg0B1 = _mm256_setzero_ps(), gtwg0B2 = _mm256_setzero_ps();
+    __m256 gtwg1A1 = _mm256_setzero_ps(), gtwg1A2 = _mm256_setzero_ps();
+    __m256 gtwg1B1 = _mm256_setzero_ps(), gtwg1B2 = _mm256_setzero_ps();
+    __m256 gtwg3A1 = _mm256_setzero_ps(), gtwg3A2 = _mm256_setzero_ps();
+    __m256 gtwg3B1 = _mm256_setzero_ps(), gtwg3B2 = _mm256_setzero_ps();
+
+    // prime the 3-row sliding window with the first two rows of the patch
+    __m256 a1 = _mm256_loadu_ps(p1);
+    __m256 a2 = _mm256_loadu_ps(p1 + 8);
+    p1 += ncols;
+    __m256 b1 = _mm256_loadu_ps(p1);
+    __m256 b2 = _mm256_loadu_ps(p1 + 8);
+#pragma unroll
+    for (int i = 0; i < gPatchSize; i++)
+    {
+        // process gPatchSize rows: load the next row of the window
+        p1 += ncols;
+        __m256 c1 = _mm256_loadu_ps(p1);
+        __m256 c2 = _mm256_loadu_ps(p1 + 8);
+        __m256 w1, w2;
+        if (gBitDepth == 8) {
+            w1 = _mm256_loadu_ps(gGaussian2D8bit[i]);
+            w2 = _mm256_loadu_ps(gGaussian2D8bit[i] + 8);
+        } else if (gBitDepth == 10) {
+            w1 = _mm256_loadu_ps(gGaussian2D10bit[i]);
+            w2 = _mm256_loadu_ps(gGaussian2D10bit[i] + 8);
+        } else {
+            w1 = _mm256_loadu_ps(gGaussian2D16bit[i]);
+            w2 = _mm256_loadu_ps(gGaussian2D16bit[i] + 8);
+        }
+
+        const __m256 gxi1 = GetGx_AVX256(a1, c1);
+        const __m256 gxi2 = GetGx_AVX256(a2, c2);
+
+        const __m256 gyi1 = GetGy_AVX256Lo(b1, b2);
+        const __m256 gyi2 = GetGy_AVX256Hi(b1, b2);
+
+        gtwg0A1 = GetGTWG_AVX256(gtwg0A1, gxi1, w1, gxi1);
+        gtwg0A2 = GetGTWG_AVX256(gtwg0A2, gxi2, w2, gxi2);
+        gtwg1A1 = GetGTWG_AVX256(gtwg1A1, gxi1, w1, gyi1);
+        gtwg1A2 = GetGTWG_AVX256(gtwg1A2, gxi2, w2, gyi2);
+        gtwg3A1 = GetGTWG_AVX256(gtwg3A1, gyi1, w1, gyi1);
+        gtwg3A2 = GetGTWG_AVX256(gtwg3A2, gyi2, w2, gyi2);
+
+        // Rotate the 16 weights right by one lane so the Gaussian window lines
+        // up with the neighboring output pixel: save the last element of each
+        // register, rotate in-register, then insert the saved element from the
+        // other register.
+        __m128 xlohi = GetLastHalf(w1);
+        __m128 xhihi = GetLastHalf(w2);
+        w1 = SetFirstVal<0xC0>(shiftR_AVX256(w1), xhihi);
+        w2 = SetFirstVal<0xC0>(shiftR_AVX256(w2), xlohi);
+
+        gtwg0B1 = GetGTWG_AVX256(gtwg0B1, gxi1, w1, gxi1);
+        gtwg0B2 = GetGTWG_AVX256(gtwg0B2, gxi2, w2, gxi2);
+        gtwg1B1 = GetGTWG_AVX256(gtwg1B1, gxi1, w1, gyi1);
+        gtwg1B2 = GetGTWG_AVX256(gtwg1B2, gxi2, w2, gyi2);
+        gtwg3B1 = GetGTWG_AVX256(gtwg3B1, gyi1, w1, gyi1);
+        gtwg3B2 = GetGTWG_AVX256(gtwg3B2, gyi2, w2, gyi2);
+
+        // maskstore writes a lane only when the sign bit of its mask element is set
+        const int lastbit = (int)0x80000000;
+        // skip one lane, store the next 11 values; the two masks are 0xFE and 0x0F
+        _mm256_maskstore_ps(buf1 + gPatchSize * i - 1, _mm256_setr_epi32(0, lastbit, lastbit, lastbit, lastbit, lastbit, lastbit, lastbit), b1);
+        _mm256_maskstore_ps(buf1 + gPatchSize * i - 1 + 8, _mm256_setr_epi32(lastbit, lastbit, lastbit, lastbit, 0, 0, 0, 0), b2);
+        // skip two lanes, store the next 11 values; the two masks are 0xFC and 0x1F
+        _mm256_maskstore_ps(buf2 + gPatchSize * i - 2, _mm256_setr_epi32(0, 0, lastbit, lastbit, lastbit, lastbit, lastbit, lastbit), b1);
+        _mm256_maskstore_ps(buf2 + gPatchSize * i - 2 + 8, _mm256_setr_epi32(lastbit, lastbit, lastbit, lastbit, lastbit, 0, 0, 0), b2);
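+        // slide the three-row window down: the center row becomes the top row
+        // and the newly loaded row becomes the center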
+        a1 = b1;
+        a2 = b2;
+        b1 = c1;
+        b2 = c2;
+    }
+
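+    // each GTWG[k] holds a symmetric 2x2 matrix as 4 floats, so entry [2] duplicates [1]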
+    GTWG[0][0] = sumitup_ps_256(_mm256_add_ps(gtwg0A1, gtwg0A2));
+    GTWG[0][1] = sumitup_ps_256(_mm256_add_ps(gtwg1A1, gtwg1A2));
+    GTWG[0][3] = sumitup_ps_256(_mm256_add_ps(gtwg3A1, gtwg3A2));
+    GTWG[0][2] = GTWG[0][1];
+
+    GTWG[1][0] = sumitup_ps_256(_mm256_add_ps(gtwg0B1, gtwg0B2));
+    GTWG[1][1] = sumitup_ps_256(_mm256_add_ps(gtwg1B1, gtwg1B2));
+    GTWG[1][3] = sumitup_ps_256(_mm256_add_ps(gtwg3B1, gtwg3B2));
+    GTWG[1][2] = GTWG[1][1];
+
+}
+
 // AVX2 version: for now, gPatchSize must be <= 16 because we can work with up to 16 float32s in two AVX256 registers.
 float inline DotProdPatch_AVX256_32f(const float *buf, const float *filter)
 {