Added AVX2 dispatch for CTRandomness; renamed compare3x3_ps to compare3x3_AVX256_32f

cabirdme · xiaoxial · commit ea6e83b4f425 · 2022-12-23T10:27:16.000+08:00
diff --git a/Library/Raisr.cpp b/Library/Raisr.cpp
@@ -527,8 +527,8 @@ static void CTCountOfBitsChanged_AVX2(float *LRImage, float *HRImage, float *out
             int highbit = 0x80000000;
             const __m256i highbit_epi32 = _mm256_setr_epi32(highbit, highbit, highbit, highbit, highbit, highbit, highbit, highbit);
 
-            __m256i cmp_lr_epi32 = compare3x3_ps(row_lr_f, center_lr_f, highbit_epi32);
-            __m256i cmp_hr_epi32 = compare3x3_ps(row_hr_f, center_hr_f, highbit_epi32);
+            __m256i cmp_lr_epi32 = compare3x3_AVX256_32f(row_lr_f, center_lr_f, highbit_epi32);
+            __m256i cmp_hr_epi32 = compare3x3_AVX256_32f(row_hr_f, center_hr_f, highbit_epi32);
 
             // hammingDistance = abs( lr_cmp - hr_cmp )
             __m256i cmp_epi32 = _mm256_abs_epi32(_mm256_sub_epi32(cmp_lr_epi32, cmp_hr_epi32));
@@ -563,8 +563,8 @@ static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, con
             int highbit = 0x80000000;
             const __m256i highbit_epi32 = _mm256_setr_epi32(highbit, highbit, highbit, highbit, highbit, highbit, highbit, highbit);
 
-            __m256i cmp_lr_epi32 = compare3x3_ps(row_lr_f, center_lr_f, highbit_epi32);
-            __m256i cmp_hr_epi32 = compare3x3_ps(row_hr_f, center_hr_f, highbit_epi32);
+            __m256i cmp_lr_epi32 = compare3x3_AVX256_32f(row_lr_f, center_lr_f, highbit_epi32);
+            __m256i cmp_hr_epi32 = compare3x3_AVX256_32f(row_hr_f, center_hr_f, highbit_epi32);
 
             // hammingDistance = abs( lr_cmp - hr_cmp )
             __m256i cmp_epi32 = _mm256_abs_epi32(_mm256_sub_epi32(cmp_lr_epi32, cmp_hr_epi32));
@@ -1184,7 +1184,16 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
                         // CT-Blending, CTRandomness
                         if (blendingMode == Randomness)
                         {
-                            census = CTRandomness_AVX512_32f(pSeg32f, cols, rOffset, c, pix);
+                            if (gAsmType == AVX2)
+                                census = CTRandomness_AVX256_32f(pSeg32f, cols, rOffset, c, pix);
+                            else if (gAsmType == AVX512)
+                                census = CTRandomness_AVX512_32f(pSeg32f, cols, rOffset, c, pix);
+                            else
+                            {
+                                std::cout << "expected avx512 or avx2, but got " << gAsmType << std::endl;
+                                return RNLErrorBadParameter;
+                            }
+
                             float weight = (float)census / (float)CTnumberofPixel;
                             // position in the whole image: r * cols + c + pix
                             float val = weight * curPix + (1 - weight) * pSeg32f[rOffset * cols + c + pix];
diff --git a/Library/Raisr_AVX256.cpp b/Library/Raisr_AVX256.cpp
@@ -10,7 +10,7 @@
 
 
 
-inline __m256i compare3x3_ps(__m256 a, __m256 b, __m256i highbit_epi32)
+inline __m256i compare3x3_AVX256_32f(__m256 a, __m256 b, __m256i highbit_epi32)
 {
     // compare if neighbors < centerpixel, toggle bit in mask if true
     // when cmp_ps is true, it returns 0x7fffff (-nan).  When we convert that to int, it is 0x8000 0000
@@ -42,7 +42,7 @@ int inline CTRandomness_AVX256_32f(float *inYUpscaled32f, int cols, int r, int c
     int highbit = 0x80000000;
     const __m256i highbit_epi32 = _mm256_setr_epi32(highbit, highbit, highbit, highbit, highbit, highbit, highbit, highbit);
 
-    __m256i cmp_epi32 = compare3x3_ps(row_f, center_f, highbit_epi32);
+    __m256i cmp_epi32 = compare3x3_AVX256_32f(row_f, center_f, highbit_epi32);
 
     // count # of bits in mask
     census_count += sumitup_256_epi32(cmp_epi32);
diff --git a/Library/Raisr_AVX256.h b/Library/Raisr_AVX256.h
@@ -25,7 +25,7 @@ inline void load3x3_ps(float *img, unsigned int width, unsigned int height, unsi
     *out_8neighbors_ps = _mm256_insertf128_ps(_mm256_castps128_ps256(rowlo_f), rowhi_f, 1);
 }
 
-inline __m256i compare3x3_ps(__m256 a, __m256 b, __m256i highbit_epi32);
+inline __m256i compare3x3_AVX256_32f(__m256 a, __m256 b, __m256i highbit_epi32);
 inline int sumitup_256_epi32(__m256i acc);
 inline float sumitup_ps_256(__m256 acc);
 

Original file line number	Diff line number	Diff line change
`@@ -25,7 +25,7 @@ inline void load3x3_ps(float *img, unsigned int width, unsigned int height, unsi`
`25`	`25`	`*out_8neighbors_ps = _mm256_insertf128_ps(_mm256_castps128_ps256(rowlo_f), rowhi_f, 1);`
`26`	`26`	`}`
`27`	`27`
`28`		`-inline __m256i compare3x3_ps(__m256 a, __m256 b, __m256i highbit_epi32);`
	`28`	`+inline __m256i compare3x3_AVX256_32f(__m256 a, __m256 b, __m256i highbit_epi32);`
`29`	`29`	`inline int sumitup_256_epi32(__m256i acc);`
`30`	`30`	`inline float sumitup_ps_256(__m256 acc);`
`31`	`31`