@@ -896,6 +896,134 @@ static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows
896896 }
897897}
898898
899+ // LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.
900+ // rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.
901+ // cols: stride for buffers in DT type.
902+ // outImageCols: stride for outImage buffer
903+ static void CTCountOfBitsChangedSegment_AVX256_32f (float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int , int > blendingZone, unsigned char *outImage, const int cols, const int outImageCols)
904+ {
905+ int rowStartOffset = blendingZone.first - startRow;
906+ int rowEndOffset = blendingZone.second - startRow;
907+
908+ const __m256 zero_ps = _mm256_setzero_ps ();
909+ const __m256 one_ps = _mm256_set1_ps (1.0 );
910+ const int cmp_le = _CMP_LT_OQ;
911+ const __m256i one_epi32 = _mm256_set1_epi32 (1 );
912+
913+ for (auto r = rowStartOffset; r < rowEndOffset; r++)
914+ {
915+ const int c_limit = (cols - CTmargin);
916+ int c_limit_avx = c_limit - (c_limit%8 )+1 ;
917+ for (auto c = CTmargin; c < c_limit_avx; c+=8 )
918+ {
919+ __m256i hammingDistance_epi32 = _mm256_setzero_si256 ();
920+
921+ __m256 center_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + c]);
922+ __m256 n1_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c-1 )]);
923+ __m256 n2_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c)]);
924+ __m256 n3_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c+1 )]);
925+ __m256 n4_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + (c-1 )]);
926+ __m256 n5_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + (c+1 )]);
927+ __m256 n6_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c-1 )]);
928+ __m256 n7_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c)]);
929+ __m256 n8_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c+1 )]);
930+
931+ __m256 center_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + c]);
932+ __m256 n1_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c-1 )]);
933+ __m256 n2_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c)]);
934+ __m256 n3_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c+1 )]);
935+ __m256 n4_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + (c-1 )]);
936+ __m256 n5_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + (c+1 )]);
937+ __m256 n6_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c-1 )]);
938+ __m256 n7_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c)]);
939+ __m256 n8_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c+1 )]);
940+
941+ hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
942+ _mm256_abs_epi32 (_mm256_sub_epi32 (
943+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n1_LR_ps, center_LR_ps, cmp_le))),
944+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n1_HR_ps, center_HR_ps, cmp_le))))));
945+ hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
946+ _mm256_abs_epi32 (_mm256_sub_epi32 (
947+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n2_LR_ps, center_LR_ps, cmp_le))),
948+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n2_HR_ps, center_HR_ps, cmp_le))))));
949+ hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
950+ _mm256_abs_epi32 (_mm256_sub_epi32 (
951+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n3_LR_ps, center_LR_ps, cmp_le))),
952+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n3_HR_ps, center_HR_ps, cmp_le))))));
953+ hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
954+ _mm256_abs_epi32 (_mm256_sub_epi32 (
955+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n4_LR_ps, center_LR_ps, cmp_le))),
956+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n4_HR_ps, center_HR_ps, cmp_le))))));
957+ hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
958+ _mm256_abs_epi32 (_mm256_sub_epi32 (
959+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n5_LR_ps, center_LR_ps, cmp_le))),
960+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n5_HR_ps, center_HR_ps, cmp_le))))));
961+ hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
962+ _mm256_abs_epi32 (_mm256_sub_epi32 (
963+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n6_LR_ps, center_LR_ps, cmp_le))),
964+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n6_HR_ps, center_HR_ps, cmp_le))))));
965+ hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
966+ _mm256_abs_epi32 (_mm256_sub_epi32 (
967+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n7_LR_ps, center_LR_ps, cmp_le))),
968+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n7_HR_ps, center_HR_ps, cmp_le))))));
969+ hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
970+ _mm256_abs_epi32 (_mm256_sub_epi32 (
971+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n8_LR_ps, center_LR_ps, cmp_le))),
972+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n8_HR_ps, center_HR_ps, cmp_le))))));
973+
974+ __m256 weight_ps = _mm256_div_ps ( _mm256_cvtepi32_ps (hammingDistance_epi32), _mm256_set1_ps ((float ) CTnumberofPixel) );
975+ __m256 weight2_ps = _mm256_sub_ps (one_ps, weight_ps);
976+ __m256 val_ps = _mm256_add_ps ( _mm256_mul_ps ( weight_ps, center_LR_ps),
977+ _mm256_mul_ps (weight2_ps, center_HR_ps));
978+ val_ps = _mm256_add_ps ( val_ps, _mm256_set1_ps (0.5 ));
979+ __m256i val_epi32 = _mm256_cvtps_epi32 (_mm256_floor_ps (val_ps)), val_epi16, val_epu8, val_epu16, perm_epu;
980+ int64_t val_epu8_64_t ;
981+ if (gBitDepth == 8 ) {
982+ val_epi32 = _mm256_max_epi32 (_mm256_min_epi32 ( val_epi32, _mm256_set1_epi32 (gMax8bit )), _mm256_set1_epi32 (gMin8bit ));
983+ val_epi16 = _mm256_packs_epi32 (val_epi32,val_epi32);
984+ val_epu8 = _mm256_packus_epi16 (val_epi16, val_epi16);
985+ perm_epu = _mm256_permutevar8x32_epi32 (val_epu8, _mm256_setr_epi32 (0 ,4 ,0 ,4 ,0 ,4 ,0 ,4 ));
986+ val_epu8_64_t = (_mm_cvtsi128_si64 (_mm256_extractf128_si256 (perm_epu, 0 )));
987+ memcpy ((void *) &outImage[(startRow + r) * outImageCols + c], (void *) &val_epu8_64_t , 8 );
988+ }
989+ else {
990+ val_epi32 = _mm256_max_epi32 (_mm256_min_epi32 ( val_epi32, _mm256_set1_epi32 (gMax16bit )), _mm256_set1_epi32 (gMin16bit ));
991+ val_epu16 = _mm256_packus_epi32 (val_epi32,val_epi32);
992+ perm_epu = _mm256_permute4x64_epi64 (val_epu16, 0x88 );
993+ unsigned short *out = (unsigned short *)outImage;
994+ _mm_storeu_si128 ((__m128i *) &out[(startRow + r) * outImageCols + c], _mm256_extractf128_si256 (perm_epu, 0 ));
995+ }
996+ }
997+
998+ for (auto c = c_limit_avx; c < c_limit; c++) // handle edge, too small for SIMD
999+ {
1000+ int hammingDistance = 0 ;
1001+
1002+ // Census transform
1003+ for (int i = -CTmargin; i <= CTmargin; i++)
1004+ {
1005+ for (int j = -CTmargin; j <= CTmargin; j++)
1006+ {
1007+ if (unlikely (i == 0 && j == 0 ))
1008+ continue ;
1009+ hammingDistance += std::abs ((LRImage[(r + i) * cols + (c + j)] < LRImage[r * cols + c] ? 1 : 0 ) - (HRImage[(r + i) * cols + (c + j)] < HRImage[r * cols + c] ? 1 : 0 ));
1010+ }
1011+ }
1012+ float weight = (float )hammingDistance / (float )CTnumberofPixel;
1013+ float val = weight * LRImage[r * cols + c] + (1 - weight) * HRImage[r * cols + c];
1014+ val += 0.5 ; // to round the value
1015+ // convert 32f to 8bit/10bit
1016+ if (gBitDepth == 8 ) {
1017+ outImage[(startRow + r) * outImageCols + c] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
1018+ }
1019+ else {
1020+ unsigned short *out = (unsigned short *)outImage;
1021+ out[(startRow + r) * outImageCols + c] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
1022+ }
1023+ }
1024+ }
1025+ }
1026+
8991027int inline CTRandomness_C (float *inYUpscaled32f, int cols, int r, int c, int pix)
9001028{
9011029 // Census transform
@@ -980,6 +1108,113 @@ inline float atan2Approximation(float y, float x)
9801108}
9811109#endif
9821110
1111+ inline __m256 atan2Approximation_AVX256_32f (__m256 y_ps, __m256 x_ps)
1112+ {
1113+ const float ONEQTR_PI = M_PI / 4.0 ;
1114+ const float THRQTR_PI = 3.0 * M_PI / 4.0 ;
1115+ const __m256 zero_ps = _mm256_set1_ps (0.0 );
1116+ const __m256 oneqtr_pi_ps = _mm256_set1_ps (ONEQTR_PI);
1117+ const __m256 thrqtr_pi_ps = _mm256_set1_ps (THRQTR_PI);
1118+
1119+ __m256 abs_y_ps = _mm256_add_ps ( _mm256_andnot_ps ( _mm256_set1_ps (-0 .0f ), y_ps),
1120+ _mm256_set1_ps (1e-10f ));
1121+
1122+ __m256 r_cond1_ps = _mm256_div_ps ( _mm256_add_ps (x_ps, abs_y_ps), _mm256_sub_ps (abs_y_ps, x_ps));
1123+ __m256 r_cond2_ps = _mm256_div_ps ( _mm256_sub_ps (x_ps, abs_y_ps), _mm256_add_ps (x_ps, abs_y_ps));
1124+ __m256 r_cmp_ps = _mm256_cmp_ps (x_ps, zero_ps, _CMP_LT_OQ);
1125+ __m256 r_ps = _mm256_blendv_ps ( r_cond2_ps, r_cond1_ps, r_cmp_ps);
1126+ __m256 angle_ps = _mm256_blendv_ps ( oneqtr_pi_ps, thrqtr_pi_ps, r_cmp_ps );
1127+
1128+ angle_ps = _mm256_fmadd_ps (_mm256_fmadd_ps (_mm256_mul_ps (_mm256_set1_ps (0 .1963f ), r_ps),
1129+ r_ps, _mm256_set1_ps (-0 .9817f )),
1130+ r_ps, angle_ps);
1131+
1132+ __m256 neg_angle_ps = _mm256_mul_ps (_mm256_set1_ps (-1 ), angle_ps);
1133+ return _mm256_blendv_ps ( angle_ps, neg_angle_ps, _mm256_cmp_ps (y_ps, zero_ps, _CMP_LT_OQ));
1134+ }
1135+
1136+ void inline GetHashValue_AVX256_32f (float GTWG[8 ][4 ], int passIdx, int32_t *idx) {
1137+ const float one = 1.0 ;
1138+ const float two = 2.0 ;
1139+ const float four = 4.0 ;
1140+ const float pi = PI;
1141+ const float near_zero = 0.00000000000000001 ;
1142+ const __m256 zero_ps = _mm256_setzero_ps ();
1143+ const __m256i zero_epi32 = _mm256_setzero_si256 ();
1144+ const __m256i one_epi32 = _mm256_set1_epi32 (1 );
1145+ const __m256i two_epi32 = _mm256_set1_epi32 (2 );
1146+
1147+ const int cmp_le = _CMP_LE_OQ;
1148+ const int cmp_gt = _CMP_GT_OQ;
1149+
1150+ __m256 m_a_ps = _mm256_setr_ps (GTWG[0 ][0 ], GTWG[1 ][0 ], GTWG[2 ][0 ], GTWG[3 ][0 ],
1151+ GTWG[4 ][0 ], GTWG[5 ][0 ], GTWG[6 ][0 ], GTWG[7 ][0 ]);
1152+ __m256 m_b_ps = _mm256_setr_ps (GTWG[0 ][1 ], GTWG[1 ][1 ], GTWG[2 ][1 ], GTWG[3 ][1 ],
1153+ GTWG[4 ][1 ], GTWG[5 ][1 ], GTWG[6 ][1 ], GTWG[7 ][1 ]);
1154+ __m256 m_d_ps = _mm256_setr_ps (GTWG[0 ][3 ], GTWG[1 ][3 ], GTWG[2 ][3 ], GTWG[3 ][3 ],
1155+ GTWG[4 ][3 ], GTWG[5 ][3 ], GTWG[6 ][3 ], GTWG[7 ][3 ]);
1156+ __m256 T_ps = _mm256_add_ps (m_a_ps, m_d_ps);
1157+ __m256 D_ps = _mm256_sub_ps ( _mm256_mul_ps ( m_a_ps, m_d_ps),
1158+ _mm256_mul_ps ( m_b_ps, m_b_ps));
1159+
1160+ __m256 sqr_ps = _mm256_sqrt_ps ( _mm256_sub_ps ( _mm256_div_ps ( _mm256_mul_ps (T_ps, T_ps),
1161+ _mm256_broadcast_ss (&four)), D_ps));
1162+
1163+ __m256 half_T_ps = _mm256_div_ps ( T_ps, _mm256_broadcast_ss (&two) );
1164+ __m256 L1_ps = _mm256_add_ps ( half_T_ps, sqr_ps);
1165+ __m256 L2_ps = _mm256_sub_ps ( half_T_ps, sqr_ps);
1166+
1167+ __m256 angle_ps = zero_ps;
1168+
1169+ __m256 blend_ps = _mm256_blendv_ps ( _mm256_broadcast_ss (&one), _mm256_sub_ps (L1_ps, m_d_ps),
1170+ _mm256_cmp_ps (m_b_ps, zero_ps, _CMP_NEQ_OQ) );
1171+
1172+ #ifdef USE_ATAN2_APPROX
1173+ angle_ps = atan2Approximation_AVX256_32f ( m_b_ps, blend_ps);
1174+ #else
1175+ angle_ps = _mm256_atan2_ps ( m_b_ps, blend_ps);
1176+ #endif
1177+
1178+ angle_ps = _mm256_add_ps ( angle_ps, _mm256_blendv_ps ( zero_ps, _mm256_broadcast_ss (&pi),
1179+ _mm256_cmp_ps (angle_ps, zero_ps, _CMP_LT_OQ) ) );
1180+
1181+ __m256 sqrtL1_ps = _mm256_sqrt_ps ( L1_ps );
1182+ __m256 sqrtL2_ps = _mm256_sqrt_ps ( L2_ps );
1183+ __m256 coherence_ps = _mm256_div_ps ( _mm256_sub_ps ( sqrtL1_ps, sqrtL2_ps ),
1184+ _mm256_add_ps ( _mm256_add_ps (sqrtL1_ps, sqrtL2_ps), _mm256_broadcast_ss (&near_zero) ) );
1185+ __m256 strength_ps = L1_ps;
1186+
1187+ __m256i angleIdx_epi32 = _mm256_cvtps_epi32 ( _mm256_floor_ps (_mm256_mul_ps (angle_ps, _mm256_broadcast_ss (&gQAngle ))));
1188+
1189+ angleIdx_epi32 = _mm256_min_epi32 ( _mm256_sub_epi32 ( _mm256_set1_epi32 (gQuantizationAngle ), _mm256_set1_epi32 (1 )),
1190+ _mm256_max_epi32 (angleIdx_epi32, zero_epi32 ) );
1191+
1192+ // AFAIK, today QStr & QCoh are vectors of size 2. I think searchsorted can return an index of 0,1, or 2
1193+ float *gQStr_data , *gQCoh_data ;
1194+ if (passIdx == 0 ) gQStr_data = gQStr .data (); else gQStr_data = gQStr2 .data ();
1195+ if (passIdx == 0 ) gQCoh_data = gQCoh .data (); else gQCoh_data = gQCoh2 .data ();
1196+ __m256 gQStr1_ps = _mm256_broadcast_ss (gQStr_data );
1197+ __m256 gQStr2_ps = _mm256_broadcast_ss (gQStr_data + 1 );
1198+ __m256 gQCoh1_ps = _mm256_broadcast_ss (gQCoh_data );
1199+ __m256 gQCoh2_ps = _mm256_broadcast_ss (gQCoh_data + 1 );
1200+
1201+ __m256i strengthIdx_epi32 = _mm256_sub_epi32 (two_epi32,
1202+ _mm256_add_epi32 (
1203+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 ( _mm256_cmp_ps (strength_ps, gQStr1_ps , cmp_le))),
1204+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (strength_ps, gQStr2_ps , cmp_le)))));
1205+ __m256i coherenceIdx_epi32 = _mm256_sub_epi32 (two_epi32,
1206+ _mm256_add_epi32 (
1207+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (coherence_ps, gQCoh1_ps , cmp_le))),
1208+ _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (coherence_ps, gQCoh2_ps , cmp_le)))));
1209+
1210+ const __m256i gQuantizationCoherence_epi32 = _mm256_set1_epi32 (gQuantizationCoherence );
1211+ __m256i idx_epi32 = _mm256_mullo_epi32 (gQuantizationCoherence_epi32 ,
1212+ _mm256_mullo_epi32 (angleIdx_epi32, _mm256_set1_epi32 (gQuantizationStrength )));
1213+ idx_epi32 = _mm256_add_epi32 (coherenceIdx_epi32,
1214+ _mm256_add_epi32 (idx_epi32, _mm256_mullo_epi32 (strengthIdx_epi32, gQuantizationCoherence_epi32 )));
1215+ _mm256_storeu_si256 ((__m256i *)idx, idx_epi32);
1216+ }
1217+
9831218inline int int_floor (float x)
9841219{
9851220 int i = (int )x; /* truncate */
@@ -1345,10 +1580,10 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
13451580 computeGTWG_Segment (pSeg32f, rows, cols, rOffset, c + 2 * pix, >WG[2 * pix], &pixbuf[2 * pix][0 ], &pixbuf[2 * pix + 1 ][0 ]);
13461581 }
13471582
1348- #pragma unroll(unrollSizePatchBased)
1583+ GetHashValue_AVX256_32f (GTWG, passIdx, hashValue);
1584+
13491585 for (pix = 0 ; pix < unrollSizePatchBased; pix++)
13501586 {
1351- hashValue[pix] = GetHashValue (GTWG[pix], passIdx);
13521587 if (passIdx == 0 )
13531588 fbase[pix] = gFilterBuckets [hashValue[pix]][pixelType[pix]];
13541589 else
@@ -1405,9 +1640,7 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
14051640 if (blendingMode == CountOfBitsChanged)
14061641 {
14071642 int segStart = gIppCtx .segZones [passIdx][threadIdx].scaleStartRow ;
1408- CTCountOfBitsChangedSegment<float >(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx .segZones [passIdx][threadIdx].blendingStartRow , gIppCtx .segZones [passIdx][threadIdx].blendingEndRow }, outY->pData , cols, outY->step );
1409- // No improve with AVX2
1410- // CTCountOfBitsChangedSegment_AVX2(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[threadIdx].blendingStartRow, gIppCtx.segZones[threadIdx].blendingEndRow}, outY->pData, cols, outY->step);
1643+ CTCountOfBitsChangedSegment_AVX256_32f (pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx .segZones [passIdx][threadIdx].blendingStartRow , gIppCtx .segZones [passIdx][threadIdx].blendingEndRow }, outY->pData , cols, outY->step );
14111644 }
14121645
14131646 threadStatus[threadIdx] = 1 ;
0 commit comments