@@ -676,134 +676,6 @@ static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows
676676 }
677677}
678678
679- // LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.
680- // rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.
681- // cols: stride for buffers in DT type.
682- // outImageCols: stride for outImage buffer
683- static void CTCountOfBitsChangedSegment_AVX256_32f (float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int , int > blendingZone, unsigned char *outImage, const int cols, const int outImageCols)
684- {
685- int rowStartOffset = blendingZone.first - startRow;
686- int rowEndOffset = blendingZone.second - startRow;
687-
688- const __m256 zero_ps = _mm256_setzero_ps ();
689- const __m256 one_ps = _mm256_set1_ps (1.0 );
690- const int cmp_le = _CMP_LT_OQ;
691- const __m256i one_epi32 = _mm256_set1_epi32 (1 );
692-
693- for (auto r = rowStartOffset; r < rowEndOffset; r++)
694- {
695- const int c_limit = (cols - CTmargin);
696- int c_limit_avx = c_limit - (c_limit%8 )+1 ;
697- for (auto c = CTmargin; c < c_limit_avx; c+=8 )
698- {
699- __m256i hammingDistance_epi32 = _mm256_setzero_si256 ();
700-
701- __m256 center_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + c]);
702- __m256 n1_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c-1 )]);
703- __m256 n2_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c)]);
704- __m256 n3_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c+1 )]);
705- __m256 n4_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + (c-1 )]);
706- __m256 n5_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + (c+1 )]);
707- __m256 n6_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c-1 )]);
708- __m256 n7_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c)]);
709- __m256 n8_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c+1 )]);
710-
711- __m256 center_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + c]);
712- __m256 n1_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c-1 )]);
713- __m256 n2_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c)]);
714- __m256 n3_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c+1 )]);
715- __m256 n4_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + (c-1 )]);
716- __m256 n5_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + (c+1 )]);
717- __m256 n6_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c-1 )]);
718- __m256 n7_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c)]);
719- __m256 n8_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c+1 )]);
720-
721- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
722- _mm256_abs_epi32 (_mm256_sub_epi32 (
723- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n1_LR_ps, center_LR_ps, cmp_le))),
724- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n1_HR_ps, center_HR_ps, cmp_le))))));
725- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
726- _mm256_abs_epi32 (_mm256_sub_epi32 (
727- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n2_LR_ps, center_LR_ps, cmp_le))),
728- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n2_HR_ps, center_HR_ps, cmp_le))))));
729- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
730- _mm256_abs_epi32 (_mm256_sub_epi32 (
731- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n3_LR_ps, center_LR_ps, cmp_le))),
732- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n3_HR_ps, center_HR_ps, cmp_le))))));
733- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
734- _mm256_abs_epi32 (_mm256_sub_epi32 (
735- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n4_LR_ps, center_LR_ps, cmp_le))),
736- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n4_HR_ps, center_HR_ps, cmp_le))))));
737- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
738- _mm256_abs_epi32 (_mm256_sub_epi32 (
739- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n5_LR_ps, center_LR_ps, cmp_le))),
740- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n5_HR_ps, center_HR_ps, cmp_le))))));
741- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
742- _mm256_abs_epi32 (_mm256_sub_epi32 (
743- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n6_LR_ps, center_LR_ps, cmp_le))),
744- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n6_HR_ps, center_HR_ps, cmp_le))))));
745- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
746- _mm256_abs_epi32 (_mm256_sub_epi32 (
747- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n7_LR_ps, center_LR_ps, cmp_le))),
748- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n7_HR_ps, center_HR_ps, cmp_le))))));
749- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
750- _mm256_abs_epi32 (_mm256_sub_epi32 (
751- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n8_LR_ps, center_LR_ps, cmp_le))),
752- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n8_HR_ps, center_HR_ps, cmp_le))))));
753-
754- __m256 weight_ps = _mm256_div_ps ( _mm256_cvtepi32_ps (hammingDistance_epi32), _mm256_set1_ps ((float ) CTnumberofPixel) );
755- __m256 weight2_ps = _mm256_sub_ps (one_ps, weight_ps);
756- __m256 val_ps = _mm256_add_ps ( _mm256_mul_ps ( weight_ps, center_LR_ps),
757- _mm256_mul_ps (weight2_ps, center_HR_ps));
758- val_ps = _mm256_add_ps ( val_ps, _mm256_set1_ps (0.5 ));
759- __m256i val_epi32 = _mm256_cvtps_epi32 (_mm256_floor_ps (val_ps)), val_epi16, val_epu8, val_epu16, perm_epu;
760- int64_t val_epu8_64_t ;
761- if (gBitDepth == 8 ) {
762- val_epi32 = _mm256_max_epi32 (_mm256_min_epi32 ( val_epi32, _mm256_set1_epi32 (gMax8bit )), _mm256_set1_epi32 (gMin8bit ));
763- val_epi16 = _mm256_packs_epi32 (val_epi32,val_epi32);
764- val_epu8 = _mm256_packus_epi16 (val_epi16, val_epi16);
765- perm_epu = _mm256_permutevar8x32_epi32 (val_epu8, _mm256_setr_epi32 (0 ,4 ,0 ,4 ,0 ,4 ,0 ,4 ));
766- val_epu8_64_t = (_mm_cvtsi128_si64 (_mm256_extractf128_si256 (perm_epu, 0 )));
767- memcpy ((void *) &outImage[(startRow + r) * outImageCols + c], (void *) &val_epu8_64_t , 8 );
768- }
769- else {
770- val_epi32 = _mm256_max_epi32 (_mm256_min_epi32 ( val_epi32, _mm256_set1_epi32 (gMax16bit )), _mm256_set1_epi32 (gMin16bit ));
771- val_epu16 = _mm256_packus_epi32 (val_epi32,val_epi32);
772- perm_epu = _mm256_permute4x64_epi64 (val_epu16, 0x88 );
773- unsigned short *out = (unsigned short *)outImage;
774- _mm_storeu_si128 ((__m128i *) &out[(startRow + r) * outImageCols + c], _mm256_extractf128_si256 (perm_epu, 0 ));
775- }
776- }
777-
778- for (auto c = c_limit_avx; c < c_limit; c++) // handle edge, too small for SIMD
779- {
780- int hammingDistance = 0 ;
781-
782- // Census transform
783- for (int i = -CTmargin; i <= CTmargin; i++)
784- {
785- for (int j = -CTmargin; j <= CTmargin; j++)
786- {
787- if (unlikely (i == 0 && j == 0 ))
788- continue ;
789- hammingDistance += std::abs ((LRImage[(r + i) * cols + (c + j)] < LRImage[r * cols + c] ? 1 : 0 ) - (HRImage[(r + i) * cols + (c + j)] < HRImage[r * cols + c] ? 1 : 0 ));
790- }
791- }
792- float weight = (float )hammingDistance / (float )CTnumberofPixel;
793- float val = weight * LRImage[r * cols + c] + (1 - weight) * HRImage[r * cols + c];
794- val += 0.5 ; // to round the value
795- // convert 32f to 8bit/10bit
796- if (gBitDepth == 8 ) {
797- outImage[(startRow + r) * outImageCols + c] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
798- }
799- else {
800- unsigned short *out = (unsigned short *)outImage;
801- out[(startRow + r) * outImageCols + c] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
802- }
803- }
804- }
805- }
806-
807679int inline CTRandomness_C (float *inYUpscaled32f, int cols, int r, int c, int pix)
808680{
809681 // Census transform
@@ -849,113 +721,6 @@ inline float atan2Approximation(float y, float x)
849721}
850722#endif
851723
// 8-lane polynomial approximation of atan2(y, x).
// Selects a reduced ratio t and a base angle (pi/4 or 3pi/4) depending on the
// sign of x, applies a cubic correction, then mirrors the result when y < 0.
inline __m256 atan2Approximation_AVX256_32f(__m256 y_ps, __m256 x_ps)
{
    const __m256 zero_ps = _mm256_set1_ps(0.0f);
    const __m256 quarter_pi_ps = _mm256_set1_ps((float)(M_PI / 4.0));
    const __m256 three_quarter_pi_ps = _mm256_set1_ps((float)(3.0 * M_PI / 4.0));

    // |y| plus a tiny bias so the divisions below never see 0/0.
    __m256 abs_y_ps = _mm256_add_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.0f), y_ps),
                                    _mm256_set1_ps(1e-10f));

    __m256 x_neg_mask = _mm256_cmp_ps(x_ps, zero_ps, _CMP_LT_OQ);
    __m256 t_neg_x_ps = _mm256_div_ps(_mm256_add_ps(x_ps, abs_y_ps),
                                      _mm256_sub_ps(abs_y_ps, x_ps));
    __m256 t_pos_x_ps = _mm256_div_ps(_mm256_sub_ps(x_ps, abs_y_ps),
                                      _mm256_add_ps(x_ps, abs_y_ps));
    __m256 t_ps = _mm256_blendv_ps(t_pos_x_ps, t_neg_x_ps, x_neg_mask);
    __m256 angle_ps = _mm256_blendv_ps(quarter_pi_ps, three_quarter_pi_ps, x_neg_mask);

    // angle += (0.1963 * t^2 - 0.9817) * t   (cubic correction term)
    angle_ps = _mm256_fmadd_ps(
        _mm256_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(0.1963f), t_ps),
                        t_ps, _mm256_set1_ps(-0.9817f)),
        t_ps, angle_ps);

    // Negate the angle for the lower half-plane (y < 0).
    __m256 mirrored_ps = _mm256_mul_ps(_mm256_set1_ps(-1.0f), angle_ps);
    return _mm256_blendv_ps(angle_ps, mirrored_ps,
                            _mm256_cmp_ps(y_ps, zero_ps, _CMP_LT_OQ));
}
876-
877- void inline GetHashValue_AVX256_32f (float GTWG[8 ][4 ], int passIdx, int32_t *idx) {
878- const float one = 1.0 ;
879- const float two = 2.0 ;
880- const float four = 4.0 ;
881- const float pi = PI;
882- const float near_zero = 0.00000000000000001 ;
883- const __m256 zero_ps = _mm256_setzero_ps ();
884- const __m256i zero_epi32 = _mm256_setzero_si256 ();
885- const __m256i one_epi32 = _mm256_set1_epi32 (1 );
886- const __m256i two_epi32 = _mm256_set1_epi32 (2 );
887-
888- const int cmp_le = _CMP_LE_OQ;
889- const int cmp_gt = _CMP_GT_OQ;
890-
891- __m256 m_a_ps = _mm256_setr_ps (GTWG[0 ][0 ], GTWG[1 ][0 ], GTWG[2 ][0 ], GTWG[3 ][0 ],
892- GTWG[4 ][0 ], GTWG[5 ][0 ], GTWG[6 ][0 ], GTWG[7 ][0 ]);
893- __m256 m_b_ps = _mm256_setr_ps (GTWG[0 ][1 ], GTWG[1 ][1 ], GTWG[2 ][1 ], GTWG[3 ][1 ],
894- GTWG[4 ][1 ], GTWG[5 ][1 ], GTWG[6 ][1 ], GTWG[7 ][1 ]);
895- __m256 m_d_ps = _mm256_setr_ps (GTWG[0 ][3 ], GTWG[1 ][3 ], GTWG[2 ][3 ], GTWG[3 ][3 ],
896- GTWG[4 ][3 ], GTWG[5 ][3 ], GTWG[6 ][3 ], GTWG[7 ][3 ]);
897- __m256 T_ps = _mm256_add_ps (m_a_ps, m_d_ps);
898- __m256 D_ps = _mm256_sub_ps ( _mm256_mul_ps ( m_a_ps, m_d_ps),
899- _mm256_mul_ps ( m_b_ps, m_b_ps));
900-
901- __m256 sqr_ps = _mm256_sqrt_ps ( _mm256_sub_ps ( _mm256_div_ps ( _mm256_mul_ps (T_ps, T_ps),
902- _mm256_broadcast_ss (&four)), D_ps));
903-
904- __m256 half_T_ps = _mm256_div_ps ( T_ps, _mm256_broadcast_ss (&two) );
905- __m256 L1_ps = _mm256_add_ps ( half_T_ps, sqr_ps);
906- __m256 L2_ps = _mm256_sub_ps ( half_T_ps, sqr_ps);
907-
908- __m256 angle_ps = zero_ps;
909-
910- __m256 blend_ps = _mm256_blendv_ps ( _mm256_broadcast_ss (&one), _mm256_sub_ps (L1_ps, m_d_ps),
911- _mm256_cmp_ps (m_b_ps, zero_ps, _CMP_NEQ_OQ) );
912-
913- #ifdef USE_ATAN2_APPROX
914- angle_ps = atan2Approximation_AVX256_32f ( m_b_ps, blend_ps);
915- #else
916- angle_ps = _mm256_atan2_ps ( m_b_ps, blend_ps);
917- #endif
918-
919- angle_ps = _mm256_add_ps ( angle_ps, _mm256_blendv_ps ( zero_ps, _mm256_broadcast_ss (&pi),
920- _mm256_cmp_ps (angle_ps, zero_ps, _CMP_LT_OQ) ) );
921-
922- __m256 sqrtL1_ps = _mm256_sqrt_ps ( L1_ps );
923- __m256 sqrtL2_ps = _mm256_sqrt_ps ( L2_ps );
924- __m256 coherence_ps = _mm256_div_ps ( _mm256_sub_ps ( sqrtL1_ps, sqrtL2_ps ),
925- _mm256_add_ps ( _mm256_add_ps (sqrtL1_ps, sqrtL2_ps), _mm256_broadcast_ss (&near_zero) ) );
926- __m256 strength_ps = L1_ps;
927-
928- __m256i angleIdx_epi32 = _mm256_cvtps_epi32 ( _mm256_floor_ps (_mm256_mul_ps (angle_ps, _mm256_broadcast_ss (&gQAngle ))));
929-
930- angleIdx_epi32 = _mm256_min_epi32 ( _mm256_sub_epi32 ( _mm256_set1_epi32 (gQuantizationAngle ), _mm256_set1_epi32 (1 )),
931- _mm256_max_epi32 (angleIdx_epi32, zero_epi32 ) );
932-
933- // AFAIK, today QStr & QCoh are vectors of size 2. I think searchsorted can return an index of 0,1, or 2
934- float *gQStr_data , *gQCoh_data ;
935- if (passIdx == 0 ) gQStr_data = gQStr .data (); else gQStr_data = gQStr2 .data ();
936- if (passIdx == 0 ) gQCoh_data = gQCoh .data (); else gQCoh_data = gQCoh2 .data ();
937- __m256 gQStr1_ps = _mm256_broadcast_ss (gQStr_data );
938- __m256 gQStr2_ps = _mm256_broadcast_ss (gQStr_data + 1 );
939- __m256 gQCoh1_ps = _mm256_broadcast_ss (gQCoh_data );
940- __m256 gQCoh2_ps = _mm256_broadcast_ss (gQCoh_data + 1 );
941-
942- __m256i strengthIdx_epi32 = _mm256_sub_epi32 (two_epi32,
943- _mm256_add_epi32 (
944- _mm256_and_si256 (one_epi32, _mm256_castps_si256 ( _mm256_cmp_ps (strength_ps, gQStr1_ps , cmp_le))),
945- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (strength_ps, gQStr2_ps , cmp_le)))));
946- __m256i coherenceIdx_epi32 = _mm256_sub_epi32 (two_epi32,
947- _mm256_add_epi32 (
948- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (coherence_ps, gQCoh1_ps , cmp_le))),
949- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (coherence_ps, gQCoh2_ps , cmp_le)))));
950-
951- const __m256i gQuantizationCoherence_epi32 = _mm256_set1_epi32 (gQuantizationCoherence );
952- __m256i idx_epi32 = _mm256_mullo_epi32 (gQuantizationCoherence_epi32 ,
953- _mm256_mullo_epi32 (angleIdx_epi32, _mm256_set1_epi32 (gQuantizationStrength )));
954- idx_epi32 = _mm256_add_epi32 (coherenceIdx_epi32,
955- _mm256_add_epi32 (idx_epi32, _mm256_mullo_epi32 (strengthIdx_epi32, gQuantizationCoherence_epi32 )));
956- _mm256_storeu_si256 ((__m256i *)idx, idx_epi32);
957- }
958-
959724inline int int_floor (float x)
960725{
961726 int i = (int )x; /* truncate */
0 commit comments