@@ -676,134 +676,6 @@ static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows
676676 }
677677}
678678
679- // LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.
680- // rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.
681- // cols: stride for buffers in DT type.
682- // outImageCols: stride for outImage buffer
683- static void CTCountOfBitsChangedSegment_AVX256_32f (float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int , int > blendingZone, unsigned char *outImage, const int cols, const int outImageCols)
684- {
685- int rowStartOffset = blendingZone.first - startRow;
686- int rowEndOffset = blendingZone.second - startRow;
687-
688- const __m256 zero_ps = _mm256_setzero_ps ();
689- const __m256 one_ps = _mm256_set1_ps (1.0 );
690- const int cmp_le = _CMP_LT_OQ;
691- const __m256i one_epi32 = _mm256_set1_epi32 (1 );
692-
693- for (auto r = rowStartOffset; r < rowEndOffset; r++)
694- {
695- const int c_limit = (cols - CTmargin);
696- int c_limit_avx = c_limit - (c_limit%8 )+1 ;
697- for (auto c = CTmargin; c < c_limit_avx; c+=8 )
698- {
699- __m256i hammingDistance_epi32 = _mm256_setzero_si256 ();
700-
701- __m256 center_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + c]);
702- __m256 n1_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c-1 )]);
703- __m256 n2_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c)]);
704- __m256 n3_LR_ps = _mm256_loadu_ps ( &LRImage[(r-1 ) * cols + (c+1 )]);
705- __m256 n4_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + (c-1 )]);
706- __m256 n5_LR_ps = _mm256_loadu_ps ( &LRImage[(r) * cols + (c+1 )]);
707- __m256 n6_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c-1 )]);
708- __m256 n7_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c)]);
709- __m256 n8_LR_ps = _mm256_loadu_ps ( &LRImage[(r+1 ) * cols + (c+1 )]);
710-
711- __m256 center_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + c]);
712- __m256 n1_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c-1 )]);
713- __m256 n2_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c)]);
714- __m256 n3_HR_ps = _mm256_loadu_ps ( &HRImage[(r-1 ) * cols + (c+1 )]);
715- __m256 n4_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + (c-1 )]);
716- __m256 n5_HR_ps = _mm256_loadu_ps ( &HRImage[(r) * cols + (c+1 )]);
717- __m256 n6_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c-1 )]);
718- __m256 n7_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c)]);
719- __m256 n8_HR_ps = _mm256_loadu_ps ( &HRImage[(r+1 ) * cols + (c+1 )]);
720-
721- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
722- _mm256_abs_epi32 (_mm256_sub_epi32 (
723- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n1_LR_ps, center_LR_ps, cmp_le))),
724- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n1_HR_ps, center_HR_ps, cmp_le))))));
725- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
726- _mm256_abs_epi32 (_mm256_sub_epi32 (
727- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n2_LR_ps, center_LR_ps, cmp_le))),
728- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n2_HR_ps, center_HR_ps, cmp_le))))));
729- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
730- _mm256_abs_epi32 (_mm256_sub_epi32 (
731- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n3_LR_ps, center_LR_ps, cmp_le))),
732- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n3_HR_ps, center_HR_ps, cmp_le))))));
733- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
734- _mm256_abs_epi32 (_mm256_sub_epi32 (
735- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n4_LR_ps, center_LR_ps, cmp_le))),
736- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n4_HR_ps, center_HR_ps, cmp_le))))));
737- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
738- _mm256_abs_epi32 (_mm256_sub_epi32 (
739- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n5_LR_ps, center_LR_ps, cmp_le))),
740- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n5_HR_ps, center_HR_ps, cmp_le))))));
741- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
742- _mm256_abs_epi32 (_mm256_sub_epi32 (
743- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n6_LR_ps, center_LR_ps, cmp_le))),
744- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n6_HR_ps, center_HR_ps, cmp_le))))));
745- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
746- _mm256_abs_epi32 (_mm256_sub_epi32 (
747- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n7_LR_ps, center_LR_ps, cmp_le))),
748- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n7_HR_ps, center_HR_ps, cmp_le))))));
749- hammingDistance_epi32 = _mm256_add_epi32 ( hammingDistance_epi32,
750- _mm256_abs_epi32 (_mm256_sub_epi32 (
751- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n8_LR_ps, center_LR_ps, cmp_le))),
752- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (n8_HR_ps, center_HR_ps, cmp_le))))));
753-
754- __m256 weight_ps = _mm256_div_ps ( _mm256_cvtepi32_ps (hammingDistance_epi32), _mm256_set1_ps ((float ) CTnumberofPixel) );
755- __m256 weight2_ps = _mm256_sub_ps (one_ps, weight_ps);
756- __m256 val_ps = _mm256_add_ps ( _mm256_mul_ps ( weight_ps, center_LR_ps),
757- _mm256_mul_ps (weight2_ps, center_HR_ps));
758- val_ps = _mm256_add_ps ( val_ps, _mm256_set1_ps (0.5 ));
759- __m256i val_epi32 = _mm256_cvtps_epi32 (_mm256_floor_ps (val_ps)), val_epi16, val_epu8, val_epu16, perm_epu;
760- int64_t val_epu8_64_t ;
761- if (gBitDepth == 8 ) {
762- val_epi32 = _mm256_max_epi32 (_mm256_min_epi32 ( val_epi32, _mm256_set1_epi32 (gMax8bit )), _mm256_set1_epi32 (gMin8bit ));
763- val_epi16 = _mm256_packs_epi32 (val_epi32,val_epi32);
764- val_epu8 = _mm256_packus_epi16 (val_epi16, val_epi16);
765- perm_epu = _mm256_permutevar8x32_epi32 (val_epu8, _mm256_setr_epi32 (0 ,4 ,0 ,4 ,0 ,4 ,0 ,4 ));
766- val_epu8_64_t = (_mm_cvtsi128_si64 (_mm256_extractf128_si256 (perm_epu, 0 )));
767- memcpy ((void *) &outImage[(startRow + r) * outImageCols + c], (void *) &val_epu8_64_t , 8 );
768- }
769- else {
770- val_epi32 = _mm256_max_epi32 (_mm256_min_epi32 ( val_epi32, _mm256_set1_epi32 (gMax16bit )), _mm256_set1_epi32 (gMin16bit ));
771- val_epu16 = _mm256_packus_epi32 (val_epi32,val_epi32);
772- perm_epu = _mm256_permute4x64_epi64 (val_epu16, 0x88 );
773- unsigned short *out = (unsigned short *)outImage;
774- _mm_storeu_si128 ((__m128i *) &out[(startRow + r) * outImageCols + c], _mm256_extractf128_si256 (perm_epu, 0 ));
775- }
776- }
777-
778- for (auto c = c_limit_avx; c < c_limit; c++) // handle edge, too small for SIMD
779- {
780- int hammingDistance = 0 ;
781-
782- // Census transform
783- for (int i = -CTmargin; i <= CTmargin; i++)
784- {
785- for (int j = -CTmargin; j <= CTmargin; j++)
786- {
787- if (unlikely (i == 0 && j == 0 ))
788- continue ;
789- hammingDistance += std::abs ((LRImage[(r + i) * cols + (c + j)] < LRImage[r * cols + c] ? 1 : 0 ) - (HRImage[(r + i) * cols + (c + j)] < HRImage[r * cols + c] ? 1 : 0 ));
790- }
791- }
792- float weight = (float )hammingDistance / (float )CTnumberofPixel;
793- float val = weight * LRImage[r * cols + c] + (1 - weight) * HRImage[r * cols + c];
794- val += 0.5 ; // to round the value
795- // convert 32f to 8bit/10bit
796- if (gBitDepth == 8 ) {
797- outImage[(startRow + r) * outImageCols + c] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
798- }
799- else {
800- unsigned short *out = (unsigned short *)outImage;
801- out[(startRow + r) * outImageCols + c] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
802- }
803- }
804- }
805- }
806-
807679int inline CTRandomness_C (float *inYUpscaled32f, int cols, int r, int c, int pix)
808680{
809681 // Census transform
@@ -849,113 +721,6 @@ inline float atan2Approximation(float y, float x)
849721}
850722#endif
851723
// 8-lane polynomial approximation of atan2(y, x).
// Selects a reduced ratio t and a base angle (pi/4 or 3pi/4) depending on the
// sign of x, applies a cubic correction, then mirrors the result when y < 0.
inline __m256 atan2Approximation_AVX256_32f(__m256 y_ps, __m256 x_ps)
{
    const __m256 zero_ps = _mm256_set1_ps(0.0f);
    const __m256 quarter_pi_ps = _mm256_set1_ps((float)(M_PI / 4.0));
    const __m256 three_quarter_pi_ps = _mm256_set1_ps((float)(3.0 * M_PI / 4.0));

    // |y| plus a tiny bias so the divisions below never see 0/0.
    __m256 abs_y_ps = _mm256_add_ps(_mm256_andnot_ps(_mm256_set1_ps(-0.0f), y_ps),
                                    _mm256_set1_ps(1e-10f));

    __m256 x_neg_mask = _mm256_cmp_ps(x_ps, zero_ps, _CMP_LT_OQ);
    __m256 t_neg_x_ps = _mm256_div_ps(_mm256_add_ps(x_ps, abs_y_ps),
                                      _mm256_sub_ps(abs_y_ps, x_ps));
    __m256 t_pos_x_ps = _mm256_div_ps(_mm256_sub_ps(x_ps, abs_y_ps),
                                      _mm256_add_ps(x_ps, abs_y_ps));
    __m256 t_ps = _mm256_blendv_ps(t_pos_x_ps, t_neg_x_ps, x_neg_mask);
    __m256 angle_ps = _mm256_blendv_ps(quarter_pi_ps, three_quarter_pi_ps, x_neg_mask);

    // angle += (0.1963 * t^2 - 0.9817) * t   (cubic correction term)
    angle_ps = _mm256_fmadd_ps(
        _mm256_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(0.1963f), t_ps),
                        t_ps, _mm256_set1_ps(-0.9817f)),
        t_ps, angle_ps);

    // Negate the angle for the lower half-plane (y < 0).
    __m256 mirrored_ps = _mm256_mul_ps(_mm256_set1_ps(-1.0f), angle_ps);
    return _mm256_blendv_ps(angle_ps, mirrored_ps,
                            _mm256_cmp_ps(y_ps, zero_ps, _CMP_LT_OQ));
}
876-
877- void inline GetHashValue_AVX256_32f (float GTWG[8 ][4 ], int passIdx, int32_t *idx) {
878- const float one = 1.0 ;
879- const float two = 2.0 ;
880- const float four = 4.0 ;
881- const float pi = PI;
882- const float near_zero = 0.00000000000000001 ;
883- const __m256 zero_ps = _mm256_setzero_ps ();
884- const __m256i zero_epi32 = _mm256_setzero_si256 ();
885- const __m256i one_epi32 = _mm256_set1_epi32 (1 );
886- const __m256i two_epi32 = _mm256_set1_epi32 (2 );
887-
888- const int cmp_le = _CMP_LE_OQ;
889- const int cmp_gt = _CMP_GT_OQ;
890-
891- __m256 m_a_ps = _mm256_setr_ps (GTWG[0 ][0 ], GTWG[1 ][0 ], GTWG[2 ][0 ], GTWG[3 ][0 ],
892- GTWG[4 ][0 ], GTWG[5 ][0 ], GTWG[6 ][0 ], GTWG[7 ][0 ]);
893- __m256 m_b_ps = _mm256_setr_ps (GTWG[0 ][1 ], GTWG[1 ][1 ], GTWG[2 ][1 ], GTWG[3 ][1 ],
894- GTWG[4 ][1 ], GTWG[5 ][1 ], GTWG[6 ][1 ], GTWG[7 ][1 ]);
895- __m256 m_d_ps = _mm256_setr_ps (GTWG[0 ][3 ], GTWG[1 ][3 ], GTWG[2 ][3 ], GTWG[3 ][3 ],
896- GTWG[4 ][3 ], GTWG[5 ][3 ], GTWG[6 ][3 ], GTWG[7 ][3 ]);
897- __m256 T_ps = _mm256_add_ps (m_a_ps, m_d_ps);
898- __m256 D_ps = _mm256_sub_ps ( _mm256_mul_ps ( m_a_ps, m_d_ps),
899- _mm256_mul_ps ( m_b_ps, m_b_ps));
900-
901- __m256 sqr_ps = _mm256_sqrt_ps ( _mm256_sub_ps ( _mm256_div_ps ( _mm256_mul_ps (T_ps, T_ps),
902- _mm256_broadcast_ss (&four)), D_ps));
903-
904- __m256 half_T_ps = _mm256_div_ps ( T_ps, _mm256_broadcast_ss (&two) );
905- __m256 L1_ps = _mm256_add_ps ( half_T_ps, sqr_ps);
906- __m256 L2_ps = _mm256_sub_ps ( half_T_ps, sqr_ps);
907-
908- __m256 angle_ps = zero_ps;
909-
910- __m256 blend_ps = _mm256_blendv_ps ( _mm256_broadcast_ss (&one), _mm256_sub_ps (L1_ps, m_d_ps),
911- _mm256_cmp_ps (m_b_ps, zero_ps, _CMP_NEQ_OQ) );
912-
913- #ifdef USE_ATAN2_APPROX
914- angle_ps = atan2Approximation_AVX256_32f ( m_b_ps, blend_ps);
915- #else
916- angle_ps = _mm256_atan2_ps ( m_b_ps, blend_ps);
917- #endif
918-
919- angle_ps = _mm256_add_ps ( angle_ps, _mm256_blendv_ps ( zero_ps, _mm256_broadcast_ss (&pi),
920- _mm256_cmp_ps (angle_ps, zero_ps, _CMP_LT_OQ) ) );
921-
922- __m256 sqrtL1_ps = _mm256_sqrt_ps ( L1_ps );
923- __m256 sqrtL2_ps = _mm256_sqrt_ps ( L2_ps );
924- __m256 coherence_ps = _mm256_div_ps ( _mm256_sub_ps ( sqrtL1_ps, sqrtL2_ps ),
925- _mm256_add_ps ( _mm256_add_ps (sqrtL1_ps, sqrtL2_ps), _mm256_broadcast_ss (&near_zero) ) );
926- __m256 strength_ps = L1_ps;
927-
928- __m256i angleIdx_epi32 = _mm256_cvtps_epi32 ( _mm256_floor_ps (_mm256_mul_ps (angle_ps, _mm256_broadcast_ss (&gQAngle ))));
929-
930- angleIdx_epi32 = _mm256_min_epi32 ( _mm256_sub_epi32 ( _mm256_set1_epi32 (gQuantizationAngle ), _mm256_set1_epi32 (1 )),
931- _mm256_max_epi32 (angleIdx_epi32, zero_epi32 ) );
932-
933- // AFAIK, today QStr & QCoh are vectors of size 2. I think searchsorted can return an index of 0,1, or 2
934- float *gQStr_data , *gQCoh_data ;
935- if (passIdx == 0 ) gQStr_data = gQStr .data (); else gQStr_data = gQStr2 .data ();
936- if (passIdx == 0 ) gQCoh_data = gQCoh .data (); else gQCoh_data = gQCoh2 .data ();
937- __m256 gQStr1_ps = _mm256_broadcast_ss (gQStr_data );
938- __m256 gQStr2_ps = _mm256_broadcast_ss (gQStr_data + 1 );
939- __m256 gQCoh1_ps = _mm256_broadcast_ss (gQCoh_data );
940- __m256 gQCoh2_ps = _mm256_broadcast_ss (gQCoh_data + 1 );
941-
942- __m256i strengthIdx_epi32 = _mm256_sub_epi32 (two_epi32,
943- _mm256_add_epi32 (
944- _mm256_and_si256 (one_epi32, _mm256_castps_si256 ( _mm256_cmp_ps (strength_ps, gQStr1_ps , cmp_le))),
945- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (strength_ps, gQStr2_ps , cmp_le)))));
946- __m256i coherenceIdx_epi32 = _mm256_sub_epi32 (two_epi32,
947- _mm256_add_epi32 (
948- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (coherence_ps, gQCoh1_ps , cmp_le))),
949- _mm256_and_si256 (one_epi32, _mm256_castps_si256 (_mm256_cmp_ps (coherence_ps, gQCoh2_ps , cmp_le)))));
950-
951- const __m256i gQuantizationCoherence_epi32 = _mm256_set1_epi32 (gQuantizationCoherence );
952- __m256i idx_epi32 = _mm256_mullo_epi32 (gQuantizationCoherence_epi32 ,
953- _mm256_mullo_epi32 (angleIdx_epi32, _mm256_set1_epi32 (gQuantizationStrength )));
954- idx_epi32 = _mm256_add_epi32 (coherenceIdx_epi32,
955- _mm256_add_epi32 (idx_epi32, _mm256_mullo_epi32 (strengthIdx_epi32, gQuantizationCoherence_epi32 )));
956- _mm256_storeu_si256 ((__m256i *)idx, idx_epi32);
957- }
958-
959724inline int int_floor (float x)
960725{
961726 int i = (int )x; /* truncate */
0 commit comments