Commit 140f955

move the avx2 functions from Raisr.cpp to Raisr_AVX256.cpp
Signed-off-by: Xiaoxia Liang <xiaoxia.liang@intel.com>
1 parent 17bc385 commit 140f955

3 files changed: 240 additions & 235 deletions

File tree

Library/Raisr.cpp: 0 additions & 235 deletions
@@ -676,134 +676,6 @@ static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows
     }
 }
 
-// LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.
-// rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.
-// cols: stride for buffers in DT type.
-// outImageCols: stride for outImage buffer
-static void CTCountOfBitsChangedSegment_AVX256_32f(float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols, const int outImageCols)
-{
-    int rowStartOffset = blendingZone.first - startRow;
-    int rowEndOffset = blendingZone.second - startRow;
-
-    const __m256 zero_ps = _mm256_setzero_ps();
-    const __m256 one_ps = _mm256_set1_ps(1.0);
-    const int cmp_le = _CMP_LT_OQ;
-    const __m256i one_epi32 = _mm256_set1_epi32(1);
-
-    for (auto r = rowStartOffset; r < rowEndOffset; r++)
-    {
-        const int c_limit = (cols - CTmargin);
-        int c_limit_avx = c_limit - (c_limit%8)+1;
-        for (auto c = CTmargin; c < c_limit_avx; c+=8)
-        {
-            __m256i hammingDistance_epi32 = _mm256_setzero_si256();
-
-            __m256 center_LR_ps = _mm256_loadu_ps( &LRImage[(r) * cols + c]);
-            __m256 n1_LR_ps = _mm256_loadu_ps( &LRImage[(r-1) * cols + (c-1)]);
-            __m256 n2_LR_ps = _mm256_loadu_ps( &LRImage[(r-1) * cols + (c)]);
-            __m256 n3_LR_ps = _mm256_loadu_ps( &LRImage[(r-1) * cols + (c+1)]);
-            __m256 n4_LR_ps = _mm256_loadu_ps( &LRImage[(r) * cols + (c-1)]);
-            __m256 n5_LR_ps = _mm256_loadu_ps( &LRImage[(r) * cols + (c+1)]);
-            __m256 n6_LR_ps = _mm256_loadu_ps( &LRImage[(r+1) * cols + (c-1)]);
-            __m256 n7_LR_ps = _mm256_loadu_ps( &LRImage[(r+1) * cols + (c)]);
-            __m256 n8_LR_ps = _mm256_loadu_ps( &LRImage[(r+1) * cols + (c+1)]);
-
-            __m256 center_HR_ps = _mm256_loadu_ps( &HRImage[(r) * cols + c]);
-            __m256 n1_HR_ps = _mm256_loadu_ps( &HRImage[(r-1) * cols + (c-1)]);
-            __m256 n2_HR_ps = _mm256_loadu_ps( &HRImage[(r-1) * cols + (c)]);
-            __m256 n3_HR_ps = _mm256_loadu_ps( &HRImage[(r-1) * cols + (c+1)]);
-            __m256 n4_HR_ps = _mm256_loadu_ps( &HRImage[(r) * cols + (c-1)]);
-            __m256 n5_HR_ps = _mm256_loadu_ps( &HRImage[(r) * cols + (c+1)]);
-            __m256 n6_HR_ps = _mm256_loadu_ps( &HRImage[(r+1) * cols + (c-1)]);
-            __m256 n7_HR_ps = _mm256_loadu_ps( &HRImage[(r+1) * cols + (c)]);
-            __m256 n8_HR_ps = _mm256_loadu_ps( &HRImage[(r+1) * cols + (c+1)]);
-
-            hammingDistance_epi32 = _mm256_add_epi32( hammingDistance_epi32,
-                _mm256_abs_epi32(_mm256_sub_epi32(
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n1_LR_ps, center_LR_ps, cmp_le))),
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n1_HR_ps, center_HR_ps, cmp_le))))));
-            hammingDistance_epi32 = _mm256_add_epi32( hammingDistance_epi32,
-                _mm256_abs_epi32(_mm256_sub_epi32(
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n2_LR_ps, center_LR_ps, cmp_le))),
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n2_HR_ps, center_HR_ps, cmp_le))))));
-            hammingDistance_epi32 = _mm256_add_epi32( hammingDistance_epi32,
-                _mm256_abs_epi32(_mm256_sub_epi32(
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n3_LR_ps, center_LR_ps, cmp_le))),
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n3_HR_ps, center_HR_ps, cmp_le))))));
-            hammingDistance_epi32 = _mm256_add_epi32( hammingDistance_epi32,
-                _mm256_abs_epi32(_mm256_sub_epi32(
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n4_LR_ps, center_LR_ps, cmp_le))),
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n4_HR_ps, center_HR_ps, cmp_le))))));
-            hammingDistance_epi32 = _mm256_add_epi32( hammingDistance_epi32,
-                _mm256_abs_epi32(_mm256_sub_epi32(
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n5_LR_ps, center_LR_ps, cmp_le))),
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n5_HR_ps, center_HR_ps, cmp_le))))));
-            hammingDistance_epi32 = _mm256_add_epi32( hammingDistance_epi32,
-                _mm256_abs_epi32(_mm256_sub_epi32(
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n6_LR_ps, center_LR_ps, cmp_le))),
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n6_HR_ps, center_HR_ps, cmp_le))))));
-            hammingDistance_epi32 = _mm256_add_epi32( hammingDistance_epi32,
-                _mm256_abs_epi32(_mm256_sub_epi32(
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n7_LR_ps, center_LR_ps, cmp_le))),
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n7_HR_ps, center_HR_ps, cmp_le))))));
-            hammingDistance_epi32 = _mm256_add_epi32( hammingDistance_epi32,
-                _mm256_abs_epi32(_mm256_sub_epi32(
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n8_LR_ps, center_LR_ps, cmp_le))),
-                    _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(n8_HR_ps, center_HR_ps, cmp_le))))));
-
-            __m256 weight_ps = _mm256_div_ps( _mm256_cvtepi32_ps(hammingDistance_epi32), _mm256_set1_ps((float) CTnumberofPixel) );
-            __m256 weight2_ps = _mm256_sub_ps(one_ps, weight_ps);
-            __m256 val_ps = _mm256_add_ps( _mm256_mul_ps( weight_ps, center_LR_ps),
-                                           _mm256_mul_ps(weight2_ps, center_HR_ps));
-            val_ps = _mm256_add_ps( val_ps, _mm256_set1_ps(0.5));
-            __m256i val_epi32 = _mm256_cvtps_epi32(_mm256_floor_ps(val_ps)), val_epi16, val_epu8, val_epu16, perm_epu;
-            int64_t val_epu8_64_t;
-            if (gBitDepth == 8) {
-                val_epi32 = _mm256_max_epi32(_mm256_min_epi32( val_epi32, _mm256_set1_epi32(gMax8bit)), _mm256_set1_epi32(gMin8bit));
-                val_epi16 = _mm256_packs_epi32(val_epi32,val_epi32);
-                val_epu8 = _mm256_packus_epi16(val_epi16, val_epi16);
-                perm_epu = _mm256_permutevar8x32_epi32(val_epu8, _mm256_setr_epi32(0,4,0,4,0,4,0,4));
-                val_epu8_64_t = (_mm_cvtsi128_si64(_mm256_extractf128_si256(perm_epu, 0)));
-                memcpy((void *) &outImage[(startRow + r) * outImageCols + c], (void *) &val_epu8_64_t, 8);
-            }
-            else {
-                val_epi32 = _mm256_max_epi32(_mm256_min_epi32( val_epi32, _mm256_set1_epi32(gMax16bit)), _mm256_set1_epi32(gMin16bit));
-                val_epu16 = _mm256_packus_epi32(val_epi32,val_epi32);
-                perm_epu = _mm256_permute4x64_epi64(val_epu16, 0x88);
-                unsigned short *out = (unsigned short *)outImage;
-                _mm_storeu_si128((__m128i *) &out[(startRow + r) * outImageCols + c], _mm256_extractf128_si256(perm_epu, 0));
-            }
-        }
-
-        for (auto c = c_limit_avx; c < c_limit; c++) // handle edge, too small for SIMD
-        {
-            int hammingDistance = 0;
-
-            // Census transform
-            for (int i = -CTmargin; i <= CTmargin; i++)
-            {
-                for (int j = -CTmargin; j <= CTmargin; j++)
-                {
-                    if (unlikely(i == 0 && j == 0))
-                        continue;
-                    hammingDistance += std::abs((LRImage[(r + i) * cols + (c + j)] < LRImage[r * cols + c] ? 1 : 0) - (HRImage[(r + i) * cols + (c + j)] < HRImage[r * cols + c] ? 1 : 0));
-                }
-            }
-            float weight = (float)hammingDistance / (float)CTnumberofPixel;
-            float val = weight * LRImage[r * cols + c] + (1 - weight) * HRImage[r * cols + c];
-            val += 0.5; // to round the value
-            //convert 32f to 8bit/10bit
-            if (gBitDepth == 8) {
-                outImage[(startRow + r) * outImageCols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
-            }
-            else {
-                unsigned short *out = (unsigned short *)outImage;
-                out[(startRow + r) * outImageCols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
-            }
-        }
-    }
-}
-
 int inline CTRandomness_C(float *inYUpscaled32f, int cols, int r, int c, int pix)
 {
     // Census transform
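
Note for readers tracking the move: each 8-lane iteration of the deleted kernel above computes exactly the per-pixel census-transform blend of its scalar tail loop. A minimal scalar sketch of that per-pixel math, assuming a 3x3 census window (CTmargin == 1, CTnumberofPixel == 8) as the tail loop implies; censusBlendPixel is an illustrative name, not a function in the library:

#include <cstdlib>

// Scalar equivalent of one SIMD lane: count census bits that RAISR flipped,
// then blend the cheap upscale (LR) with the refined image (HR) accordingly.
static float censusBlendPixel(const float *LR, const float *HR,
                              int cols, int r, int c)
{
    int hamming = 0;
    for (int i = -1; i <= 1; i++)
    {
        for (int j = -1; j <= 1; j++)
        {
            if (i == 0 && j == 0)
                continue;
            // One census bit per image: is this neighbor darker than the center?
            int lrBit = LR[(r + i) * cols + (c + j)] < LR[r * cols + c];
            int hrBit = HR[(r + i) * cols + (c + j)] < HR[r * cols + c];
            hamming += std::abs(lrBit - hrBit); // bit changed between LR and HR
        }
    }
    // The more census bits changed, the more the output leans on the cheap
    // upscale; the +0.5 makes the later truncation round to nearest.
    float w = (float)hamming / 8.0f;
    return w * LR[r * cols + c] + (1.0f - w) * HR[r * cols + c] + 0.5f;
}

The AVX2 path produces the same values; it simply evaluates eight columns at once and packs the clamped results into 8-bit or 16-bit stores.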
@@ -849,113 +721,6 @@ inline float atan2Approximation(float y, float x)
 }
 #endif
 
-inline __m256 atan2Approximation_AVX256_32f(__m256 y_ps, __m256 x_ps)
-{
-    const float ONEQTR_PI = M_PI / 4.0;
-    const float THRQTR_PI = 3.0 * M_PI / 4.0;
-    const __m256 zero_ps = _mm256_set1_ps(0.0);
-    const __m256 oneqtr_pi_ps = _mm256_set1_ps(ONEQTR_PI);
-    const __m256 thrqtr_pi_ps = _mm256_set1_ps(THRQTR_PI);
-
-    __m256 abs_y_ps = _mm256_add_ps( _mm256_andnot_ps( _mm256_set1_ps(-0.0f), y_ps),
-                                     _mm256_set1_ps(1e-10f));
-
-    __m256 r_cond1_ps = _mm256_div_ps( _mm256_add_ps(x_ps, abs_y_ps), _mm256_sub_ps(abs_y_ps, x_ps));
-    __m256 r_cond2_ps = _mm256_div_ps( _mm256_sub_ps(x_ps, abs_y_ps), _mm256_add_ps(x_ps, abs_y_ps));
-    __m256 r_cmp_ps = _mm256_cmp_ps(x_ps, zero_ps, _CMP_LT_OQ);
-    __m256 r_ps = _mm256_blendv_ps( r_cond2_ps, r_cond1_ps, r_cmp_ps);
-    __m256 angle_ps = _mm256_blendv_ps( oneqtr_pi_ps, thrqtr_pi_ps, r_cmp_ps );
-
-    angle_ps = _mm256_fmadd_ps(_mm256_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(0.1963f), r_ps),
-                               r_ps, _mm256_set1_ps(-0.9817f)),
-                               r_ps, angle_ps);
-
-    __m256 neg_angle_ps = _mm256_mul_ps(_mm256_set1_ps(-1), angle_ps);
-    return _mm256_blendv_ps( angle_ps, neg_angle_ps, _mm256_cmp_ps(y_ps, zero_ps, _CMP_LT_OQ));
-}
-
-void inline GetHashValue_AVX256_32f(float GTWG[8][4], int passIdx, int32_t *idx) {
-    const float one = 1.0;
-    const float two = 2.0;
-    const float four = 4.0;
-    const float pi = PI;
-    const float near_zero = 0.00000000000000001;
-    const __m256 zero_ps = _mm256_setzero_ps();
-    const __m256i zero_epi32 = _mm256_setzero_si256();
-    const __m256i one_epi32 = _mm256_set1_epi32(1);
-    const __m256i two_epi32 = _mm256_set1_epi32(2);
-
-    const int cmp_le = _CMP_LE_OQ;
-    const int cmp_gt = _CMP_GT_OQ;
-
-    __m256 m_a_ps = _mm256_setr_ps (GTWG[0][0], GTWG[1][0], GTWG[2][0], GTWG[3][0],
-                                    GTWG[4][0], GTWG[5][0], GTWG[6][0], GTWG[7][0]);
-    __m256 m_b_ps = _mm256_setr_ps (GTWG[0][1], GTWG[1][1], GTWG[2][1], GTWG[3][1],
-                                    GTWG[4][1], GTWG[5][1], GTWG[6][1], GTWG[7][1]);
-    __m256 m_d_ps = _mm256_setr_ps (GTWG[0][3], GTWG[1][3], GTWG[2][3], GTWG[3][3],
-                                    GTWG[4][3], GTWG[5][3], GTWG[6][3], GTWG[7][3]);
-    __m256 T_ps = _mm256_add_ps(m_a_ps, m_d_ps);
-    __m256 D_ps = _mm256_sub_ps( _mm256_mul_ps( m_a_ps, m_d_ps),
-                                 _mm256_mul_ps( m_b_ps, m_b_ps));
-
-    __m256 sqr_ps = _mm256_sqrt_ps( _mm256_sub_ps( _mm256_div_ps ( _mm256_mul_ps(T_ps, T_ps),
-                                    _mm256_broadcast_ss(&four)), D_ps));
-
-    __m256 half_T_ps = _mm256_div_ps ( T_ps, _mm256_broadcast_ss(&two) );
-    __m256 L1_ps = _mm256_add_ps( half_T_ps, sqr_ps);
-    __m256 L2_ps = _mm256_sub_ps( half_T_ps, sqr_ps);
-
-    __m256 angle_ps = zero_ps;
-
-    __m256 blend_ps = _mm256_blendv_ps( _mm256_broadcast_ss(&one), _mm256_sub_ps(L1_ps, m_d_ps),
-                                        _mm256_cmp_ps(m_b_ps, zero_ps, _CMP_NEQ_OQ) );
-
-#ifdef USE_ATAN2_APPROX
-    angle_ps = atan2Approximation_AVX256_32f( m_b_ps, blend_ps);
-#else
-    angle_ps = _mm256_atan2_ps( m_b_ps, blend_ps);
-#endif
-
-    angle_ps = _mm256_add_ps ( angle_ps, _mm256_blendv_ps( zero_ps, _mm256_broadcast_ss(&pi),
-                               _mm256_cmp_ps(angle_ps, zero_ps, _CMP_LT_OQ) ) );
-
-    __m256 sqrtL1_ps = _mm256_sqrt_ps( L1_ps );
-    __m256 sqrtL2_ps = _mm256_sqrt_ps( L2_ps );
-    __m256 coherence_ps = _mm256_div_ps( _mm256_sub_ps( sqrtL1_ps, sqrtL2_ps ),
-                                         _mm256_add_ps( _mm256_add_ps(sqrtL1_ps, sqrtL2_ps), _mm256_broadcast_ss(&near_zero) ) );
-    __m256 strength_ps = L1_ps;
-
-    __m256i angleIdx_epi32 = _mm256_cvtps_epi32( _mm256_floor_ps(_mm256_mul_ps (angle_ps, _mm256_broadcast_ss(&gQAngle))));
-
-    angleIdx_epi32 = _mm256_min_epi32( _mm256_sub_epi32( _mm256_set1_epi32(gQuantizationAngle), _mm256_set1_epi32(1)),
-                                       _mm256_max_epi32(angleIdx_epi32, zero_epi32 ) );
-
-    // AFAIK, today QStr & QCoh are vectors of size 2. I think searchsorted can return an index of 0,1, or 2
-    float *gQStr_data, *gQCoh_data;
-    if (passIdx == 0) gQStr_data = gQStr.data(); else gQStr_data = gQStr2.data();
-    if (passIdx == 0) gQCoh_data = gQCoh.data(); else gQCoh_data = gQCoh2.data();
-    __m256 gQStr1_ps = _mm256_broadcast_ss(gQStr_data);
-    __m256 gQStr2_ps = _mm256_broadcast_ss(gQStr_data + 1);
-    __m256 gQCoh1_ps = _mm256_broadcast_ss(gQCoh_data);
-    __m256 gQCoh2_ps = _mm256_broadcast_ss(gQCoh_data + 1);
-
-    __m256i strengthIdx_epi32 = _mm256_sub_epi32(two_epi32,
-        _mm256_add_epi32(
-            _mm256_and_si256(one_epi32, _mm256_castps_si256( _mm256_cmp_ps(strength_ps, gQStr1_ps, cmp_le))),
-            _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(strength_ps, gQStr2_ps, cmp_le)))));
-    __m256i coherenceIdx_epi32 = _mm256_sub_epi32(two_epi32,
-        _mm256_add_epi32(
-            _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(coherence_ps, gQCoh1_ps, cmp_le))),
-            _mm256_and_si256(one_epi32, _mm256_castps_si256(_mm256_cmp_ps(coherence_ps, gQCoh2_ps, cmp_le)))));
-
-    const __m256i gQuantizationCoherence_epi32 = _mm256_set1_epi32(gQuantizationCoherence);
-    __m256i idx_epi32 = _mm256_mullo_epi32(gQuantizationCoherence_epi32,
-                                           _mm256_mullo_epi32(angleIdx_epi32, _mm256_set1_epi32(gQuantizationStrength)));
-    idx_epi32 = _mm256_add_epi32(coherenceIdx_epi32,
-                                 _mm256_add_epi32(idx_epi32, _mm256_mullo_epi32(strengthIdx_epi32, gQuantizationCoherence_epi32)));
-    _mm256_storeu_si256((__m256i *)idx, idx_epi32);
-}
-
 inline int int_floor(float x)
 {
     int i = (int)x; /* truncate */