Skip to content

Commit c62f762

Browse files
cabirdmexiaoxial
authored andcommitted
restructured raisr code into separate files in preparation for deliver of more avx2 files
1 parent 252d33d commit c62f762

6 files changed

Lines changed: 509 additions & 438 deletions

File tree

Library/Raisr.cpp

Lines changed: 7 additions & 438 deletions
Large diffs are not rendered by default.

Library/Raisr_AVX256.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/**
2+
* Intel Library for Video Super Resolution
3+
*
4+
* Copyright (c) 2022 Intel Corporation
5+
* All rights reserved.
6+
*/
7+
#include "Raisr_globals.h"
8+
#include "Raisr_AVX256.h"
9+
#include <immintrin.h>
10+
11+
12+
13+
inline __m256i compare3x3_ps(__m256 a, __m256 b, __m256i highbit_epi32)
14+
{
15+
// compare if neighbors < centerpixel, toggle bit in mask if true
16+
// when cmp_ps is true, it returns 0x7fffff (-nan). When we convert that to int, it is 0x8000 0000
17+
18+
return _mm256_srli_epi32(_mm256_and_si256(_mm256_cvtps_epi32(
19+
_mm256_cmp_ps(a, b, _CMP_LT_OS)),
20+
highbit_epi32),
21+
31); // shift right by 31 such that the high bit (if set) moves to the low bit
22+
}
23+
24+
inline int sumitup_256_epi32(__m256i acc)
25+
{
26+
const __m128i r4 = _mm_add_epi32(_mm256_castsi256_si128(acc), _mm256_extractf128_si256(acc, 1));
27+
const __m128i r2 = _mm_hadd_epi32(r4, r4);
28+
const __m128i r1 = _mm_hadd_epi32(r2, r2);
29+
return _mm_cvtsi128_si32(r1);
30+
}
31+
32+
int inline CTRandomness_AVX256_32f(float *inYUpscaled32f, int cols, int r, int c, int pix)
33+
{
34+
int census_count = 0;
35+
36+
__m128 zero_f = _mm_setzero_ps();
37+
__m256 row_f, center_f;
38+
39+
load3x3_ps(inYUpscaled32f, c + pix, r, cols, &row_f, &center_f);
40+
41+
// compare if neighbors < centerpixel, toggle bit in mask if true
42+
int highbit = 0x80000000;
43+
const __m256i highbit_epi32 = _mm256_setr_epi32(highbit, highbit, highbit, highbit, highbit, highbit, highbit, highbit);
44+
45+
__m256i cmp_epi32 = compare3x3_ps(row_f, center_f, highbit_epi32);
46+
47+
// count # of bits in mask
48+
census_count += sumitup_256_epi32(cmp_epi32);
49+
50+
return census_count;
51+
}
52+
53+
inline float sumitup_ps_256(__m256 acc)
54+
{
55+
const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
56+
const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
57+
const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
58+
return _mm_cvtss_f32(r1);
59+
}

Library/Raisr_AVX256.h

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/**
2+
* Intel Library for Video Super Resolution
3+
*
4+
* Copyright (c) 2022 Intel Corporation
5+
* All rights reserved.
6+
*/
7+
#pragma once
8+
#include <immintrin.h>
9+
10+
inline void load3x3_ps(float *img, unsigned int width, unsigned int height, unsigned int stride, __m256 *out_8neighbors_ps, __m256 *out_center_ps)
11+
{
12+
__m128i mask_3pixels = _mm_setr_epi32(-1, -1, -1, 0);
13+
int index = (height - 1) * stride + (width - 1);
14+
// load 3x3 grid for lr image, including center pixel plus 8 neighbors
15+
__m128 row1_f = _mm_maskload_ps(img + index, mask_3pixels);
16+
index += stride;
17+
__m128 row2_f = _mm_maskload_ps(img + index, mask_3pixels);
18+
index += stride;
19+
__m128 row3_f = _mm_maskload_ps(img + index, mask_3pixels);
20+
21+
*out_center_ps = _mm256_broadcastss_ps(_mm_insert_ps(row2_f, row2_f, 0x40));
22+
// load 8 neighbors (32bit floats) into 256 reg from lr image
23+
__m128 rowlo_f = _mm_insert_ps(row1_f, row2_f, 0x30);
24+
__m128 rowhi_f = _mm_insert_ps(row3_f, row2_f, 0xB0);
25+
*out_8neighbors_ps = _mm256_insertf128_ps(_mm256_castps128_ps256(rowlo_f), rowhi_f, 1);
26+
}
27+
28+
29+
inline __m256i compare3x3_ps(__m256 a, __m256 b, __m256i highbit_epi32);
30+
inline int sumitup_256_epi32(__m256i acc);
31+
int inline CTRandomness_AVX256_32f(float *inYUpscaled32f, int cols, int r, int c, int pix);
32+
inline float sumitup_ps_256(__m256 acc);

Library/Raisr_AVX512.cpp

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
/**
2+
* Intel Library for Video Super Resolution
3+
*
4+
* Copyright (c) 2022 Intel Corporation
5+
* All rights reserved.
6+
*/
7+
#include "Raisr_globals.h"
8+
#include "Raisr_AVX512.h"
9+
#include <immintrin.h>
10+
#include <popcntintrin.h>
11+
12+
inline __mmask8 compare3x3_ps_AVX512(__m256 a, __m256 b)
13+
{
14+
return _mm256_cmp_ps_mask(a, b, _CMP_LT_OS);
15+
}
16+
17+
int inline CTRandomness_AVX512_32f(float *inYUpscaled32f, int cols, int r, int c, int pix)
18+
{
19+
int census_count = 0;
20+
21+
__m128 zero_f = _mm_setzero_ps();
22+
__m256 row_f, center_f;
23+
24+
load3x3_ps(inYUpscaled32f, c + pix, r, cols, &row_f, &center_f);
25+
26+
// compare if neighbors < centerpixel, toggle bit in mask if true
27+
__mmask8 cmp_m8 = compare3x3_ps_AVX512(row_f, center_f);
28+
29+
// count # of bits in mask
30+
census_count += _mm_popcnt_u32(cmp_m8);
31+
32+
return census_count;
33+
}
34+
35+
inline float sumitup_ps_512(__m512 acc)
36+
{
37+
const __m256 r8 = _mm256_add_ps(_mm512_castps512_ps256(acc), _mm512_extractf32x8_ps(acc, 1));
38+
const __m128 r4 = _mm_add_ps(_mm256_castps256_ps128(r8), _mm256_extractf128_ps(r8, 1));
39+
const __m128 r2 = _mm_add_ps(r4, _mm_movehl_ps(r4, r4));
40+
const __m128 r1 = _mm_add_ss(r2, _mm_movehdup_ps(r2));
41+
return _mm_cvtss_f32(r1);
42+
}
43+
inline __m512 shiftL(__m512 r)
44+
{
45+
return _mm512_permutexvar_ps(_mm512_set_epi32(0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1), r);
46+
}
47+
inline __m512 shiftR(__m512 r)
48+
{
49+
return _mm512_permutexvar_ps(_mm512_set_epi32(14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15), r);
50+
}
51+
52+
inline __m512 GetGx(__m512 r1, __m512 r3)
53+
{
54+
return _mm512_sub_ps(r3, r1);
55+
}
56+
57+
inline __m512 GetGy(__m512 r2)
58+
{
59+
return _mm512_sub_ps(shiftL(r2), shiftR(r2));
60+
}
61+
62+
inline __m512 GetGTWG(__m512 acc, __m512 a, __m512 w, __m512 b)
63+
{
64+
return _mm512_fmadd_ps(_mm512_mul_ps(a, w), b, acc);
65+
}
66+
67+
void inline computeGTWG_Segment_AVX512_32f(const float *img, const int nrows, const int ncols, const int r, const int col, float GTWG[][4], float *buf1, float *buf2)
68+
{
69+
// offset is the starting position(top left) of the block which centered by (r, c)
70+
int offset = (r - gLoopMargin) * ncols + col - gLoopMargin;
71+
const float *p1 = img + offset;
72+
73+
__m512 gtwg0A = _mm512_setzero_ps(), gtwg1A = _mm512_setzero_ps(), gtwg3A = _mm512_setzero_ps();
74+
__m512 gtwg0B = _mm512_setzero_ps(), gtwg1B = _mm512_setzero_ps(), gtwg3B = _mm512_setzero_ps();
75+
76+
// load 2 rows
77+
__m512 a = _mm512_loadu_ps(p1);
78+
p1 += ncols;
79+
__m512 b = _mm512_loadu_ps(p1);
80+
#pragma unroll
81+
for (int i = 0; i < gPatchSize; i++)
82+
{
83+
// memcpy(buf1+gPatchSize*i, p1+1, sizeof(float)*gPatchSize);
84+
// memcpy(buf2+gPatchSize*i, p1+2, sizeof(float)*gPatchSize);
85+
86+
// process patchSize rows
87+
// load next row
88+
p1 += ncols;
89+
__m512 c = _mm512_loadu_ps(p1);
90+
__m512 w;
91+
if (gBitDepth == 8)
92+
{
93+
w = _mm512_loadu_ps(gGaussian2D8bit[i]);
94+
}
95+
else if (gBitDepth == 10)
96+
{
97+
w = _mm512_loadu_ps(gGaussian2D10bit[i]);
98+
}
99+
else
100+
{
101+
w = _mm512_loadu_ps(gGaussian2D16bit[i]);
102+
}
103+
104+
const __m512 gxi = GetGx(a, c);
105+
const __m512 gyi = GetGy(b);
106+
107+
gtwg0A = GetGTWG(gtwg0A, gxi, w, gxi);
108+
gtwg1A = GetGTWG(gtwg1A, gxi, w, gyi);
109+
gtwg3A = GetGTWG(gtwg3A, gyi, w, gyi);
110+
111+
w = shiftR(w);
112+
gtwg0B = GetGTWG(gtwg0B, gxi, w, gxi);
113+
gtwg1B = GetGTWG(gtwg1B, gxi, w, gyi);
114+
gtwg3B = GetGTWG(gtwg3B, gyi, w, gyi);
115+
116+
_mm512_mask_storeu_ps(buf1 + gPatchSize * i - 1, 0x0ffe, b);
117+
_mm512_mask_storeu_ps(buf2 + gPatchSize * i - 2, 0x1ffc, b);
118+
a = b;
119+
b = c;
120+
}
121+
GTWG[0][0] = sumitup_ps_512(gtwg0A);
122+
GTWG[0][1] = sumitup_ps_512(gtwg1A);
123+
GTWG[0][3] = sumitup_ps_512(gtwg3A);
124+
GTWG[0][2] = GTWG[0][1];
125+
126+
GTWG[1][0] = sumitup_ps_512(gtwg0B);
127+
GTWG[1][1] = sumitup_ps_512(gtwg1B);
128+
GTWG[1][3] = sumitup_ps_512(gtwg3B);
129+
GTWG[1][2] = GTWG[1][1];
130+
131+
return;
132+
}
133+
134+
// AVX512 version: for now, gPatchSize must be <= 16 because we can work with up to 16 float32s in one AVX512 register.
135+
float inline DotProdPatch_AVX512_32f(const float *buf, const float *filter)
136+
{
137+
__m512 a_ps = _mm512_load_ps(buf);
138+
__m512 b_ps = _mm512_load_ps(filter);
139+
__m512 sum = _mm512_mul_ps(a_ps, b_ps);
140+
#pragma unroll
141+
for (int i = 1; i < 8; i++)
142+
{
143+
a_ps = _mm512_load_ps(buf + i * 16);
144+
b_ps = _mm512_load_ps(filter + i * 16);
145+
// compute dot prod using fmadd
146+
sum = _mm512_fmadd_ps(a_ps, b_ps, sum);
147+
}
148+
// sumitup adds all 16 float values in sum(zmm) and returns a single float value
149+
return sumitup_ps_512(sum);
150+
}

Library/Raisr_AVX512.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
/**
2+
* Intel Library for Video Super Resolution
3+
*
4+
* Copyright (c) 2022 Intel Corporation
5+
* All rights reserved.
6+
*/
7+
#pragma once
8+
#include <immintrin.h>
9+
10+
inline __mmask8 compare3x3_ps_AVX512(__m256 a, __m256 b);
11+
inline float sumitup_ps_512(__m512 acc);
12+
inline __m512 shiftL(__m512 r);
13+
inline __m512 shiftR(__m512 r);
14+
inline __m512 GetGx(__m512 r1, __m512 r3);
15+
inline __m512 GetGy(__m512 r2);
16+
inline __m512 GetGTWG(__m512 acc, __m512 a, __m512 w, __m512 b);
17+
18+
void inline computeGTWG_Segment_AVX512_32f(const float *img, const int nrows, const int ncols, const int r, const int col, float GTWG[][4], float *buf1, float *buf2);
19+
int inline CTRandomness_AVX512_32f(float *inYUpscaled32f, int cols, int r, int c, int pix);
20+
float inline DotProdPatch_AVX512_32f(const float *buf, const float *filter);

0 commit comments

Comments
 (0)