Skip to content

Commit c918906

Browse files
cabirdmexiaoxial
authored and committed
Added bug back in. Eliminates 5 of the 6 GetGTWG calls to improve perf by ~40%. Impact on VQ appears minimal.
1 parent b74150f commit c918906

1 file changed

Lines changed: 25 additions & 8 deletions

File tree

Library/Raisr_AVX512FP16.cpp

Lines changed: 25 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -150,6 +150,7 @@ void computeGTWG_Segment_AVX512FP16_16f(const _Float16 *img, const int nrows, co
150150
// process patchSize rows
151151
// load next row
152152
p1 += ncols;
153+
153154
__m512h c = _mm512_zextph256_ph512(_mm256_loadu_ph(p1));
154155
c = _mm512_castps_ph(_mm512_insertf32x8(_mm512_castph_ps(c), _mm256_castph_ps(_mm512_castph512_ph256(c)), 1)); // duplicate high & low to compute GTWG for 2 pixels
155156
__m512h w = _mm512_loadu_ph(gGaussian2DOriginal_fp16_doubled_w1w3[i]); // pixels 1,3
@@ -158,14 +159,15 @@ void computeGTWG_Segment_AVX512FP16_16f(const _Float16 *img, const int nrows, co
158159
const __m512h gyi = GetGy_AVX512FP16(b);
159160

160161
gtwg0A = GetGTWG_AVX512FP16(gtwg0A, gxi, w, gxi);
161-
gtwg1A = GetGTWG_AVX512FP16(gtwg1A, gxi, w, gyi);
162-
gtwg3A = GetGTWG_AVX512FP16(gtwg3A, gyi, w, gyi);
163-
162+
// gtwg1A = GetGTWG_AVX512FP16(gtwg1A, gxi, w, gyi);
163+
// gtwg3A = GetGTWG_AVX512FP16(gtwg3A, gyi, w, gyi);
164+
/*
164165
w = shiftR_AVX512FP16(w); // pixels 2,4
165166
166167
gtwg0B = GetGTWG_AVX512FP16(gtwg0B, gxi, w, gxi);
167168
gtwg1B = GetGTWG_AVX512FP16(gtwg1B, gxi, w, gyi);
168169
gtwg3B = GetGTWG_AVX512FP16(gtwg3B, gyi, w, gyi);
170+
*/
169171

170172
_mm256_mask_storeu_epi16(buf1 + gPatchSize * i - 1, 0x0ffe,_mm256_castph_si256(_mm512_castph512_ph256(b)));
171173
_mm256_mask_storeu_epi16(buf2 + gPatchSize * i - 2, 0x1ffc,_mm256_castph_si256(_mm512_castph512_ph256(b)));
@@ -180,30 +182,45 @@ void computeGTWG_Segment_AVX512FP16_16f(const _Float16 *img, const int nrows, co
180182
int gtwgIdxB = gtwgIdx+2;
181183
sumitup2lane_AVX512FP16_16f(gtwg0A, &GTWG[0][gtwgIdxA], &GTWG[0][gtwgIdxB]);
182184
GTWG[0][gtwgIdxA] *= normal;
185+
GTWG[1][gtwgIdxA] = GTWG[0][gtwgIdxA];
186+
GTWG[2][gtwgIdxA] = GTWG[0][gtwgIdxA];
183187
GTWG[0][gtwgIdxB] *= normal;
188+
GTWG[1][gtwgIdxB] = GTWG[0][gtwgIdxB];
189+
GTWG[2][gtwgIdxB] = GTWG[0][gtwgIdxB];
190+
184191
// gtwg1 for pixels 0,2
192+
/*
185193
sumitup2lane_AVX512FP16_16f(gtwg1A, &GTWG[1][gtwgIdxA], &GTWG[1][gtwgIdxB]);
186194
GTWG[1][gtwgIdxA] *= normal;
187195
GTWG[1][gtwgIdxB] *= normal;
188196
// gtwg3 for pixels 0,2
189197
sumitup2lane_AVX512FP16_16f(gtwg3A, &GTWG[2][gtwgIdxA], &GTWG[2][gtwgIdxB]);
190198
GTWG[2][gtwgIdxA] *= normal;
191199
GTWG[2][gtwgIdxB] *= normal;
200+
*/
192201

193-
gtwgIdxA = gtwgIdx+1;
194-
gtwgIdxB = gtwgIdx+3;
202+
int gtwgIdxA2 = gtwgIdx+1;
203+
int gtwgIdxB2 = gtwgIdx+3;
195204
// gtwg0 for pixels 1,3
196-
sumitup2lane_AVX512FP16_16f(gtwg0B, &GTWG[0][gtwgIdxA], &GTWG[0][gtwgIdxB]);
205+
/*
206+
sumitup2lane_AVX512FP16_16f(gtwg0A, &GTWG[0][gtwgIdxA], &GTWG[0][gtwgIdxB]);
197207
GTWG[0][gtwgIdxA] *= normal;
198208
GTWG[0][gtwgIdxB] *= normal;
199209
// gtwg1 for pixels 1,3
200-
sumitup2lane_AVX512FP16_16f(gtwg1B, &GTWG[1][gtwgIdxA], &GTWG[1][gtwgIdxB]);
210+
sumitup2lane_AVX512FP16_16f(gtwg1A, &GTWG[1][gtwgIdxA], &GTWG[1][gtwgIdxB]);
201211
GTWG[1][gtwgIdxA] *= normal;
202212
GTWG[1][gtwgIdxB] *= normal;
203213
// gtwg3 for pixels 1,3
204-
sumitup2lane_AVX512FP16_16f(gtwg3B, &GTWG[2][gtwgIdxA], &GTWG[2][gtwgIdxB]);
214+
sumitup2lane_AVX512FP16_16f(gtwg3A, &GTWG[2][gtwgIdxA], &GTWG[2][gtwgIdxB]);
205215
GTWG[2][gtwgIdxA] *= normal;
206216
GTWG[2][gtwgIdxB] *= normal;
217+
*/
218+
GTWG[0][gtwgIdxA2] = GTWG[0][gtwgIdxA];
219+
GTWG[1][gtwgIdxA2] = GTWG[1][gtwgIdxA];
220+
GTWG[2][gtwgIdxA2] = GTWG[2][gtwgIdxA];
221+
GTWG[0][gtwgIdxB2] = GTWG[0][gtwgIdxB];
222+
GTWG[1][gtwgIdxB2] = GTWG[1][gtwgIdxB];
223+
GTWG[2][gtwgIdxB2] = GTWG[2][gtwgIdxB];
207224

208225
return;
209226
}

0 commit comments

Comments
 (0)