Skip to content

Commit 6968098

Browse files
cabirdme authored and xiaoxial committed
Revert "added bug back in. eliminates 5/6 GetGTWGs to improve perf by ~40%. Impact to VQ appears minimal."
This reverts commit d9ebf4e4fc7ccd95052061aa1ec74fa14d34e41f.
1 parent c918906 commit 6968098

1 file changed

Lines changed: 8 additions & 25 deletions

File tree

Library/Raisr_AVX512FP16.cpp

Lines changed: 8 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -150,7 +150,6 @@ void computeGTWG_Segment_AVX512FP16_16f(const _Float16 *img, const int nrows, co
150150
// process patchSize rows
151151
// load next row
152152
p1 += ncols;
153-
154153
__m512h c = _mm512_zextph256_ph512(_mm256_loadu_ph(p1));
155154
c = _mm512_castps_ph(_mm512_insertf32x8(_mm512_castph_ps(c), _mm256_castph_ps(_mm512_castph512_ph256(c)), 1)); // duplicate high & low to compute GTWG for 2 pixels
156155
__m512h w = _mm512_loadu_ph(gGaussian2DOriginal_fp16_doubled_w1w3[i]); // pixels 1,3
@@ -159,15 +158,14 @@ void computeGTWG_Segment_AVX512FP16_16f(const _Float16 *img, const int nrows, co
159158
const __m512h gyi = GetGy_AVX512FP16(b);
160159

161160
gtwg0A = GetGTWG_AVX512FP16(gtwg0A, gxi, w, gxi);
162-
// gtwg1A = GetGTWG_AVX512FP16(gtwg1A, gxi, w, gyi);
163-
// gtwg3A = GetGTWG_AVX512FP16(gtwg3A, gyi, w, gyi);
164-
/*
161+
gtwg1A = GetGTWG_AVX512FP16(gtwg1A, gxi, w, gyi);
162+
gtwg3A = GetGTWG_AVX512FP16(gtwg3A, gyi, w, gyi);
163+
165164
w = shiftR_AVX512FP16(w); // pixels 2,4
166165

167166
gtwg0B = GetGTWG_AVX512FP16(gtwg0B, gxi, w, gxi);
168167
gtwg1B = GetGTWG_AVX512FP16(gtwg1B, gxi, w, gyi);
169168
gtwg3B = GetGTWG_AVX512FP16(gtwg3B, gyi, w, gyi);
170-
*/
171169

172170
_mm256_mask_storeu_epi16(buf1 + gPatchSize * i - 1, 0x0ffe,_mm256_castph_si256(_mm512_castph512_ph256(b)));
173171
_mm256_mask_storeu_epi16(buf2 + gPatchSize * i - 2, 0x1ffc,_mm256_castph_si256(_mm512_castph512_ph256(b)));
@@ -182,45 +180,30 @@ void computeGTWG_Segment_AVX512FP16_16f(const _Float16 *img, const int nrows, co
182180
int gtwgIdxB = gtwgIdx+2;
183181
sumitup2lane_AVX512FP16_16f(gtwg0A, &GTWG[0][gtwgIdxA], &GTWG[0][gtwgIdxB]);
184182
GTWG[0][gtwgIdxA] *= normal;
185-
GTWG[1][gtwgIdxA] = GTWG[0][gtwgIdxA];
186-
GTWG[2][gtwgIdxA] = GTWG[0][gtwgIdxA];
187183
GTWG[0][gtwgIdxB] *= normal;
188-
GTWG[1][gtwgIdxB] = GTWG[0][gtwgIdxB];
189-
GTWG[2][gtwgIdxB] = GTWG[0][gtwgIdxB];
190-
191184
// gtwg1 for pixels 0,2
192-
/*
193185
sumitup2lane_AVX512FP16_16f(gtwg1A, &GTWG[1][gtwgIdxA], &GTWG[1][gtwgIdxB]);
194186
GTWG[1][gtwgIdxA] *= normal;
195187
GTWG[1][gtwgIdxB] *= normal;
196188
// gtwg3 for pixels 0,2
197189
sumitup2lane_AVX512FP16_16f(gtwg3A, &GTWG[2][gtwgIdxA], &GTWG[2][gtwgIdxB]);
198190
GTWG[2][gtwgIdxA] *= normal;
199191
GTWG[2][gtwgIdxB] *= normal;
200-
*/
201192

202-
int gtwgIdxA2 = gtwgIdx+1;
203-
int gtwgIdxB2 = gtwgIdx+3;
193+
gtwgIdxA = gtwgIdx+1;
194+
gtwgIdxB = gtwgIdx+3;
204195
// gtwg0 for pixels 1,3
205-
/*
206-
sumitup2lane_AVX512FP16_16f(gtwg0A, &GTWG[0][gtwgIdxA], &GTWG[0][gtwgIdxB]);
196+
sumitup2lane_AVX512FP16_16f(gtwg0B, &GTWG[0][gtwgIdxA], &GTWG[0][gtwgIdxB]);
207197
GTWG[0][gtwgIdxA] *= normal;
208198
GTWG[0][gtwgIdxB] *= normal;
209199
// gtwg1 for pixels 1,3
210-
sumitup2lane_AVX512FP16_16f(gtwg1A, &GTWG[1][gtwgIdxA], &GTWG[1][gtwgIdxB]);
200+
sumitup2lane_AVX512FP16_16f(gtwg1B, &GTWG[1][gtwgIdxA], &GTWG[1][gtwgIdxB]);
211201
GTWG[1][gtwgIdxA] *= normal;
212202
GTWG[1][gtwgIdxB] *= normal;
213203
// gtwg3 for pixels 1,3
214-
sumitup2lane_AVX512FP16_16f(gtwg3A, &GTWG[2][gtwgIdxA], &GTWG[2][gtwgIdxB]);
204+
sumitup2lane_AVX512FP16_16f(gtwg3B, &GTWG[2][gtwgIdxA], &GTWG[2][gtwgIdxB]);
215205
GTWG[2][gtwgIdxA] *= normal;
216206
GTWG[2][gtwgIdxB] *= normal;
217-
*/
218-
GTWG[0][gtwgIdxA2] = GTWG[0][gtwgIdxA];
219-
GTWG[1][gtwgIdxA2] = GTWG[1][gtwgIdxA];
220-
GTWG[2][gtwgIdxA2] = GTWG[2][gtwgIdxA];
221-
GTWG[0][gtwgIdxB2] = GTWG[0][gtwgIdxB];
222-
GTWG[1][gtwgIdxB2] = GTWG[1][gtwgIdxB];
223-
GTWG[2][gtwgIdxB2] = GTWG[2][gtwgIdxB];
224207

225208
return;
226209
}

0 commit comments

Comments
 (0)