@@ -809,7 +809,8 @@ static void CTCountOfBitsChanged_AVX2(float *LRImage, float *HRImage, float *out
809809// LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.
810810// rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.
811811// cols: stride for buffers in DT type.
812- static void CTCountOfBitsChangedSegment_AVX2 (float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int , int > blendingZone, unsigned char *outImage, const int cols)
812+ // outImageCols: stride for outImage buffer
813+ static void CTCountOfBitsChangedSegment_AVX2 (float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int , int > blendingZone, unsigned char *outImage, const int cols, const int outImageCols)
813814{
814815 int rowStartOffset = blendingZone.first - startRow;
815816 int rowEndOffset = blendingZone.second - startRow;
@@ -841,12 +842,12 @@ static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, con
841842 // convert 32f to 8bit/10bit
842843 if (gBitDepth == 8 )
843844 {
844- outImage[(startRow + r) * cols + c] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
845+ outImage[(startRow + r) * outImageCols + c] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
845846 }
846847 else
847848 {
848849 unsigned short *out = (unsigned short *)outImage;
849- out[(startRow + r) * cols + c] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
850+ out[(startRow + r) * outImageCols + c] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
850851 }
851852 }
852853 }
@@ -855,8 +856,9 @@ static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, con
855856// LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.
856857// rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.
857858// cols: stride for buffers in DT type.
859+ // outImageCols: stride for outImage buffer
858860template <typename DT>
859- static void CTCountOfBitsChangedSegment (DT *LRImage, DT *HRImage, const int rows, const int startRow, const std::pair<int , int > blendingZone, unsigned char *outImage, const int cols)
861+ static void CTCountOfBitsChangedSegment (DT *LRImage, DT *HRImage, const int rows, const int startRow, const std::pair<int , int > blendingZone, unsigned char *outImage, const int cols, const int outImageCols )
860862{
861863 // run census transform on a CTwindowSize * CTwindowSize block centered by [r, c]
862864 int rowStartOffset = blendingZone.first - startRow;
@@ -883,12 +885,12 @@ static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows
883885 // convert 32f to 8bit/10bit
884886 if (gBitDepth == 8 )
885887 {
886- outImage[(startRow + r) * cols + c] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
888+ outImage[(startRow + r) * outImageCols + c] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
887889 }
888890 else
889891 {
890892 unsigned short *out = (unsigned short *)outImage;
891- out[(startRow + r) * cols + c] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
893+ out[(startRow + r) * outImageCols + c] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
892894 }
893895 }
894896 }
@@ -1224,9 +1226,12 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
12241226 outY = gIntermediateY ; // change the output to VideoDataType gIntermediateY to save the output of 1st pass
12251227 }
12261228
1229+ // step means the line size of a frame. For video, the line size should be a multiple of the CPU alignment (16 or 32 bytes),
1230+ // so outY->step may be greater than or equal to outY->width.
1231+ // The step of gIppCtx.segZones[passIdx][threadIdx].inYUpscaled is equal to outY->width.
12271232 const int rows = outY->height ;
12281233 const int cols = outY->width ;
1229- const int step = outY->step ;
1234+ const int step = outY->width ;
12301235
12311236 // 1. Prepare cheap up-scaled 32f data
12321237 IppStatus status = ippStsNoErr;
@@ -1267,8 +1272,8 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
12671272 ippiConvert_8u32f_C1R (pDst, cols,
12681273 pSeg32f, cols * sizeof (float ), {(int )cols, segRows});
12691274 else
1270- ippiConvert_16u32f_C1R ((Ipp16u *)pDst, step ,
1271- pSeg32f, cols * sizeof (float ), {(int )cols, segRows});
1275+ ippiConvert_16u32f_C1R ((Ipp16u *)pDst, cols ,
1276+ pSeg32f, cols * sizeof (float ), {(int )cols, segRows});
12721277
12731278 // 2. Run hashing
12741279 // Update startRow, endRow for hashing algo
@@ -1278,13 +1283,33 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
12781283 // Handle top and bottom borders
12791284 if (startRow == 0 )
12801285 {
1281- memcpy (outY->pData , pDst, step * gLoopMargin + gLoopMargin );
1286+ // The memcpy must be done line by line when the line size of outY->pData is not equal to pDst's line size.
1287+ if (step == outY->step ) {
1288+ memcpy (outY->pData , pDst, outY->step * gLoopMargin + gLoopMargin );
1289+ } else {
1290+ for (int i = 0 ; i < gLoopMargin ; i++) {
1291+ memcpy (outY->pData + i * outY->step , pDst + i * step, step);
1292+ }
1293+ memcpy (outY->pData + gLoopMargin * outY->step , pDst + gLoopMargin * step, gLoopMargin );
1294+ }
12821295 }
12831296 if (endRow == rows)
12841297 {
1285- memcpy (outY->pData + (rows - gLoopMargin ) * step - gLoopMargin ,
1286- pDst + (segRows - gLoopMargin ) * step - gLoopMargin ,
1287- step * gLoopMargin + gLoopMargin );
1298+ if (step == outY->step ) {
1299+ memcpy (outY->pData + (rows - gLoopMargin ) * step - gLoopMargin ,
1300+ pDst + (segRows - gLoopMargin ) * step - gLoopMargin ,
1301+ outY->step * gLoopMargin + gLoopMargin );
1302+ } else {
1303+ memcpy (outY->pData + (rows - gLoopMargin - 1 ) * outY->step + outY->width - gLoopMargin ,
1304+ pDst + (segRows - gLoopMargin ) * step - gLoopMargin ,
1305+ gLoopMargin );
1306+
1307+ for (int i = gLoopMargin ; i > 0 ; i--) {
1308+ memcpy (outY->pData + (rows - i) * outY->step ,
1309+ pDst + (segRows - i) * step,
1310+ step);
1311+ }
1312+ }
12881313 }
12891314 memcpy (pRaisr32f, pSeg32f, sizeof (float ) * cols * segRows);
12901315
@@ -1353,27 +1378,36 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
13531378 val += 0.5 ; // to round the value
13541379 if (gBitDepth == 8 )
13551380 {
1356- outY->pData [r * cols + c + pix] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
1381+ outY->pData [r * outY-> step + c + pix] = (unsigned char )(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
13571382 }
13581383 else
13591384 {
13601385 unsigned short *out = (unsigned short *)outY->pData ;
1361- out[r * cols + c + pix] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
1386+ out[r * outY-> step + c + pix] = (unsigned short )(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
13621387 }
13631388 }
13641389 }
13651390 }
13661391 }
13671392 // Copy right border pixels for this row and left border pixels for next row
1368- memcpy (outY->pData + r * step - gLoopMargin , pDst + rOffset * step - gLoopMargin , 2 * gLoopMargin );
1393+ if (step == outY->step ) {
1394+ memcpy (outY->pData + r * step - gLoopMargin , pDst + rOffset * step - gLoopMargin , 2 * gLoopMargin );
1395+ } else {
1396+ memcpy (outY->pData + (r -1 ) * outY->step + outY->width - gLoopMargin ,
1397+ pDst + rOffset * step - gLoopMargin ,
1398+ gLoopMargin );
1399+ memcpy (outY->pData + r * outY->step ,
1400+ pDst + rOffset * step,
1401+ gLoopMargin );
1402+ }
13691403 }
13701404 // 3. Run CT-Blending
13711405 if (blendingMode == CountOfBitsChanged)
13721406 {
13731407 int segStart = gIppCtx .segZones [passIdx][threadIdx].scaleStartRow ;
1374- CTCountOfBitsChangedSegment<float >(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx .segZones [passIdx][threadIdx].blendingStartRow , gIppCtx .segZones [passIdx][threadIdx].blendingEndRow }, outY->pData , cols);
1408+ CTCountOfBitsChangedSegment<float >(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx .segZones [passIdx][threadIdx].blendingStartRow , gIppCtx .segZones [passIdx][threadIdx].blendingEndRow }, outY->pData , cols, outY-> step );
13751409 // No improve with AVX2
1376- // CTCountOfBitsChangedSegment_AVX2(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[threadIdx].blendingStartRow, gIppCtx.segZones[threadIdx].blendingEndRow}, outY->pData, cols);
1410+ // CTCountOfBitsChangedSegment_AVX2(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[threadIdx].blendingStartRow, gIppCtx.segZones[threadIdx].blendingEndRow}, outY->pData, cols, outY->step );
13771411 }
13781412
13791413 threadStatus[threadIdx] = 1 ;
0 commit comments