Fix the bug where RAISR does not work correctly at some resolutions

xiaoxial · xiaoxial · commit 99f50567c10d · 2022-12-23T10:21:02.000+08:00
Signed-off-by: Xiaoxia Liang <xiaoxia.liang@intel.com>
diff --git a/Library/Raisr.cpp b/Library/Raisr.cpp
@@ -809,7 +809,8 @@ static void CTCountOfBitsChanged_AVX2(float *LRImage, float *HRImage, float *out
 // LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.
 // rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.
 // cols: stride for buffers in DT type.
-static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols)
+// outImageCols: stride for outImage buffer
+static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols, const int outImageCols)
 {
     int rowStartOffset = blendingZone.first - startRow;
     int rowEndOffset = blendingZone.second - startRow;
@@ -841,12 +842,12 @@ static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, con
             // convert 32f to 8bit/10bit
             if (gBitDepth == 8)
             {
-                outImage[(startRow + r) * cols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
+                outImage[(startRow + r) * outImageCols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
             }
             else
             {
                 unsigned short *out = (unsigned short *)outImage;
-                out[(startRow + r) * cols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
+                out[(startRow + r) * outImageCols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
             }
         }
     }
@@ -855,8 +856,9 @@ static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, con
 // LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.
 // rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.
 // cols: stride for buffers in DT type.
+// outImageCols: stride for outImage buffer
 template <typename DT>
-static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols)
+static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols, const int outImageCols)
 {
     // run census transform on a CTwindowSize * CTwindowSize block centered by [r, c]
     int rowStartOffset = blendingZone.first - startRow;
@@ -883,12 +885,12 @@ static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows
             // convert 32f to 8bit/10bit
             if (gBitDepth == 8)
             {
-                outImage[(startRow + r) * cols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
+                outImage[(startRow + r) * outImageCols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
             }
             else
             {
                 unsigned short *out = (unsigned short *)outImage;
-                out[(startRow + r) * cols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
+                out[(startRow + r) * outImageCols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
             }
         }
     }
@@ -1224,9 +1226,12 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
             outY = gIntermediateY; // change the output to VideoDataType gIntermediateY to save the output of 1st pass
         }
 
+        // step is the line size (stride) of a frame. For video, the line size should be a multiple of the CPU alignment (16 or 32 bytes),
+        // so outY->step may be greater than or equal to outY->width.
+        // The step of gIppCtx.segZones[passIdx][threadIdx].inYUpscaled is equal to outY->width.
         const int rows = outY->height;
         const int cols = outY->width;
-        const int step = outY->step;
+        const int step = outY->width;
 
         // 1. Prepare cheap up-scaled 32f data
         IppStatus status = ippStsNoErr;
@@ -1267,8 +1272,8 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
             ippiConvert_8u32f_C1R(pDst, cols,
                                   pSeg32f, cols * sizeof(float), {(int)cols, segRows});
         else
-            ippiConvert_16u32f_C1R((Ipp16u *)pDst, step,
-                                   pSeg32f, cols * sizeof(float), {(int)cols, segRows});
+            ippiConvert_16u32f_C1R((Ipp16u *)pDst, cols,
+                                  pSeg32f, cols * sizeof(float), {(int)cols, segRows});
 
         // 2. Run hashing
         // Update startRow, endRow for hashing algo
@@ -1278,13 +1283,33 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
         // Handle top and bottom borders
         if (startRow == 0)
         {
-            memcpy(outY->pData, pDst, step * gLoopMargin + gLoopMargin);
+            // memcpy must be done line by line when the line size of outY->pData is not equal to pDst's line size.
+            if (step == outY->step) {
+                memcpy(outY->pData, pDst, outY->step * gLoopMargin + gLoopMargin);
+            } else {
+                for (int i = 0; i < gLoopMargin; i++) {
+                     memcpy(outY->pData + i * outY->step, pDst + i * step, step);
+                }
+                memcpy(outY->pData + gLoopMargin * outY->step, pDst + gLoopMargin * step, gLoopMargin);
+            }
         }
         if (endRow == rows)
         {
-            memcpy(outY->pData + (rows - gLoopMargin) * step - gLoopMargin,
-                   pDst + (segRows - gLoopMargin) * step - gLoopMargin,
-                   step * gLoopMargin + gLoopMargin);
+            if (step == outY->step) {
+                memcpy(outY->pData + (rows - gLoopMargin) * step - gLoopMargin,
+                       pDst + (segRows - gLoopMargin) * step - gLoopMargin,
+                       outY->step * gLoopMargin + gLoopMargin);
+            } else {
+                memcpy(outY->pData + (rows - gLoopMargin - 1) * outY->step +  outY->width - gLoopMargin,
+                       pDst + (segRows - gLoopMargin) * step - gLoopMargin,
+                       gLoopMargin);
+
+                for (int i = gLoopMargin; i > 0; i--) {
+                     memcpy(outY->pData + (rows - i) * outY->step,
+                            pDst  + (segRows - i) * step,
+                            step);
+                }
+            }
         }
         memcpy(pRaisr32f, pSeg32f, sizeof(float) * cols * segRows);
 
@@ -1353,27 +1378,36 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen
                             val += 0.5; // to round the value
                             if (gBitDepth == 8)
                             {
-                                outY->pData[r * cols + c + pix] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
+                                outY->pData[r * outY->step + c + pix] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));
                             }
                             else
                             {
                                 unsigned short *out = (unsigned short *)outY->pData;
-                                out[r * cols + c + pix] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
+                                out[r * outY->step + c + pix] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));
                             }
                         }
                     }
                 }
             }
             // Copy right border pixels for this row and left border pixels for next row
-            memcpy(outY->pData + r * step - gLoopMargin, pDst + rOffset * step - gLoopMargin, 2 * gLoopMargin);
+            if (step == outY->step) {
+                memcpy(outY->pData + r * step - gLoopMargin, pDst + rOffset * step - gLoopMargin, 2 * gLoopMargin);
+            } else {
+                memcpy(outY->pData + (r -1 ) * outY->step + outY->width - gLoopMargin,
+                       pDst + rOffset * step - gLoopMargin,
+                       gLoopMargin);
+                memcpy(outY->pData + r * outY->step,
+                       pDst + rOffset * step,
+                       gLoopMargin);
+            }
         }
         // 3. Run CT-Blending
         if (blendingMode == CountOfBitsChanged)
         {
             int segStart = gIppCtx.segZones[passIdx][threadIdx].scaleStartRow;
-            CTCountOfBitsChangedSegment<float>(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[passIdx][threadIdx].blendingStartRow, gIppCtx.segZones[passIdx][threadIdx].blendingEndRow}, outY->pData, cols);
+            CTCountOfBitsChangedSegment<float>(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[passIdx][threadIdx].blendingStartRow, gIppCtx.segZones[passIdx][threadIdx].blendingEndRow}, outY->pData, cols, outY->step);
             // No improve with AVX2
-            // CTCountOfBitsChangedSegment_AVX2(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[threadIdx].blendingStartRow, gIppCtx.segZones[threadIdx].blendingEndRow}, outY->pData, cols);
+            // CTCountOfBitsChangedSegment_AVX2(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[threadIdx].blendingStartRow, gIppCtx.segZones[threadIdx].blendingEndRow}, outY->pData, cols, outY->step);
         }
 
         threadStatus[threadIdx] = 1;

Original file line number	Diff line number	Diff line change
`@@ -809,7 +809,8 @@ static void CTCountOfBitsChanged_AVX2(float *LRImage, float *HRImage, float *out`
`809`	`809`	`// LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.`
`810`	`810`	`// rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.`
`811`	`811`	`// cols: stride for buffers in DT type.`
`812`		`-static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols)`
	`812`	`+// outImageCols: stride for outImage buffer`
	`813`	`+static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols, const int outImageCols)`
`813`	`814`	`{`
`814`	`815`	`int rowStartOffset = blendingZone.first - startRow;`
`815`	`816`	`int rowEndOffset = blendingZone.second - startRow;`
`@@ -841,12 +842,12 @@ static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, con`
`841`	`842`	`// convert 32f to 8bit/10bit`
`842`	`843`	`if (gBitDepth == 8)`
`843`	`844`	`{`
`844`		`- outImage[(startRow + r) * cols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));`
	`845`	`+ outImage[(startRow + r) * outImageCols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));`
`845`	`846`	`}`
`846`	`847`	`else`
`847`	`848`	`{`
`848`	`849`	`unsigned short *out = (unsigned short *)outImage;`
`849`		`- out[(startRow + r) * cols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));`
	`850`	`+ out[(startRow + r) * outImageCols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));`
`850`	`851`	`}`
`851`	`852`	`}`
`852`	`853`	`}`
`@@ -855,8 +856,9 @@ static void CTCountOfBitsChangedSegment_AVX2(float *LRImage, float *HRImage, con`
`855`	`856`	`// LRImage: cheap up scaled. HRImage: RAISR refined. outImage: output buffer in 8u.`
`856`	`857`	`// rows: rows of LRImage/HRImage. startRow: seg start row. blendingZone: zone to run blending.`
`857`	`858`	`// cols: stride for buffers in DT type.`
	`859`	`+// outImageCols: stride for outImage buffer`
`858`	`860`	`template <typename DT>`
`859`		`-static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols)`
	`861`	`+static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows, const int startRow, const std::pair<int, int> blendingZone, unsigned char *outImage, const int cols, const int outImageCols)`
`860`	`862`	`{`
`861`	`863`	`// run census transform on a CTwindowSize * CTwindowSize block centered by [r, c]`
`862`	`864`	`int rowStartOffset = blendingZone.first - startRow;`
`@@ -883,12 +885,12 @@ static void CTCountOfBitsChangedSegment(DT *LRImage, DT *HRImage, const int rows`
`883`	`885`	`// convert 32f to 8bit/10bit`
`884`	`886`	`if (gBitDepth == 8)`
`885`	`887`	`{`
`886`		`- outImage[(startRow + r) * cols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));`
	`888`	`+ outImage[(startRow + r) * outImageCols + c] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));`
`887`	`889`	`}`
`888`	`890`	`else`
`889`	`891`	`{`
`890`	`892`	`unsigned short *out = (unsigned short *)outImage;`
`891`		`- out[(startRow + r) * cols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));`
	`893`	`+ out[(startRow + r) * outImageCols + c] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));`
`892`	`894`	`}`
`893`	`895`	`}`
`894`	`896`	`}`
`@@ -1224,9 +1226,12 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen`
`1224`	`1226`	`outY = gIntermediateY; // change the output to VideoDataType gIntermediateY to save the output of 1st pass`
`1225`	`1227`	`}`
`1226`	`1228`
	`1229`	`+ // step is the line size (stride) of a frame. For video, the line size should be a multiple of the CPU alignment (16 or 32 bytes),`
	`1230`	`+ // so outY->step may be greater than or equal to outY->width.`
	`1231`	`+ // The step of gIppCtx.segZones[passIdx][threadIdx].inYUpscaled is equal to outY->width.`
`1227`	`1232`	`const int rows = outY->height;`
`1228`	`1233`	`const int cols = outY->width;`
`1229`		`- const int step = outY->step;`
	`1234`	`+ const int step = outY->width;`
`1230`	`1235`
`1231`	`1236`	`// 1. Prepare cheap up-scaled 32f data`
`1232`	`1237`	`IppStatus status = ippStsNoErr;`
`@@ -1267,8 +1272,8 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen`
`1267`	`1272`	`ippiConvert_8u32f_C1R(pDst, cols,`
`1268`	`1273`	`pSeg32f, cols * sizeof(float), {(int)cols, segRows});`
`1269`	`1274`	`else`
`1270`		`- ippiConvert_16u32f_C1R((Ipp16u *)pDst, step,`
`1271`		`- pSeg32f, cols * sizeof(float), {(int)cols, segRows});`
	`1275`	`+ ippiConvert_16u32f_C1R((Ipp16u *)pDst, cols,`
	`1276`	`+ pSeg32f, cols * sizeof(float), {(int)cols, segRows});`
`1272`	`1277`
`1273`	`1278`	`// 2. Run hashing`
`1274`	`1279`	`// Update startRow, endRow for hashing algo`
`@@ -1278,13 +1283,33 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen`
`1278`	`1283`	`// Handle top and bottom borders`
`1279`	`1284`	`if (startRow == 0)`
`1280`	`1285`	`{`
`1281`		`- memcpy(outY->pData, pDst, step * gLoopMargin + gLoopMargin);`
	`1286`	`+ // memcpy must be done line by line when the line size of outY->pData is not equal to pDst's line size.`
	`1287`	`+ if (step == outY->step) {`
	`1288`	`+ memcpy(outY->pData, pDst, outY->step * gLoopMargin + gLoopMargin);`
	`1289`	`+ } else {`
	`1290`	`+ for (int i = 0; i < gLoopMargin; i++) {`
	`1291`	`+ memcpy(outY->pData + i * outY->step, pDst + i * step, step);`
	`1292`	`+ }`
	`1293`	`+ memcpy(outY->pData + gLoopMargin * outY->step, pDst + gLoopMargin * step, gLoopMargin);`
	`1294`	`+ }`
`1282`	`1295`	`}`
`1283`	`1296`	`if (endRow == rows)`
`1284`	`1297`	`{`
`1285`		`- memcpy(outY->pData + (rows - gLoopMargin) * step - gLoopMargin,`
`1286`		`- pDst + (segRows - gLoopMargin) * step - gLoopMargin,`
`1287`		`- step * gLoopMargin + gLoopMargin);`
	`1298`	`+ if (step == outY->step) {`
	`1299`	`+ memcpy(outY->pData + (rows - gLoopMargin) * step - gLoopMargin,`
	`1300`	`+ pDst + (segRows - gLoopMargin) * step - gLoopMargin,`
	`1301`	`+ outY->step * gLoopMargin + gLoopMargin);`
	`1302`	`+ } else {`
	`1303`	`+ memcpy(outY->pData + (rows - gLoopMargin - 1) * outY->step + outY->width - gLoopMargin,`
	`1304`	`+ pDst + (segRows - gLoopMargin) * step - gLoopMargin,`
	`1305`	`+ gLoopMargin);`
	`1306`	`+`
	`1307`	`+ for (int i = gLoopMargin; i > 0; i--) {`
	`1308`	`+ memcpy(outY->pData + (rows - i) * outY->step,`
	`1309`	`+ pDst + (segRows - i) * step,`
	`1310`	`+ step);`
	`1311`	`+ }`
	`1312`	`+ }`
`1288`	`1313`	`}`
`1289`	`1314`	`memcpy(pRaisr32f, pSeg32f, sizeof(float) * cols * segRows);`
`1290`	`1315`
`@@ -1353,27 +1378,36 @@ RNLERRORTYPE processSegment(VideoDataType *srcY, VideoDataType *final_outY, Blen`
`1353`	`1378`	`val += 0.5; // to round the value`
`1354`	`1379`	`if (gBitDepth == 8)`
`1355`	`1380`	`{`
`1356`		`- outY->pData[r * cols + c + pix] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));`
	`1381`	`+ outY->pData[r * outY->step + c + pix] = (unsigned char)(val < gMin8bit ? gMin8bit : (val > gMax8bit ? gMax8bit : val));`
`1357`	`1382`	`}`
`1358`	`1383`	`else`
`1359`	`1384`	`{`
`1360`	`1385`	`unsigned short *out = (unsigned short *)outY->pData;`
`1361`		`- out[r * cols + c + pix] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));`
	`1386`	`+ out[r * outY->step + c + pix] = (unsigned short)(val < gMin16bit ? gMin16bit : (val > gMax16bit ? gMax16bit : val));`
`1362`	`1387`	`}`
`1363`	`1388`	`}`
`1364`	`1389`	`}`
`1365`	`1390`	`}`
`1366`	`1391`	`}`
`1367`	`1392`	`// Copy right border pixels for this row and left border pixels for next row`
`1368`		`- memcpy(outY->pData + r * step - gLoopMargin, pDst + rOffset * step - gLoopMargin, 2 * gLoopMargin);`
	`1393`	`+ if (step == outY->step) {`
	`1394`	`+ memcpy(outY->pData + r * step - gLoopMargin, pDst + rOffset * step - gLoopMargin, 2 * gLoopMargin);`
	`1395`	`+ } else {`
	`1396`	`+ memcpy(outY->pData + (r -1 ) * outY->step + outY->width - gLoopMargin,`
	`1397`	`+ pDst + rOffset * step - gLoopMargin,`
	`1398`	`+ gLoopMargin);`
	`1399`	`+ memcpy(outY->pData + r * outY->step,`
	`1400`	`+ pDst + rOffset * step,`
	`1401`	`+ gLoopMargin);`
	`1402`	`+ }`
`1369`	`1403`	`}`
`1370`	`1404`	`// 3. Run CT-Blending`
`1371`	`1405`	`if (blendingMode == CountOfBitsChanged)`
`1372`	`1406`	`{`
`1373`	`1407`	`int segStart = gIppCtx.segZones[passIdx][threadIdx].scaleStartRow;`
`1374`		`- CTCountOfBitsChangedSegment<float>(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[passIdx][threadIdx].blendingStartRow, gIppCtx.segZones[passIdx][threadIdx].blendingEndRow}, outY->pData, cols);`
	`1408`	`+ CTCountOfBitsChangedSegment<float>(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[passIdx][threadIdx].blendingStartRow, gIppCtx.segZones[passIdx][threadIdx].blendingEndRow}, outY->pData, cols, outY->step);`
`1375`	`1409`	`// No improve with AVX2`
`1376`		`- // CTCountOfBitsChangedSegment_AVX2(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[threadIdx].blendingStartRow, gIppCtx.segZones[threadIdx].blendingEndRow}, outY->pData, cols);`
	`1410`	`+ // CTCountOfBitsChangedSegment_AVX2(pSeg32f, pRaisr32f, segRows, segStart, {gIppCtx.segZones[threadIdx].blendingStartRow, gIppCtx.segZones[threadIdx].blendingEndRow}, outY->pData, cols, outY->step);`
`1377`	`1411`	`}`
`1378`	`1412`
`1379`	`1413`	`threadStatus[threadIdx] = 1;`