@@ -110,34 +110,62 @@ public void FillDestinationPixels(RowInterval rowInterval, Buffer2D<TPixel> dest
110110 {
111111 Span < Vector4 > tempColSpan = this . tempColumnBuffer . GetSpan ( ) ;
112112
113- // When creating transposedFirstPassBuffer, we made sure it's contiguous:
113+ // When creating transposedFirstPassBuffer, we made sure it's contiguous.
114114 Span < Vector4 > transposedFirstPassBufferSpan = this . transposedFirstPassBuffer . DangerousGetSingleSpan ( ) ;
115115
116116 int left = this . targetWorkingRect . Left ;
117- int right = this . targetWorkingRect . Right ;
118117 int width = this . targetWorkingRect . Width ;
118+ nuint widthCount = ( uint ) width ;
119+
120+ // Normalize destination-space Y to kernel indices using uint arithmetic.
121+ // This relies on the contract that processing addresses are normalized (cropping/padding handled by targetOrigin).
122+ int targetOriginY = this . targetOrigin . Y ;
123+
124+ // Hoist invariant calculations outside the loop.
125+ int currentWindowMax = this . currentWindow . Max ;
126+ int currentWindowMin = this . currentWindow . Min ;
127+ nuint workerHeight = ( uint ) this . workerHeight ;
128+ nuint workerHeight2 = workerHeight * 2 ;
129+
130+ // Ref-walk the kernel table to avoid bounds checks in the tight loop.
131+ ReadOnlySpan < ResizeKernel > vKernels = this . verticalKernelMap . GetKernelSpan ( ) ;
132+ ref ResizeKernel vKernelBase = ref MemoryMarshal . GetReference ( vKernels ) ;
133+
134+ ref Vector4 tempRowBase = ref MemoryMarshal . GetReference ( tempColSpan ) ;
135+
119136 for ( int y = rowInterval . Min ; y < rowInterval . Max ; y ++ )
120137 {
121- // Ensure offsets are normalized for cropping and padding.
122- ResizeKernel kernel = this . verticalKernelMap . GetKernel ( ( uint ) ( y - this . targetOrigin . Y ) ) ;
138+ // Normalize destination-space Y to an unsigned kernel index.
139+ uint vIdx = ( uint ) ( y - targetOriginY ) ;
140+ ref ResizeKernel kernel = ref Unsafe . Add ( ref vKernelBase , ( nint ) vIdx ) ;
123141
124- while ( kernel . StartIndex + kernel . Length > this . currentWindow . Max )
142+ // Slide the working window when the kernel would read beyond the current cached region.
143+ int kernelEnd = kernel . StartIndex + kernel . Length ;
144+ while ( kernelEnd > currentWindowMax )
125145 {
126146 this . Slide ( ) ;
147+ currentWindowMax = this . currentWindow . Max ;
148+ currentWindowMin = this . currentWindow . Min ;
127149 }
128150
129- ref Vector4 tempRowBase = ref MemoryMarshal . GetReference ( tempColSpan ) ;
151+ int top = kernel . StartIndex - currentWindowMin ;
152+ ref Vector4 colRef0 = ref transposedFirstPassBufferSpan [ top ] ;
130153
131- int top = kernel . StartIndex - this . currentWindow . Min ;
154+ // Unroll by 2 and advance column refs via arithmetic to reduce inner-loop overhead.
155+ nuint i = 0 ;
156+ for ( ; i + 1 < widthCount ; i += 2 )
157+ {
158+ ref Vector4 colRef1 = ref Unsafe . Add ( ref colRef0 , workerHeight ) ;
132159
133- ref Vector4 fpBase = ref transposedFirstPassBufferSpan [ top ] ;
160+ Unsafe . Add ( ref tempRowBase , i ) = kernel. ConvolveCore ( ref colRef0 ) ;
161+ Unsafe . Add ( ref tempRowBase , i + 1 ) = kernel. ConvolveCore ( ref colRef1 ) ;
134162
135- for ( nuint x = 0 ; x < ( uint ) ( right - left ) ; x ++ )
136- {
137- ref Vector4 firstPassColumnBase = ref Unsafe . Add ( ref fpBase , x * ( uint ) this . workerHeight ) ;
163+ colRef0 = ref Unsafe . Add ( ref colRef0 , workerHeight2 ) ;
164+ }
138165
139- // Destination color components
140- Unsafe . Add ( ref tempRowBase , x ) = kernel. ConvolveCore ( ref firstPassColumnBase ) ;
166+ if ( i < widthCount )
167+ {
168+ Unsafe . Add ( ref tempRowBase , i ) = kernel. ConvolveCore ( ref colRef0 ) ;
141169 }
142170
143171 Span < TPixel > targetRowSpan = destination . DangerousGetRowSpan ( y ) . Slice ( left , width ) ;
@@ -171,7 +199,19 @@ private void CalculateFirstPassValues(RowInterval calculationInterval)
171199
172200 nuint left = ( uint ) this . targetWorkingRect . Left ;
173201 nuint right = ( uint ) this . targetWorkingRect . Right ;
202+ nuint widthCount = right - left ;
203+
204+ // Normalize destination-space X to kernel indices using uint arithmetic.
205+ // This relies on the contract that processing addresses are normalized (cropping/padding handled by targetOrigin).
174206 nuint targetOriginX = ( uint ) this . targetOrigin . X ;
207+
208+ nuint workerHeight = ( uint ) this . workerHeight ;
209+ int currentWindowMin = this . currentWindow . Min ;
210+
211+ // Ref-walk the kernel table to avoid bounds checks in the tight loop.
212+ ReadOnlySpan < ResizeKernel > hKernels = this . horizontalKernelMap . GetKernelSpan ( ) ;
213+ ref ResizeKernel hKernelBase = ref MemoryMarshal . GetReference ( hKernels ) ;
214+
175215 for ( int y = calculationInterval . Min ; y < calculationInterval . Max ; y ++ )
176216 {
177217 Span < TPixel > sourceRow = this . source . DangerousGetRowSpan ( y ) ;
@@ -182,17 +222,30 @@ private void CalculateFirstPassValues(RowInterval calculationInterval)
182222 tempRowSpan ,
183223 this . conversionModifiers ) ;
184224
185- // optimization for:
186- // Span<Vector4> firstPassSpan = transposedFirstPassBufferSpan.Slice(y - this.currentWindow.Min);
187- ref Vector4 firstPassBaseRef = ref transposedFirstPassBufferSpan [ y - this . currentWindow . Min ] ;
225+ ref Vector4 firstPassBaseRef = ref transposedFirstPassBufferSpan [ y - currentWindowMin ] ;
226+
227+ // Unroll by 2 to reduce loop and kernel lookup overhead.
228+ nuint x = left ;
229+ nuint z = 0 ;
230+
231+ for ( ; z + 1 < widthCount ; x += 2 , z += 2 )
232+ {
233+ nuint hIdx0 = ( uint ) ( x - targetOriginX ) ;
234+ nuint hIdx1 = ( uint ) ( ( x + 1 ) - targetOriginX ) ;
235+
236+ ref ResizeKernel kernel0 = ref Unsafe . Add ( ref hKernelBase , ( nint ) hIdx0 ) ;
237+ ref ResizeKernel kernel1 = ref Unsafe . Add ( ref hKernelBase , ( nint ) hIdx1 ) ;
238+
239+ Unsafe . Add ( ref firstPassBaseRef , z * workerHeight ) = kernel0. Convolve ( tempRowSpan ) ;
240+ Unsafe . Add ( ref firstPassBaseRef , ( z + 1 ) * workerHeight ) = kernel1. Convolve ( tempRowSpan ) ;
241+ }
188242
189- for ( nuint x = left , z = 0 ; x < right ; x ++ , z ++ )
243+ if ( z < widthCount )
190244 {
191- ResizeKernel kernel = this . horizontalKernelMap . GetKernel ( x - targetOriginX ) ;
245+ nuint hIdx = ( uint ) ( x - targetOriginX ) ;
246+ ref ResizeKernel kernel = ref Unsafe . Add ( ref hKernelBase , ( nint ) hIdx ) ;
192247
193- // optimization for:
194- // firstPassSpan[x * this.workerHeight] = kernel.Convolve(tempRowSpan);
195- Unsafe . Add ( ref firstPassBaseRef , z * ( uint ) this . workerHeight ) = kernel. Convolve ( tempRowSpan ) ;
248+ Unsafe . Add ( ref firstPassBaseRef , z * workerHeight ) = kernel. Convolve ( tempRowSpan ) ;
196249 }
197250 }
198251 }
0 commit comments