88use Codewithkyrian \Transformers \Utils \Image ;
99use Codewithkyrian \Transformers \Utils \Tensor ;
1010use Imagine \Image \Point ;
11+ use Interop \Polite \Math \Matrix \NDArray ;
1112use function Codewithkyrian \Transformers \Utils \timeUsage ;
1213
1314class ImageFeatureExtractor extends FeatureExtractor
@@ -98,37 +99,6 @@ public function __construct(public array $config)
9899 }
99100 }
100101
101- /**
102- * Resize the image to make a thumbnail.
103- * @param Image $image The image to be resized.
104- * @param array{height: int, width: int} $size The size ['height' => h, 'width' => w] to resize the image to.
105- * @param int $resample The resampling filter to use.
106- * @return Image The resized image.
107- */
108- public function thumbnail (Image $ image , array $ size , int $ resample = 2 ): Image
109- {
110- $ inputHeight = $ image ->height ();
111- $ inputWidth = $ image ->width ();
112-
113- $ outputHeight = $ size ['height ' ];
114- $ outputWidth = $ size ['width ' ];
115-
116- // We always resize to the smallest of either the input or output size.
117- $ height = min ($ inputHeight , $ outputHeight );
118- $ width = min ($ inputWidth , $ outputWidth );
119-
120- if ($ height === $ inputHeight && $ width === $ inputWidth ) {
121- return $ image ;
122- }
123-
124- if ($ inputHeight > $ inputWidth ) {
125- $ width = floor ($ inputWidth * $ height / $ inputHeight );
126- } elseif ($ inputWidth > $ inputHeight ) {
127- $ height = floor ($ inputHeight * $ width / $ inputWidth );
128- }
129-
130- return $ image ->resize ($ width , $ height , $ resample );
131- }
132102
133103 /**
134104 * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold).
@@ -184,24 +154,23 @@ public function cropMargin(Image $image, int $grayThreshold = 200): static
184154
185155 /**
186156 * Pad the image by a certain amount.
187- * @param array $pixelData The pixel data to pad.
188- * @param int[] $imgShape The dimensions of the image (height, width, channels).
157+ * @param Tensor $imageTensor The pixel data to pad.
189158 * @param int[]|int $padSize The dimensions of the padded image.
190159 * @param string $mode The type of padding to add.
191160 * @param bool $center Whether to center the image.
192161 * @param int $constantValues The constant value to use for padding.
193- * @return array{0: array, 1: int[]} The padded pixel data and image dimensions.
162+ * @return Tensor The padded image tensor, or the input tensor unchanged when no padding is needed.
163+ * @throws \Exception
194164 */
195165 public function padImage (
196- array $ pixelData ,
197- array $ imgShape ,
166+ Tensor $ imageTensor ,
198167 int |array $ padSize ,
199168 string $ mode = 'constant ' ,
200169 bool $ center = false ,
201170 int $ constantValues = 0
202- ): array
171+ ): Tensor
203172 {
204- [$ imageHeight , $ imageWidth , $ imageChannels ] = $ imgShape ;
173+ [$ imageHeight , $ imageWidth , $ imageChannels ] = $ imageTensor -> shape () ;
205174
206175 if (is_array ($ padSize )) {
207176 $ paddedImageWidth = $ padSize ['width ' ];
@@ -213,17 +182,18 @@ public function padImage(
213182
214183 // Only add padding if there is a difference in size
215184 if ($ paddedImageWidth !== $ imageWidth || $ paddedImageHeight !== $ imageHeight ) {
216-
217- $ paddedPixelData = array_fill (0 , $ paddedImageWidth * $ paddedImageHeight * $ imageChannels , 0 );
185+ $ paddedShape = [$ paddedImageWidth , $ paddedImageHeight , $ imageChannels ];
218186
219187 if (is_array ($ constantValues )) {
188+ $ paddedPixelData = Tensor::fill ($ paddedShape , 0 );
189+
220190 // Fill with constant values, cycling through the array
221191 $ constantValuesLength = count ($ constantValues );
222- for ($ i = 0 ; $ i < count ( $ paddedPixelData ); ++$ i ) {
223- $ paddedPixelData [$ i ] = $ constantValues [$ i % $ constantValuesLength ];
192+ for ($ i = 0 ; $ i < $ paddedPixelData-> size ( ); ++$ i ) {
193+ $ paddedPixelData-> buffer () [$ i ] = $ constantValues [$ i % $ constantValuesLength ];
224194 }
225- } else if ( $ constantValues !== 0 ) {
226- $ paddedPixelData = array_fill ( 0 , $ paddedImageWidth * $ paddedImageHeight * $ imageChannels , $ constantValues );
195+ } else {
196+ $ paddedPixelData = Tensor:: fill ( $ paddedShape , $ constantValues );
227197 }
228198
229199 [$ left , $ top ] = $ center ?
@@ -240,7 +210,7 @@ public function padImage(
240210 $ d = ($ b + $ j ) * $ imageChannels ;
241211
242212 for ($ k = 0 ; $ k < $ imageChannels ; ++$ k ) {
243- $ paddedPixelData [$ c + $ k ] = $ pixelData [$ d + $ k ];
213+ $ paddedPixelData-> buffer () [$ c + $ k ] = $ imageTensor -> buffer () [$ d + $ k ];
244214 }
245215 }
246216 }
@@ -264,18 +234,17 @@ public function padImage(
264234
265235 // Copy channel-wise
266236 for ($ k = 0 ; $ k < $ imageChannels ; ++$ k ) {
267- $ paddedPixelData [$ c + $ k ] = $ pixelData [$ d + $ k ];
237+ $ paddedPixelData-> buffer () [$ c + $ k ] = $ imageTensor -> buffer () [$ d + $ k ];
268238 }
269239 }
270240 }
271241 }
272242
273243 // Update pixel data and image dimensions
274- $ pixelData = $ paddedPixelData ;
275- $ imgShape = [$ paddedImageHeight , $ paddedImageWidth , $ imageChannels ];
244+ $ imageTensor = $ paddedPixelData ;
276245 }
277246
278- return [ $ pixelData , $ imgShape ] ;
247+ return $ imageTensor ;
279248 }
280249
281250 private function calculateReflectOffset (int $ val , int $ max ): int
@@ -284,17 +253,6 @@ private function calculateReflectOffset(int $val, int $max): int
284253 return $ mod > $ max ? $ max - ($ mod - $ max ) : $ mod ;
285254 }
286255
287- /**
288- * Rescale the image's pixel values by the specified rescale factor.
289- * @param array $pixelData The pixel data to rescale.
290- * @return void
291- */
292- public function rescale (array &$ pixelData ): void
293- {
294- for ($ i = 0 ; $ i < count ($ pixelData ); ++$ i ) {
295- $ pixelData [$ i ] *= $ this ->rescaleFactor ;
296- }
297- }
298256
299257 /**
300258 * Find the target (width, height) dimension of the output image after
@@ -379,17 +337,6 @@ public function getResizeOutputImageSize(Image $image, int|array|null $size): ar
379337 }
380338 }
381339
382- /**
383- * Resizes the image.
384- * @param Image $image The image to resize.
385- * @return Image The resized image.
386- */
387- public function resize (Image $ image ): Image
388- {
389- [$ newWidth , $ newHeight ] = $ this ->getResizeOutputImageSize ($ image , $ this ->size );
390-
391- return $ image ->resize ($ newWidth , $ newHeight , $ this ->resample );
392- }
393340
394341 /**
395342 * Preprocesses the given image.
@@ -417,7 +364,7 @@ public function preprocess(
417364 }
418365
419366
420- [ $ srcWidth , $ srcHeight ] = $ image ->size (); // original image size
367+ $ originalInputSize = $ image ->size (); // original image size
421368
422369 // Convert image to RGB if specified in config.
423370 if ($ doConvertRGB ?? $ this ->doConvertRGB ) {
@@ -426,14 +373,16 @@ public function preprocess(
426373 $ image ->grayscale ();
427374 }
428375
429- // Resize all images
376+ // Resize if specified in config.
430377 if ($ this ->doResize ) {
431- $ this ->resize ($ image );
378+ [$ newWidth , $ newHeight ] = $ this ->getResizeOutputImageSize ($ image , $ this ->size );
379+
380+ $ image ->resize ($ newWidth , $ newHeight , $ this ->resample );
432381 }
433382
434383 // Resize the image using thumbnail method.
435384 if ($ this ->doThumbnail ) {
436- $ this ->thumbnail ($ image , $ this ->size , $ this ->resample );
385+ $ image ->thumbnail ($ this -> size [ ' width ' ] , $ this ->size [ ' height ' ] , $ this ->resample );
437386 }
438387
439388 if ($ this ->doCenterCrop ) {
@@ -449,59 +398,59 @@ public function preprocess(
449398 $ image ->centerCrop ($ cropWidth , $ cropHeight );
450399 }
451400
452- $ reshapedInputSize = [$ image ->height (), $ image ->width ()];
453-
454- // All pixel-level manipulation occurs with data in the hwc format (height, width, channels),
455- // to emulate the behavior of the original Python code (w/ numpy).
456- $ pixelData = $ image ->pixelData ();
401+ $ reshapedInputSize = $ image ->size ();
457402
458- $ imgShape = [ $ image ->height (), $ image -> width (), $ image -> channels ] ;
403+ $ imageTensor = $ image ->toTensor () ;
459404
460405 if ($ this ->doRescale ) {
461- $ this -> rescale ( $ pixelData );
406+ $ imageTensor = $ imageTensor -> multiply ( $ this -> rescaleFactor );
462407 }
463408
464-
465409 if ($ doNormalize ?? $ this ->doNormalize ) {
466- $ imageMean = $ this ->imageMean ;
467- if (!is_array ($ this ->imageMean )) {
468- $ imageMean = array_fill (0 , $ image ->channels , $ this ->imageMean );
410+ if (is_array ($ this ->imageMean )) {
411+ // Negate the mean values to add instead of subtract
412+ $ negatedMean = array_map (fn ($ mean ) => -$ mean , $ this ->imageMean );
413+ $ imageMean = Tensor::repeat ($ negatedMean , $ image ->height () * $ image ->width (), 1 );
414+ } else {
415+ $ imageMean = Tensor::fill ([$ image ->channels * $ image ->height () * $ image ->width ()], -$ this ->imageMean );
469416 }
470417
471- $ imageStd = $ this ->imageStd ;
472- if (!is_array ($ this ->imageStd )) {
473- $ imageStd = array_fill (0 , $ image ->channels , $ this ->imageMean );
418+
419+ if (is_array ($ this ->imageStd )) {
420+ // Invert the standard deviation values to multiply instead of divide
421+ $ inversedStd = array_map (fn ($ std ) => 1 / $ std , $ this ->imageStd );
422+ $ imageStd = Tensor::repeat ($ inversedStd , $ image ->height () * $ image ->width (), 1 );
423+ } else {
424+ $ imageStd = Tensor::fill ([$ image ->channels * $ image ->height () * $ image ->width ()], 1 / $ this ->imageStd );
474425 }
475426
427+
428+ // Reshape mean and std to match the image tensor shape
429+ $ imageMean = $ imageMean ->reshape ($ imageTensor ->shape ());
430+ $ imageStd = $ imageStd ->reshape ($ imageTensor ->shape ());
431+
476432 if (count ($ imageMean ) !== $ image ->channels || count ($ imageStd ) !== $ image ->channels ) {
477- throw new \Exception ("When set to arrays, the length of `imageMean` ( " . count ($ imageMean ) . ") and `imageStd` ( " . count ($ imageStd ) . ") must match the number of channels in the image ( {$ image ->channels () }). " );
433+ throw new \Exception ("When set to arrays, the length of `imageMean` ( " . count ($ imageMean ) . ") and `imageStd` ( " . count ($ imageStd ) . ") must match the number of channels in the image ( {$ image ->channels }). " );
478434 }
479435
480436 // Normalize pixel data
481- for ($ i = 0 ; $ i < count ($ pixelData ); $ i += $ image ->channels ) {
482- for ($ j = 0 ; $ j < $ image ->channels ; ++$ j ) {
483- $ pixelData [$ i + $ j ] = ($ pixelData [$ i + $ j ] - $ imageMean [$ j ]) / $ imageStd [$ j ];
484- }
485- }
437+ $ imageTensor = $ imageTensor ->add ($ imageMean )->multiply ($ imageStd );
486438 }
487439
488440 // Perform padding after rescaling/normalizing
489441 if ($ doPad ?? $ this ->doPad ) {
490442 if ($ this ->padSize !== null ) {
491- [ $ pixelData , $ imgShape ] = $ this ->padImage ($ pixelData , $ imgShape , $ this ->padSize );
443+ $ imageTensor = $ this ->padImage ($ imageTensor , $ this ->padSize );
492444 } elseif ($ this ->sizeDivisibility !== null ) {
493- [$ paddedWidth , $ paddedHeight ] = $ this ->enforceSizeDivisibility ([$ imgShape [1 ], $ imgShape [0 ]], $ this ->sizeDivisibility );
494- [ $ pixelData , $ imgShape ] = $ this ->padImage ($ pixelData , $ imgShape , ['width ' => $ paddedWidth , 'height ' => $ paddedHeight ]);
445+ [$ paddedWidth , $ paddedHeight ] = $ this ->enforceSizeDivisibility ([$ imageTensor -> shape () [1 ], $ imageTensor -> shape () [0 ]], $ this ->sizeDivisibility );
446+ $ imageTensor = $ this ->padImage ($ imageTensor , ['width ' => $ paddedWidth , 'height ' => $ paddedHeight ]);
495447 }
496448 }
497449
498- // Convert to channel dimension format (hwc -> chw)
499- $ pixelValues = Tensor::fromArray ($ pixelData , Tensor::float32, $ imgShape )->permute (2 , 0 , 1 );
500-
501450 return [
502- 'original_size ' => [ $ srcHeight , $ srcWidth ] ,
451+ 'original_size ' => $ originalInputSize ,
503452 'reshaped_input_size ' => $ reshapedInputSize ,
504- 'pixel_values ' => $ pixelValues ,
453+ 'pixel_values ' => $ imageTensor ,
505454 ];
506455 }
507456
0 commit comments