Move thumbnail method from feature extractor to Image class

CodeWithKyrian · CodeWithKyrian · commit 740a79790040 · 2024-05-01T15:40:35.000+01:00
diff --git a/examples/pipelines/text-generation.php b/examples/pipelines/text-generation.php
@@ -35,13 +35,14 @@
 );
 
 //$generator = pipeline('text-generation', 'Xenova/codegen-350M-mono');
-//$streamer = StdOutStreamer::make($generator->tokenizer);
+//$streamer = StdOutStreamer::make();
 //
 //$output = $generator(
 //    'def fib(n):',
 //    streamer: $streamer,
 //    maxNewTokens: 100,
-//    doSample: true
+//    doSample: true,
+//    returnFullText: false,
 //);
-
+//
 dd($output[0]['generated_text'], timeUsage(), memoryUsage());
diff --git a/src/FeatureExtractors/ImageFeatureExtractor.php b/src/FeatureExtractors/ImageFeatureExtractor.php
@@ -8,6 +8,7 @@
 use Codewithkyrian\Transformers\Utils\Image;
 use Codewithkyrian\Transformers\Utils\Tensor;
 use Imagine\Image\Point;
+use Interop\Polite\Math\Matrix\NDArray;
 use function Codewithkyrian\Transformers\Utils\timeUsage;
 
 class ImageFeatureExtractor extends FeatureExtractor
@@ -98,37 +99,6 @@ public function __construct(public array $config)
         }
     }
 
-    /**
-     * Resize the image to make a thumbnail.
-     * @param Image $image The image to be resized.
-     * @param array{height: int, width: int} $size The size ['height' => h, 'width' => w] to resize the image to.
-     * @param int $resample The resampling filter to use.
-     * @return Image The resized image.
-     */
-    public function thumbnail(Image $image, array $size, int $resample = 2): Image
-    {
-        $inputHeight = $image->height();
-        $inputWidth = $image->width();
-
-        $outputHeight = $size['height'];
-        $outputWidth = $size['width'];
-
-        // We always resize to the smallest of either the input or output size.
-        $height = min($inputHeight, $outputHeight);
-        $width = min($inputWidth, $outputWidth);
-
-        if ($height === $inputHeight && $width === $inputWidth) {
-            return $image;
-        }
-
-        if ($inputHeight > $inputWidth) {
-            $width = floor($inputWidth * $height / $inputHeight);
-        } elseif ($inputWidth > $inputHeight) {
-            $height = floor($inputHeight * $width / $inputWidth);
-        }
-
-        return $image->resize($width, $height, $resample);
-    }
 
     /**
      * Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold).
@@ -184,24 +154,23 @@ public function cropMargin(Image $image, int $grayThreshold = 200): static
 
     /**
      * Pad the image by a certain amount.
-     * @param array $pixelData The pixel data to pad.
-     * @param int[] $imgShape The dimensions of the image (height, width, channels).
+     * @param Tensor $imageTensor The pixel data to pad.
      * @param int[]|int $padSize The dimensions of the padded image.
      * @param string $mode The type of padding to add.
      * @param bool $center Whether to center the image.
      * @param int $constantValues The constant value to use for padding.
-     * @return array{0: array, 1: int[]} The padded pixel data and image dimensions.
+     * @return Tensor The padded image tensor (the input tensor is returned as-is when no padding is needed).
+     * @throws \Exception
      */
     public function padImage(
-        array     $pixelData,
-        array     $imgShape,
+        Tensor    $imageTensor,
         int|array $padSize,
         string    $mode = 'constant',
         bool      $center = false,
         int       $constantValues = 0
-    ): array
+    ): Tensor
     {
-        [$imageHeight, $imageWidth, $imageChannels] = $imgShape;
+        [$imageHeight, $imageWidth, $imageChannels] = $imageTensor->shape();
 
         if (is_array($padSize)) {
             $paddedImageWidth = $padSize['width'];
@@ -213,17 +182,18 @@ public function padImage(
 
         // Only add padding if there is a difference in size
         if ($paddedImageWidth !== $imageWidth || $paddedImageHeight !== $imageHeight) {
-
-            $paddedPixelData = array_fill(0, $paddedImageWidth * $paddedImageHeight * $imageChannels, 0);
+            $paddedShape = [$paddedImageWidth, $paddedImageHeight, $imageChannels];
 
             if (is_array($constantValues)) {
+                $paddedPixelData = Tensor::fill($paddedShape, 0);
+
                 // Fill with constant values, cycling through the array
                 $constantValuesLength = count($constantValues);
-                for ($i = 0; $i < count($paddedPixelData); ++$i) {
-                    $paddedPixelData[$i] = $constantValues[$i % $constantValuesLength];
+                for ($i = 0; $i < $paddedPixelData->size(); ++$i) {
+                    $paddedPixelData->buffer()[$i] = $constantValues[$i % $constantValuesLength];
                 }
-            } else if ($constantValues !== 0) {
-                $paddedPixelData = array_fill(0, $paddedImageWidth * $paddedImageHeight * $imageChannels, $constantValues);
+            } else {
+                $paddedPixelData = Tensor::fill($paddedShape, $constantValues);
             }
 
             [$left, $top] = $center ?
@@ -240,7 +210,7 @@ public function padImage(
                     $d = ($b + $j) * $imageChannels;
 
                     for ($k = 0; $k < $imageChannels; ++$k) {
-                        $paddedPixelData[$c + $k] = $pixelData[$d + $k];
+                        $paddedPixelData->buffer()[$c + $k] = $imageTensor->buffer()[$d + $k];
                     }
                 }
             }
@@ -264,18 +234,17 @@ public function padImage(
 
                         // Copy channel-wise
                         for ($k = 0; $k < $imageChannels; ++$k) {
-                            $paddedPixelData[$c + $k] = $pixelData[$d + $k];
+                            $paddedPixelData->buffer()[$c + $k] = $imageTensor->buffer()[$d + $k];
                         }
                     }
                 }
             }
 
             // Update pixel data and image dimensions
-            $pixelData = $paddedPixelData;
-            $imgShape = [$paddedImageHeight, $paddedImageWidth, $imageChannels];
+            $imageTensor = $paddedPixelData;
         }
 
-        return [$pixelData, $imgShape];
+        return $imageTensor;
     }
 
     private function calculateReflectOffset(int $val, int $max): int
@@ -284,17 +253,6 @@ private function calculateReflectOffset(int $val, int $max): int
         return $mod > $max ? $max - ($mod - $max) : $mod;
     }
 
-    /**
-     * Rescale the image's pixel values by the specified rescale factor.
-     * @param array $pixelData The pixel data to rescale.
-     * @return void
-     */
-    public function rescale(array &$pixelData): void
-    {
-        for ($i = 0; $i < count($pixelData); ++$i) {
-            $pixelData[$i] *= $this->rescaleFactor;
-        }
-    }
 
     /**
      * Find the target (width, height) dimension of the output image after
@@ -379,17 +337,6 @@ public function getResizeOutputImageSize(Image $image, int|array|null $size): ar
         }
     }
 
-    /**
-     * Resizes the image.
-     * @param Image $image The image to resize.
-     * @return Image The resized image.
-     */
-    public function resize(Image $image): Image
-    {
-        [$newWidth, $newHeight] = $this->getResizeOutputImageSize($image, $this->size);
-
-        return $image->resize($newWidth, $newHeight, $this->resample);
-    }
 
     /**
      * Preprocesses the given image.
@@ -417,7 +364,7 @@ public function preprocess(
         }
 
 
-        [$srcWidth, $srcHeight] = $image->size(); // original image size
+        $originalInputSize = $image->size(); // original image size
 
         // Convert image to RGB if specified in config.
         if ($doConvertRGB ?? $this->doConvertRGB) {
@@ -426,14 +373,16 @@ public function preprocess(
             $image->grayscale();
         }
 
-        // Resize all images
+        // Resize if specified in config.
         if ($this->doResize) {
-            $this->resize($image);
+            [$newWidth, $newHeight] = $this->getResizeOutputImageSize($image, $this->size);
+
+            $image->resize($newWidth, $newHeight, $this->resample);
         }
 
         // Resize the image using thumbnail method.
         if ($this->doThumbnail) {
-            $this->thumbnail($image, $this->size, $this->resample);
+            $image->thumbnail($this->size['width'], $this->size['height'], $this->resample);
         }
 
         if ($this->doCenterCrop) {
@@ -449,59 +398,59 @@ public function preprocess(
             $image->centerCrop($cropWidth, $cropHeight);
         }
 
-        $reshapedInputSize = [$image->height(), $image->width()];
-
-        // All pixel-level manipulation occurs with data in the hwc format (height, width, channels),
-        // to emulate the behavior of the original Python code (w/ numpy).
-        $pixelData = $image->pixelData();
+        $reshapedInputSize = $image->size();
 
-        $imgShape = [$image->height(), $image->width(), $image->channels];
+        $imageTensor = $image->toTensor();
 
         if ($this->doRescale) {
-            $this->rescale($pixelData);
+            $imageTensor = $imageTensor->multiply($this->rescaleFactor);
         }
 
-
         if ($doNormalize ?? $this->doNormalize) {
-            $imageMean = $this->imageMean;
-            if (!is_array($this->imageMean)) {
-                $imageMean = array_fill(0, $image->channels, $this->imageMean);
+            if (is_array($this->imageMean)) {
+                // Negate the mean values so they can be added instead of subtracted
+                $negatedMean = array_map(fn($mean) => -$mean, $this->imageMean);
+                $imageMean = Tensor::repeat($negatedMean, $image->height() * $image->width(), 1);
+            } else {
+                $imageMean = Tensor::fill([$image->channels * $image->height() * $image->width()], -$this->imageMean);
             }
 
-            $imageStd = $this->imageStd;
-            if (!is_array($this->imageStd)) {
-                $imageStd = array_fill(0, $image->channels, $this->imageMean);
+
+            if (is_array($this->imageStd)) {
+                // Invert the standard deviation values so we can multiply instead of divide
+                $inversedStd = array_map(fn($std) => 1 / $std, $this->imageStd);
+                $imageStd = Tensor::repeat($inversedStd, $image->height() * $image->width(), 1);
+            } else {
+                $imageStd = Tensor::fill([$image->channels * $image->height() * $image->width()], 1 / $this->imageStd);
             }
 
+
+            // Reshape mean and std to match the image tensor shape
+            $imageMean = $imageMean->reshape($imageTensor->shape());
+            $imageStd = $imageStd->reshape($imageTensor->shape());
+
             if (count($imageMean) !== $image->channels || count($imageStd) !== $image->channels) {
-                throw new \Exception("When set to arrays, the length of `imageMean` (" . count($imageMean) . ") and `imageStd` (" . count($imageStd) . ") must match the number of channels in the image ({$image->channels()}).");
+                throw new \Exception("When set to arrays, the length of `imageMean` (" . count($imageMean) . ") and `imageStd` (" . count($imageStd) . ") must match the number of channels in the image ({$image->channels}).");
             }
 
             // Normalize pixel data
-            for ($i = 0; $i < count($pixelData); $i += $image->channels) {
-                for ($j = 0; $j < $image->channels; ++$j) {
-                    $pixelData[$i + $j] = ($pixelData[$i + $j] - $imageMean[$j]) / $imageStd[$j];
-                }
-            }
+            $imageTensor = $imageTensor->add($imageMean)->multiply($imageStd);
         }
 
         // Perform padding after rescaling/normalizing
         if ($doPad ?? $this->doPad) {
             if ($this->padSize !== null) {
-                [$pixelData, $imgShape] = $this->padImage($pixelData, $imgShape, $this->padSize);
+                $imageTensor = $this->padImage($imageTensor, $this->padSize);
             } elseif ($this->sizeDivisibility !== null) {
-                [$paddedWidth, $paddedHeight] = $this->enforceSizeDivisibility([$imgShape[1], $imgShape[0]], $this->sizeDivisibility);
-                [$pixelData, $imgShape] = $this->padImage($pixelData, $imgShape, ['width' => $paddedWidth, 'height' => $paddedHeight]);
+                [$paddedWidth, $paddedHeight] = $this->enforceSizeDivisibility([$imageTensor->shape()[1], $imageTensor->shape()[0]], $this->sizeDivisibility);
+                $imageTensor = $this->padImage($imageTensor, ['width' => $paddedWidth, 'height' => $paddedHeight]);
             }
         }
 
-        // Convert to channel dimension format (hwc -> chw)
-        $pixelValues = Tensor::fromArray($pixelData, Tensor::float32, $imgShape)->permute(2, 0, 1);
-
         return [
-            'original_size' => [$srcHeight, $srcWidth],
+            'original_size' => $originalInputSize,
             'reshaped_input_size' => $reshapedInputSize,
-            'pixel_values' => $pixelValues,
+            'pixel_values' => $imageTensor,
         ];
     }
 
diff --git a/src/FeatureExtractors/Swin2SRImageProcessor.php b/src/FeatureExtractors/Swin2SRImageProcessor.php
@@ -5,20 +5,21 @@
 
 namespace Codewithkyrian\Transformers\FeatureExtractors;
 
+use Codewithkyrian\Transformers\Utils\Tensor;
+
 class Swin2SRImageProcessor extends ImageFeatureExtractor
 {
     public function padImage(
-        array     $pixelData,
-        array     $imgShape,
+        Tensor    $imageTensor,
         int|array $padSize,
         string    $mode = 'constant',
         bool      $center = false,
         int       $constantValues = 0
-    ): array
+    ): Tensor
     {
         // NOTE: In this case, `padSize` represents the size of the sliding window for the local attention.
         // In other words, the image is padded so that its width and height are multiples of `padSize`.
-        [$imageHeight, $imageWidth, $imageChannels] = $imgShape;
+        [$imageHeight, $imageWidth, $imageChannels] = $imageTensor->shape();
 
         // NOTE: For Swin2SR models, the original python implementation adds padding even when the image's width/height is already
         // a multiple of `pad_size`. However, this is most likely a bug (PR: https://github.com/mv-lab/swin2sr/pull/19).
@@ -28,6 +29,6 @@ public function padImage(
             'height' => $imageHeight + ($padSize - $imageHeight % $padSize) % $padSize,
         ];
 
-        return parent::padImage($pixelData, $imgShape, $padSize, 'symmetric', false, -1);
+        return parent::padImage($imageTensor, $padSize, 'symmetric', false, -1);
     }
 }
diff --git a/src/Utils/Image.php b/src/Utils/Image.php