Skip to content

Commit 740a797

Browse files
Move thumbnail method from feature extractor to Image class
1 parent 91f82f1 commit 740a797

4 files changed

Lines changed: 160 additions & 180 deletions

File tree

examples/pipelines/text-generation.php

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,14 @@
3535
);
3636

3737
//$generator = pipeline('text-generation', 'Xenova/codegen-350M-mono');
38-
//$streamer = StdOutStreamer::make($generator->tokenizer);
38+
//$streamer = StdOutStreamer::make();
3939
//
4040
//$output = $generator(
4141
// 'def fib(n):',
4242
// streamer: $streamer,
4343
// maxNewTokens: 100,
44-
// doSample: true
44+
// doSample: true,
45+
// returnFullText: false,
4546
//);
46-
47+
//
4748
dd($output[0]['generated_text'], timeUsage(), memoryUsage());

src/FeatureExtractors/ImageFeatureExtractor.php

Lines changed: 52 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
use Codewithkyrian\Transformers\Utils\Image;
99
use Codewithkyrian\Transformers\Utils\Tensor;
1010
use Imagine\Image\Point;
11+
use Interop\Polite\Math\Matrix\NDArray;
1112
use function Codewithkyrian\Transformers\Utils\timeUsage;
1213

1314
class ImageFeatureExtractor extends FeatureExtractor
@@ -98,37 +99,6 @@ public function __construct(public array $config)
9899
}
99100
}
100101

101-
/**
102-
* Resize the image to make a thumbnail.
103-
* @param Image $image The image to be resized.
104-
* @param array{height: int, width: int} $size The size ['height' => h, 'width' => w] to resize the image to.
105-
* @param int $resample The resampling filter to use.
106-
* @return Image The resized image.
107-
*/
108-
public function thumbnail(Image $image, array $size, int $resample = 2): Image
109-
{
110-
$inputHeight = $image->height();
111-
$inputWidth = $image->width();
112-
113-
$outputHeight = $size['height'];
114-
$outputWidth = $size['width'];
115-
116-
// We always resize to the smallest of either the input or output size.
117-
$height = min($inputHeight, $outputHeight);
118-
$width = min($inputWidth, $outputWidth);
119-
120-
if ($height === $inputHeight && $width === $inputWidth) {
121-
return $image;
122-
}
123-
124-
if ($inputHeight > $inputWidth) {
125-
$width = floor($inputWidth * $height / $inputHeight);
126-
} elseif ($inputWidth > $inputHeight) {
127-
$height = floor($inputHeight * $width / $inputWidth);
128-
}
129-
130-
return $image->resize($width, $height, $resample);
131-
}
132102

133103
/**
134104
* Crops the margin of the image. Gray pixels are considered margin (i.e., pixels with a value below the threshold).
@@ -184,24 +154,23 @@ public function cropMargin(Image $image, int $grayThreshold = 200): static
184154

185155
/**
186156
* Pad the image by a certain amount.
187-
* @param array $pixelData The pixel data to pad.
188-
* @param int[] $imgShape The dimensions of the image (height, width, channels).
157+
* @param Tensor $imageTensor The pixel data to pad.
189158
* @param int[]|int $padSize The dimensions of the padded image.
190159
* @param string $mode The type of padding to add.
191160
* @param bool $center Whether to center the image.
192161
* @param int $constantValues The constant value to use for padding.
193-
* @return array{0: array, 1: int[]} The padded pixel data and image dimensions.
162+
* @return Tensor The padded image tensor.
163+
* @throws \Exception
194164
*/
195165
public function padImage(
196-
array $pixelData,
197-
array $imgShape,
166+
Tensor $imageTensor,
198167
int|array $padSize,
199168
string $mode = 'constant',
200169
bool $center = false,
201170
int $constantValues = 0
202-
): array
171+
): Tensor
203172
{
204-
[$imageHeight, $imageWidth, $imageChannels] = $imgShape;
173+
[$imageHeight, $imageWidth, $imageChannels] = $imageTensor->shape();
205174

206175
if (is_array($padSize)) {
207176
$paddedImageWidth = $padSize['width'];
@@ -213,17 +182,18 @@ public function padImage(
213182

214183
// Only add padding if there is a difference in size
215184
if ($paddedImageWidth !== $imageWidth || $paddedImageHeight !== $imageHeight) {
216-
217-
$paddedPixelData = array_fill(0, $paddedImageWidth * $paddedImageHeight * $imageChannels, 0);
185+
$paddedShape = [$paddedImageWidth, $paddedImageHeight, $imageChannels];
218186

219187
if (is_array($constantValues)) {
188+
$paddedPixelData = Tensor::fill($paddedShape, 0);
189+
220190
// Fill with constant values, cycling through the array
221191
$constantValuesLength = count($constantValues);
222-
for ($i = 0; $i < count($paddedPixelData); ++$i) {
223-
$paddedPixelData[$i] = $constantValues[$i % $constantValuesLength];
192+
for ($i = 0; $i < $paddedPixelData->size(); ++$i) {
193+
$paddedPixelData->buffer()[$i] = $constantValues[$i % $constantValuesLength];
224194
}
225-
} else if ($constantValues !== 0) {
226-
$paddedPixelData = array_fill(0, $paddedImageWidth * $paddedImageHeight * $imageChannels, $constantValues);
195+
} else {
196+
$paddedPixelData = Tensor::fill($paddedShape, $constantValues);
227197
}
228198

229199
[$left, $top] = $center ?
@@ -240,7 +210,7 @@ public function padImage(
240210
$d = ($b + $j) * $imageChannels;
241211

242212
for ($k = 0; $k < $imageChannels; ++$k) {
243-
$paddedPixelData[$c + $k] = $pixelData[$d + $k];
213+
$paddedPixelData->buffer()[$c + $k] = $imageTensor->buffer()[$d + $k];
244214
}
245215
}
246216
}
@@ -264,18 +234,17 @@ public function padImage(
264234

265235
// Copy channel-wise
266236
for ($k = 0; $k < $imageChannels; ++$k) {
267-
$paddedPixelData[$c + $k] = $pixelData[$d + $k];
237+
$paddedPixelData->buffer()[$c + $k] = $imageTensor->buffer()[$d + $k];
268238
}
269239
}
270240
}
271241
}
272242

273243
// Update pixel data and image dimensions
274-
$pixelData = $paddedPixelData;
275-
$imgShape = [$paddedImageHeight, $paddedImageWidth, $imageChannels];
244+
$imageTensor = $paddedPixelData;
276245
}
277246

278-
return [$pixelData, $imgShape];
247+
return $imageTensor;
279248
}
280249

281250
private function calculateReflectOffset(int $val, int $max): int
@@ -284,17 +253,6 @@ private function calculateReflectOffset(int $val, int $max): int
284253
return $mod > $max ? $max - ($mod - $max) : $mod;
285254
}
286255

287-
/**
288-
* Rescale the image's pixel values by the specified rescale factor.
289-
* @param array $pixelData The pixel data to rescale.
290-
* @return void
291-
*/
292-
public function rescale(array &$pixelData): void
293-
{
294-
for ($i = 0; $i < count($pixelData); ++$i) {
295-
$pixelData[$i] *= $this->rescaleFactor;
296-
}
297-
}
298256

299257
/**
300258
* Find the target (width, height) dimension of the output image after
@@ -379,17 +337,6 @@ public function getResizeOutputImageSize(Image $image, int|array|null $size): ar
379337
}
380338
}
381339

382-
/**
383-
* Resizes the image.
384-
* @param Image $image The image to resize.
385-
* @return Image The resized image.
386-
*/
387-
public function resize(Image $image): Image
388-
{
389-
[$newWidth, $newHeight] = $this->getResizeOutputImageSize($image, $this->size);
390-
391-
return $image->resize($newWidth, $newHeight, $this->resample);
392-
}
393340

394341
/**
395342
* Preprocesses the given image.
@@ -417,7 +364,7 @@ public function preprocess(
417364
}
418365

419366

420-
[$srcWidth, $srcHeight] = $image->size(); // original image size
367+
$originalInputSize = $image->size(); // original image size
421368

422369
// Convert image to RGB if specified in config.
423370
if ($doConvertRGB ?? $this->doConvertRGB) {
@@ -426,14 +373,16 @@ public function preprocess(
426373
$image->grayscale();
427374
}
428375

429-
// Resize all images
376+
// Resize if specified in config.
430377
if ($this->doResize) {
431-
$this->resize($image);
378+
[$newWidth, $newHeight] = $this->getResizeOutputImageSize($image, $this->size);
379+
380+
$image->resize($newWidth, $newHeight, $this->resample);
432381
}
433382

434383
// Resize the image using thumbnail method.
435384
if ($this->doThumbnail) {
436-
$this->thumbnail($image, $this->size, $this->resample);
385+
$image->thumbnail($this->size['width'], $this->size['height'], $this->resample);
437386
}
438387

439388
if ($this->doCenterCrop) {
@@ -449,59 +398,59 @@ public function preprocess(
449398
$image->centerCrop($cropWidth, $cropHeight);
450399
}
451400

452-
$reshapedInputSize = [$image->height(), $image->width()];
453-
454-
// All pixel-level manipulation occurs with data in the hwc format (height, width, channels),
455-
// to emulate the behavior of the original Python code (w/ numpy).
456-
$pixelData = $image->pixelData();
401+
$reshapedInputSize = $image->size();
457402

458-
$imgShape = [$image->height(), $image->width(), $image->channels];
403+
$imageTensor = $image->toTensor();
459404

460405
if ($this->doRescale) {
461-
$this->rescale($pixelData);
406+
$imageTensor = $imageTensor->multiply($this->rescaleFactor);
462407
}
463408

464-
465409
if ($doNormalize ?? $this->doNormalize) {
466-
$imageMean = $this->imageMean;
467-
if (!is_array($this->imageMean)) {
468-
$imageMean = array_fill(0, $image->channels, $this->imageMean);
410+
if (is_array($this->imageMean)) {
411+
// Negate the mean values to add instead of subtract
412+
$negatedMean = array_map(fn($mean) => -$mean, $this->imageMean);
413+
$imageMean = Tensor::repeat($negatedMean, $image->height() * $image->width(), 1);
414+
} else {
415+
$imageMean = Tensor::fill([$image->channels * $image->height() * $image->width()], -$this->imageMean);
469416
}
470417

471-
$imageStd = $this->imageStd;
472-
if (!is_array($this->imageStd)) {
473-
$imageStd = array_fill(0, $image->channels, $this->imageMean);
418+
419+
if (is_array($this->imageStd)) {
420+
// Invert the standard deviation values to multiply instead of divide
421+
$inversedStd = array_map(fn($std) => 1 / $std, $this->imageStd);
422+
$imageStd = Tensor::repeat($inversedStd, $image->height() * $image->width(), 1);
423+
} else {
424+
$imageStd = Tensor::fill([$image->channels * $image->height() * $image->width()], 1 / $this->imageStd);
474425
}
475426

427+
428+
// Reshape mean and std to match the image tensor shape
429+
$imageMean = $imageMean->reshape($imageTensor->shape());
430+
$imageStd = $imageStd->reshape($imageTensor->shape());
431+
476432
if (count($imageMean) !== $image->channels || count($imageStd) !== $image->channels) {
477-
throw new \Exception("When set to arrays, the length of `imageMean` (" . count($imageMean) . ") and `imageStd` (" . count($imageStd) . ") must match the number of channels in the image ({$image->channels()}).");
433+
throw new \Exception("When set to arrays, the length of `imageMean` (" . count($imageMean) . ") and `imageStd` (" . count($imageStd) . ") must match the number of channels in the image ({$image->channels}).");
478434
}
479435

480436
// Normalize pixel data
481-
for ($i = 0; $i < count($pixelData); $i += $image->channels) {
482-
for ($j = 0; $j < $image->channels; ++$j) {
483-
$pixelData[$i + $j] = ($pixelData[$i + $j] - $imageMean[$j]) / $imageStd[$j];
484-
}
485-
}
437+
$imageTensor = $imageTensor->add($imageMean)->multiply($imageStd);
486438
}
487439

488440
// Perform padding after rescaling/normalizing
489441
if ($doPad ?? $this->doPad) {
490442
if ($this->padSize !== null) {
491-
[$pixelData, $imgShape] = $this->padImage($pixelData, $imgShape, $this->padSize);
443+
$imageTensor = $this->padImage($imageTensor, $this->padSize);
492444
} elseif ($this->sizeDivisibility !== null) {
493-
[$paddedWidth, $paddedHeight] = $this->enforceSizeDivisibility([$imgShape[1], $imgShape[0]], $this->sizeDivisibility);
494-
[$pixelData, $imgShape] = $this->padImage($pixelData, $imgShape, ['width' => $paddedWidth, 'height' => $paddedHeight]);
445+
[$paddedWidth, $paddedHeight] = $this->enforceSizeDivisibility([$imageTensor->shape()[1], $imageTensor->shape()[0]], $this->sizeDivisibility);
446+
$imageTensor = $this->padImage($imageTensor, ['width' => $paddedWidth, 'height' => $paddedHeight]);
495447
}
496448
}
497449

498-
// Convert to channel dimension format (hwc -> chw)
499-
$pixelValues = Tensor::fromArray($pixelData, Tensor::float32, $imgShape)->permute(2, 0, 1);
500-
501450
return [
502-
'original_size' => [$srcHeight, $srcWidth],
451+
'original_size' => $originalInputSize,
503452
'reshaped_input_size' => $reshapedInputSize,
504-
'pixel_values' => $pixelValues,
453+
'pixel_values' => $imageTensor,
505454
];
506455
}
507456

src/FeatureExtractors/Swin2SRImageProcessor.php

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,21 @@
55

66
namespace Codewithkyrian\Transformers\FeatureExtractors;
77

8+
use Codewithkyrian\Transformers\Utils\Tensor;
9+
810
class Swin2SRImageProcessor extends ImageFeatureExtractor
911
{
1012
public function padImage(
11-
array $pixelData,
12-
array $imgShape,
13+
Tensor $imageTensor,
1314
int|array $padSize,
1415
string $mode = 'constant',
1516
bool $center = false,
1617
int $constantValues = 0
17-
): array
18+
): Tensor
1819
{
1920
// NOTE: In this case, `padSize` represents the size of the sliding window for the local attention.
2021
// In other words, the image is padded so that its width and height are multiples of `padSize`.
21-
[$imageHeight, $imageWidth, $imageChannels] = $imgShape;
22+
[$imageHeight, $imageWidth, $imageChannels] = $imageTensor->shape();
2223

2324
// NOTE: For Swin2SR models, the original python implementation adds padding even when the image's width/height is already
2425
// a multiple of `pad_size`. However, this is most likely a bug (PR: https://github.com/mv-lab/swin2sr/pull/19).
@@ -28,6 +29,6 @@ public function padImage(
2829
'height' => $imageHeight + ($padSize - $imageHeight % $padSize) % $padSize,
2930
];
3031

31-
return parent::padImage($pixelData, $imgShape, $padSize, 'symmetric', false, -1);
32+
return parent::padImage($imageTensor, $padSize, 'symmetric', false, -1);
3233
}
3334
}

0 commit comments

Comments
 (0)