Return old tensor slice for dimensions > 3

CodeWithKyrian · CodeWithKyrian · commit 0f9610aeee09 · 2024-08-01T22:59:16.000+01:00
diff --git a/examples/pipelines/asr.php b/examples/pipelines/asr.php
@@ -20,9 +20,9 @@
 $audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
 $audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
 $audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
-$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
+//$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
 
 $streamer = StdOutStreamer::make();
-$output = $transcriber($audioUrl, maxNewTokens: 256, returnTimestamps: 'word');
+$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30, strideLengthSecs: 6);
 
 dd($output, timeUsage(), memoryUsage());
diff --git a/src/Models/Pretrained/WhisperForConditionalGeneration.php b/src/Models/Pretrained/WhisperForConditionalGeneration.php
@@ -14,6 +14,7 @@
 use Codewithkyrian\Transformers\Utils\AutoConfig;
 use Codewithkyrian\Transformers\Utils\GenerationConfig;
 use Exception;
+use InvalidArgumentException;
 
 class WhisperForConditionalGeneration extends WhisperPretrainedModel
 {
@@ -66,7 +67,6 @@ public function generate(
         }
 
 
-
         if (isset($generationConfig['return_token_timestamps'])) {
             $generationConfig['output_attentions'] = true;
             $generationConfig['return_dict_in_generate'] = true;
@@ -109,12 +109,13 @@ public function generate(
      * @throws Exception If the model outputs do not contain cross attentions
      */
     public function extractTokenTimestamps(
-        array $generateOutputs,
-        array $alignmentHeads,
+        array          $generateOutputs,
+        array          $alignmentHeads,
         int|float|null $numFrames = null,
-        float $timePrecision = 0.02
-    ): Tensor {
-        $numFrames = (int) $numFrames;
+        float          $timePrecision = 0.02
+    ): Tensor
+    {
+        $numFrames = (int)$numFrames;
         if (!isset($generateOutputs['cross_attentions'])) {
             throw new Exception(
                 "Model outputs must contain cross attentions to extract timestamps. " .
@@ -128,7 +129,7 @@ public function extractTokenTimestamps(
             $medianFilterWidth = 7;
         }
 
-        $batchedMatrices = array_map(function($batch) use ($numFrames, $alignmentHeads, $medianFilterWidth) {
+        $batchedMatrices = array_map(function ($batch) use ($numFrames, $alignmentHeads, $medianFilterWidth) {
             // Create a list with `decoder_layers` elements, each a tensor of shape
             // (batch size, attention_heads, output length, input length).
             /** @var Tensor[] $crossAttentions */
@@ -137,61 +138,61 @@ public function extractTokenTimestamps(
                 $crossAttentions[] = Tensor::concat(array_map(fn($x) => $x[$i], $batch), 2);
             }
 
-            $weights = Tensor::stack(array_map(function($alignmentHead) use ($crossAttentions, $numFrames) {
+            $weights = Tensor::stack(array_map(function ($alignmentHead) use ($crossAttentions, $numFrames) {
                 [$l, $h] = $alignmentHead;
                 return $numFrames
-                    ? $crossAttentions[$l]->slice(null, $h, null, [0, $numFrames])
-                    : $crossAttentions[$l]->slice(null, $h);
+                    ? $crossAttentions[$l]->slice(null, $h, null, [0, $numFrames])->squeeze(1)
+                    : $crossAttentions[$l]->slice(null, $h)->squeeze(1); // experimental
             }, $alignmentHeads));
-            dd($weights->shape());
-
-            $weights = $weights->permute( 1, 0, 2, 3);
 
+            $weights = $weights->permute(1, 0, 2, 3);
 
-            list($std, $calculatedMean) = std_mean($weights, -2, 0, true);
+            [$std, $calculatedMean] = $weights->stdMean(-2, 0, true);
 
             // Normalize and smoothen the weights.
-            $smoothedWeights = $weights->clone(); // [1, 8, seqLength, 1500]
+            $smoothedWeights = clone $weights; // [1, 8, seqLength, 1500]
 
-            for ($a = 0; $a < $smoothedWeights->dims[0]; ++$a) {
+            for ($a = 0; $a < $smoothedWeights->shape()[0]; ++$a) {
                 $aTensor = $smoothedWeights[$a]; // [8, seqLength, 1500]
 
-                for ($b = 0; $b < $aTensor->dims[0]; ++$b) {
+                for ($b = 0; $b < $aTensor->shape()[0]; ++$b) {
                     $bTensor = $aTensor[$b]; // [seqLength, 1500]
 
                     $stdTensor = $std[$a][$b][0]; // [1500]
                     $meanTensor = $calculatedMean[$a][$b][0]; // [1500]
 
-                    for ($c = 0; $c < $bTensor->dims[0]; ++$c) {
+                    for ($c = 0; $c < $bTensor->shape()[0]; ++$c) {
+                        /** @var Tensor $cTensor */
                         $cTensor = $bTensor[$c]; // [1500]
-                        for ($d = 0; $d < count($cTensor->data); ++$d) {
-                            $cTensor->data[$d] = ($cTensor->data[$d] - $meanTensor->data[$d]) / $stdTensor->data[$d];
-                        }
+//                        for ($d = 0; $d < count($cTensor->buffer()); ++$d) {
+//                            $cTensor->buffer()[$d] = ($cTensor->buffer()[$d] - $meanTensor->buffer()[$d]) / $stdTensor->buffer()[$d];
+//                        }
+                        $cTensor = $cTensor->add($meanTensor->multiply(-1))->multiply($stdTensor->reciprocal());
 
                         // Apply median filter.
-                        $cTensor->data = medianFilter($cTensor->data, $medianFilterWidth);
+                        $cTensor = $this->medianFilter($cTensor, $medianFilterWidth);
                     }
                 }
             }
 
             // Average the different cross-attention heads.
-            $matrix = mean($smoothedWeights, 1);
-            return $matrix;
+            return $smoothedWeights->mean(1);
         }, $generateOutputs['cross_attentions']);
 
         $timestampsShape = [count($generateOutputs['sequences']), count($generateOutputs['sequences'][0])];
 
+
         $timestamps = new Tensor(null, Tensor::float32, $timestampsShape);
 
         // Perform dynamic time warping on each element of the batch.
         for ($batchIdx = 0; $batchIdx < $timestampsShape[0]; ++$batchIdx) {
             // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions
             // as the python implementation
-            $matrix = $batchedMatrices[$batchIdx]->neg()->squeeze_(0);
+            $matrix = $batchedMatrices[$batchIdx]->multiply(-1)->squeeze(0);
             list($textIndices, $timeIndices) = dynamicTimeWarping($matrix);
 
             $diffs = array_map(fn($i) => $textIndices[$i + 1] - $textIndices[$i], range(0, count($textIndices) - 2));
-            $jumps = array_map(fn($x) => (bool) $x, array_merge([1], $diffs));
+            $jumps = array_map(fn($x) => (bool)$x, array_merge([1], $diffs));
 
             $jumpTimes = [];
             for ($i = 0; $i < count($jumps); ++$i) {
@@ -206,4 +207,36 @@ public function extractTokenTimestamps(
         return $timestamps;
     }
 
+    /**
+     * Applies a one-dimensional median filter to a tensor, reflecting the signal at both
+     * ends so that every window is full.
+     *
+     * Elements are read from the tensor's underlying buffer starting at the tensor's offset,
+     * so a tensor that is a view into a larger buffer is filtered from the right position.
+     *
+     * @param Tensor $tensor The tensor to filter; `count($tensor)` elements are processed.
+     * @param int $windowSize The width of the sliding window. Must be a positive odd number.
+     *
+     * @return Tensor A new tensor built from the filtered values, with the dtype of `$tensor`.
+     *
+     * @throws InvalidArgumentException If the window size is even or not positive.
+     */
+    public function medianFilter(Tensor $tensor, int $windowSize): Tensor
+    {
+        if ($windowSize % 2 === 0 || $windowSize <= 0) {
+            throw new InvalidArgumentException('Window size must be a positive odd number');
+        }
+
+        $length = count($tensor);
+        $data = $tensor->buffer();
+        $offset = $tensor->offset();
+        $halfWindowSize = intdiv($windowSize, 2);
+
+        // Reflection repeats with this period. Folding by it keeps every index inside the
+        // signal even when the window is wider than the signal itself (a single reflection
+        // step would run past the other end in that case).
+        $period = max(2 * ($length - 1), 1);
+
+        $outputArray = array_fill(0, $length, 0);
+        $window = array_fill(0, $windowSize, 0);
+
+        for ($i = 0; $i < $length; ++$i) {
+            $valuesIndex = 0;
+
+            for ($j = -$halfWindowSize; $j <= $halfWindowSize; ++$j) {
+                // Normalise into [0, $period) first; PHP's % keeps the sign of the dividend.
+                $index = (($i + $j) % $period + $period) % $period;
+                if ($index >= $length) {
+                    $index = 2 * ($length - 1) - $index;
+                }
+
+                $window[$valuesIndex++] = $data[$offset + $index];
+            }
+
+            // The window is fully overwritten on every iteration, so sorting in place is safe.
+            sort($window);
+            $outputArray[$i] = $window[$halfWindowSize];
+        }
+
+        return Tensor::fromArray($outputArray, $tensor->dtype());
+    }
+
 }
diff --git a/src/Tensor/Tensor.php b/src/Tensor/Tensor.php
@@ -727,6 +727,15 @@ public function transpose(): self
         return new static($ndArray->buffer(), $ndArray->dtype(), $ndArray->shape(), $ndArray->offset());
     }
 
+    /**
+     * Returns the result of the linear-algebra backend's `reciprocal` operation on this
+     * tensor, wrapped in a new tensor instance.
+     *
+     * NOTE(review): confirm whether the backend's `reciprocal` works in place on its
+     * argument; if it does, `$this` is modified as well and should be copied first.
+     *
+     * @return static
+     */
+    public function reciprocal(): self
+    {
+        $inverted = self::mo()->la()->reciprocal($this);
+
+        return new static(
+            $inverted->buffer(),
+            $inverted->dtype(),
+            $inverted->shape(),
+            $inverted->offset()
+        );
+    }
+    }
+
     /**
      * Performs `L_p` normalization of inputs over specified dimension.
      *
@@ -907,6 +916,74 @@ public function mean(?int $axis = null, bool $keepShape = false): static|float|i
         return $mean;
     }
 
+    /**
+     * Calculates the standard deviation and mean over the dimension specified by `$axis`,
+     * or over every element when `$axis` is `null`.
+     *
+     * @param int|null $axis The dimension to reduce. If `null`, reduces over all dimensions.
+     * @param int $correction Value subtracted from the element count in the variance
+     *                        denominator (`N - $correction`). Default is 1.
+     * @param bool $keepShape Whether to keep the reduced dimension (with size 1) or drop it.
+     *                        Only applies when `$axis` is given.
+     *
+     * @return array{0: Tensor, 1: Tensor} The standard deviation and the mean, in that order.
+     */
+    public function stdMean(?int $axis = null, int $correction = 1, bool $keepShape = false): array
+    {
+        $mo = self::mo();
+
+        if ($axis === null) {
+            $mean = $mo->mean($this);
+            $std = sqrt($mo->sum($mo->la()->pow($mo->la()->sub($this, $mean), 2)) / ($this->size() - $correction));
+
+            // Same [std, mean] order as the per-axis result below, so callers can always
+            // destructure the return value as [$std, $mean].
+            return [
+                Tensor::fromArray([$std], $this->dtype()),
+                Tensor::fromArray([$mean], $this->dtype()),
+            ];
+        }
+
+        $axis = $this->safeIndex($axis, $this->ndim());
+
+        $mean = $mo->mean($this, $axis);
+        $meanBuffer = $mean->buffer();
+        $meanOffset = $mean->offset();
+
+        $shape = $this->shape();
+        $rank = $this->ndim();
+
+        $resultShape = $shape;
+        $resultShape[$axis] = 1;
+
+        $total = $this->size();
+        $offset = $this->offset();
+
+        $result = $this->zeros([intdiv($total, $shape[$axis])], $this->dtype());
+
+        // Accumulate the squared deviation of every element into the slot of the reduced
+        // result it belongs to.
+        for ($i = 0; $i < $total; ++$i) {
+            $resultIndex = 0;
+            $num = $i;
+            $resultMultiplier = 1;
+
+            // Decompose the flat index into per-dimension indices (last dimension first)
+            // and rebuild the flat index of the result, skipping the reduced axis.
+            for ($j = $rank - 1; $j >= 0; --$j) {
+                $size = $shape[$j];
+
+                if ($j !== $axis) {
+                    $resultIndex += ($num % $size) * $resultMultiplier;
+                    $resultMultiplier *= $resultShape[$j];
+                }
+
+                // Integer division keeps the running index an int.
+                $num = intdiv($num, $size);
+            }
+
+            $deviation = $this->buffer[$offset + $i] - $meanBuffer[$meanOffset + $resultIndex];
+            $result->buffer[$resultIndex] += $deviation ** 2;
+        }
+
+        $denominator = $shape[$axis] - $correction;
+        $resultSize = count($result->buffer);
+
+        for ($i = 0; $i < $resultSize; ++$i) {
+            $result->buffer[$i] = sqrt($result->buffer[$i] / $denominator);
+        }
+
+        if (!$keepShape) {
+            array_splice($resultShape, $axis, 1);
+        }
+
+        return [
+            new static($result->buffer(), $result->dtype(), $resultShape, $result->offset()),
+            new static($mean->buffer(), $mean->dtype(), $resultShape, $mean->offset()),
+        ];
+    }
+
+
     /**
      * Perform mean pooling of the tensor followed by a normalization step.
      *
@@ -977,7 +1054,7 @@ public function slice(...$slices): Tensor
                 $slice = $this->safeIndex($slice, $this->shape()[$sliceIndex], $sliceIndex);
 
                 $start[] = $slice;
-                $size[] =  1;
+                $size[] = 1;
 
             } elseif (is_array($slice) && count($slice) === 2) {
                 // An array of length 2 means take a range of elements
@@ -993,7 +1070,29 @@ public function slice(...$slices): Tensor
             }
         }
 
-        return $this->sliceWithBounds($start, $size);
+        if (count($size) <= 3) {
+            return $this->sliceWithBounds($start, $size);
+        }
+
+        // The sliceWithBounds method only supports up to 3 dimensions,
+        // so we need to slice manually for higher dimensions
+        $newShape = $size;
+        $newBufferSize = array_product($size);
+
+        $buffer = self::newBuffer($newBufferSize, $this->dtype());
+        $stride = $this->stride();
+
+        for ($i = 0; $i < $newBufferSize; ++$i) {
+            $originalIndex = 0;
+            for ($j = count($newShape) - 1, $num = $i; $j >= 0; --$j) {
+                $size = $newShape[$j];
+                $originalIndex += (($num % $size) + $start[$j]) * $stride[$j];
+                $num = floor($num / $size);
+            }
+            $buffer[$i] = $this->buffer[$originalIndex];
+        }
+
+        return new Tensor($buffer, $this->dtype(), $newShape, $this->offset());
     }
 
     /**