Fix some errors for Whisper and add a new copyTo method for Tensor

CodeWithKyrian · CodeWithKyrian · commit 124c7f084d5f · 2024-08-01T22:59:16.000+01:00
diff --git a/examples/pipelines/asr.php b/examples/pipelines/asr.php
@@ -14,15 +14,16 @@
 $transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
 
 $audioUrl = __DIR__ . '/../sounds/kyrian-dev.wav';
-$audioUrl = __DIR__ . '/../sounds/jfk.wav';
-$audioUrl = __DIR__ . '/../sounds/preamble.wav';
+//$audioUrl = __DIR__ . '/../sounds/jfk.wav';
+//$audioUrl = __DIR__ . '/../sounds/preamble.wav';
 //$audioUrl = __DIR__ . '/../sounds/taunt.wav';
 //$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
 //$audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
-$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
+//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
+//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking2.wav';
 //$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
 
 $streamer = StdOutStreamer::make();
-$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30,  returnTimestamps: 'word');
+$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 18,  returnTimestamps: 'word');
 
 dd($output, timeUsage(), memoryUsage());
diff --git a/src/Models/Pretrained/WhisperForConditionalGeneration.php b/src/Models/Pretrained/WhisperForConditionalGeneration.php
@@ -141,11 +141,13 @@ public function extractTokenTimestamps(
             $weights = Tensor::stack(array_map(function ($alignmentHead) use ($crossAttentions, $numFrames) {
                 [$l, $h] = $alignmentHead;
                 return $numFrames
-                    ? $crossAttentions[$l]->slice(null, $h, null, [0, $numFrames])->squeeze(1)
-                    : $crossAttentions[$l]->slice(null, $h)->squeeze(1); // experimental
+                    ? $crossAttentions[$l]->slice(null, $h, null, [0, $numFrames])
+                    : $crossAttentions[$l]->slice(null, $h); // experimental
             }, $alignmentHeads));
 
-            $weights = $weights->permute(1, 0, 2, 3);
+            $weights = $weights
+                ->squeeze(1)
+                ->permute(1, 0, 2, 3);
 
             [$std, $calculatedMean] = $weights->stdMean(-2, 0, true);
 
@@ -155,22 +157,25 @@ public function extractTokenTimestamps(
             for ($a = 0; $a < $smoothedWeights->shape()[0]; ++$a) {
                 $aTensor = $smoothedWeights[$a]; // [8, seqLength, 1500]
 
+
                 for ($b = 0; $b < $aTensor->shape()[0]; ++$b) {
                     $bTensor = $aTensor[$b]; // [seqLength, 1500]
 
+
                     $stdTensor = $std[$a][$b][0]; // [1500]
                     $meanTensor = $calculatedMean[$a][$b][0]; // [1500]
 
                     for ($c = 0; $c < $bTensor->shape()[0]; ++$c) {
                         /** @var Tensor $cTensor */
                         $cTensor = $bTensor[$c]; // [1500]
-//                        for ($d = 0; $d < count($cTensor->buffer()); ++$d) {
-//                            $cTensor->buffer()[$d] = ($cTensor->buffer()[$d] - $meanTensor->buffer()[$d]) / $stdTensor->buffer()[$d];
-//                        }
-                        $cTensor = $cTensor->add($meanTensor->multiply(-1))->multiply($stdTensor->reciprocal());
+
+                        $cTensor
+                            ->add($meanTensor->multiply(-1))
+                            ->multiply($stdTensor->reciprocal())
+                            ->copyTo($cTensor);
 
                         // Apply median filter.
-                        $cTensor = $this->medianFilter($cTensor, $medianFilterWidth);
+                        $this->medianFilter($cTensor, $medianFilterWidth)->copyTo($cTensor);
                     }
                 }
             }
@@ -189,7 +194,7 @@ public function extractTokenTimestamps(
             // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions
             // as the python implementation
             $matrix = $batchedMatrices[$batchIdx]->multiply(-1)->squeeze(0);
-            list($textIndices, $timeIndices) = $this->dynamicTimeWarping($matrix);
+            [$textIndices, $timeIndices] = $this->dynamicTimeWarping($matrix);
 
             $diffs = array_map(fn($i) => $textIndices[$i + 1] - $textIndices[$i], range(0, count($textIndices) - 2));
             $jumps = array_map(fn($x) => (bool)$x, array_merge([1], $diffs));
@@ -198,10 +203,12 @@ public function extractTokenTimestamps(
             for ($i = 0; $i < count($jumps); ++$i) {
                 if ($jumps[$i]) {
                     $jumpTimes[] = $timeIndices[$i] * $timePrecision;
-                    // NOTE: No point in rounding here, since we set to Float32Array later
                 }
             }
-            $timestamps->buffer()[$batchIdx] = array_merge([0], $jumpTimes);
+//            $timestamps->buffer()[$batchIdx] = array_merge([0.0], $jumpTimes);
+            for ($i = 0; $i < count($jumpTimes); ++$i) {
+                $timestamps[$batchIdx][$i] = $jumpTimes[$i];
+            }
         }
 
         return $timestamps;
diff --git a/src/Pipelines/AutomaticSpeechRecognitionPipeline.php b/src/Pipelines/AutomaticSpeechRecognitionPipeline.php
@@ -160,7 +160,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
 
                 $window = $chunkLengthSecs * $samplingRate;
                 $stride = $strideLengthSecs * $samplingRate;
-                $jump = $window - 2 * $stride;
+                $jump = (int)floor($window - 2 * $stride);
                 $offset = 0;
 
 
diff --git a/src/Tensor/Tensor.php b/src/Tensor/Tensor.php
@@ -500,6 +500,11 @@ public static function zerosLike(Tensor $other): static
         return new static($ndArray->buffer(), $ndArray->dtype(), $ndArray->shape(), $ndArray->offset());
     }
 
+    public function copyTo(Tensor $other): void
+    {
+        self::mo()->la()->copy($this, $other);
+    }
+
 
     /**
      * Stack an array of tensors along a specified axis.