More improvements for returning timestamps in ASR

CodeWithKyrian · CodeWithKyrian · commit 0e4b0e26df7e · 2024-08-01T22:59:16.000+01:00
diff --git a/examples/pipelines/asr.php b/examples/pipelines/asr.php
@@ -23,6 +23,6 @@
 //$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
 
 $streamer = StdOutStreamer::make();
-$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30, strideLengthSecs: 6);
+$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30, returnTimestamps: true);
 
 dd($output, timeUsage(), memoryUsage());
diff --git a/src/Models/Pretrained/WhisperForConditionalGeneration.php b/src/Models/Pretrained/WhisperForConditionalGeneration.php
@@ -189,7 +189,7 @@ public function extractTokenTimestamps(
             // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions
             // as the python implementation
             $matrix = $batchedMatrices[$batchIdx]->multiply(-1)->squeeze(0);
-            list($textIndices, $timeIndices) = dynamicTimeWarping($matrix);
+            list($textIndices, $timeIndices) = $this->dynamicTimeWarping($matrix);
 
             $diffs = array_map(fn($i) => $textIndices[$i + 1] - $textIndices[$i], range(0, count($textIndices) - 2));
             $jumps = array_map(fn($x) => (bool)$x, array_merge([1], $diffs));
@@ -216,7 +216,7 @@ function medianFilter(Tensor $tensor, int $windowSize): Tensor
         $outputArray = array_fill(0, count($tensor), 0);
         $buffer = array_fill(0, $windowSize, 0);
 
-        $halfWindowSize = (int) floor($windowSize / 2);
+        $halfWindowSize = (int)floor($windowSize / 2);
 
         for ($i = 0; $i < count($tensor); ++$i) {
             $valuesIndex = 0;
@@ -239,4 +239,94 @@ function medianFilter(Tensor $tensor, int $windowSize): Tensor
         return Tensor::fromArray($outputArray, $tensor->dtype());
     }
 
+    /**
+     * Runs dynamic time warping over a 2-D cost matrix and returns the cheapest
+     * monotonic path from the top-left cell to the bottom-right cell.
+     *
+     * The visible caller passes the negated attention matrix, so minimising the
+     * accumulated cost maximises the accumulated attention along the path.
+     *
+     * @param Tensor $tensor Cost matrix of shape [rows, cols].
+     *
+     * @return array{0: int[], 1: int[]} Row ("text") indices and column ("time")
+     *                                   indices of the path, ordered start to end.
+     */
+    private function dynamicTimeWarping(Tensor $tensor): array
+    {
+        [$rows, $cols] = $tensor->shape();
+
+        $outputShape = [$rows + 1, $cols + 1];
+
+        // The recurrence below keeps the *smallest* predecessor, so every cell that is
+        // not reachable yet (in particular the padding row/column) must start at +INF.
+        // Seeding with -INF makes the padding win every comparison and turns the whole
+        // table into -INF, which yields a path that ignores the input matrix.
+        $cost = Tensor::fill($outputShape, INF, Tensor::float32);
+        $traceback = Tensor::fill($outputShape, -1, Tensor::int32);
+
+        // Only the origin is a valid starting point.
+        $cost[0][0] = 0;
+
+        for ($i = 1; $i <= $rows; ++$i) {
+            for ($j = 1; $j <= $cols; ++$j) {
+                $c0 = $cost[$i - 1][$j - 1]; // diagonal: advance row and column
+                $c1 = $cost[$i - 1][$j];     // vertical: advance row only
+                $c2 = $cost[$i][$j - 1];     // horizontal: advance column only
+
+                if ($c0 <= $c1 && $c0 <= $c2) {
+                    $c = $c0;
+                    $t = 0;
+                } elseif ($c1 <= $c0 && $c1 <= $c2) {
+                    $c = $c1;
+                    $t = 1;
+                } else {
+                    $c = $c2;
+                    $t = 2;
+                }
+
+                $cost[$i][$j] = $tensor[$i - 1][$j - 1] + $c;
+                $traceback[$i][$j] = $t;
+            }
+        }
+
+        // Traceback
+        $i = $rows;
+        $j = $cols;
+
+        // Along the padding row only column steps are recorded ...
+        for ($k = 0; $k < $outputShape[1]; ++$k) {
+            $traceback[0][$k] = 2;
+        }
+
+        // ... and along the padding column only row steps, so the walk can always reach (0, 0).
+        for ($k = 0; $k < $outputShape[0]; ++$k) {
+            $traceback[$k][0] = 1;
+        }
+
+        $textIndices = [];
+        $timeIndices = [];
+
+        while ($i > 0 || $j > 0) {
+            $textIndices[] = $i - 1;
+            $timeIndices[] = $j - 1;
+
+            $t = $traceback[$i][$j];
+
+            if ($t === 0) {
+                $i--;
+                $j--;
+            } elseif ($t === 1) {
+                $i--;
+            } else {
+                $j--;
+            }
+        }
+
+        // The path was collected end-to-start; flip it into start-to-end order.
+        $textIndices = array_reverse($textIndices);
+        $timeIndices = array_reverse($timeIndices);
+
+        return [$textIndices, $timeIndices];
+    }
+
 }
diff --git a/src/Pipelines/AutomaticSpeechRecognitionPipeline.php b/src/Pipelines/AutomaticSpeechRecognitionPipeline.php
@@ -104,7 +104,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
         $forceFullSequences = $args['forceFullSequences'] ?? false;
         $strideLengthSecs = $args['strideLengthSecs'] ?? null;
 
-        if ($returnTimestamps == 'word') {
+        if ($returnTimestamps === 'word') {
             $args['return_token_timestamps'] = true;
         }
 
@@ -209,7 +209,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
                 // TODO: Right now we only get top beam
                 if ($returnTimestamps === 'word') {
                     $chunk['tokens'] = $data['sequences'][0];
-                    $chunk['token_timestamps'] = array_map(fn($x) => round($x, 2), $data['token_timestamps'][0]);
+                    $chunk['token_timestamps'] = $data['token_timestamps'][0]->round(2);
                 } else {
                     $chunk['tokens'] = $data[0];
                 }
diff --git a/src/Tensor/Tensor.php b/src/Tensor/Tensor.php
@@ -1122,7 +1122,7 @@ public function permute(...$axes): static
     {
         $permuted = self::mo()->transpose($this, $axes);
 
-        return Tensor::fromArray($permuted);
+        return new static($permuted->buffer(), $permuted->dtype(), $permuted->shape(), $permuted->offset());
     }
 
     /**

Original file line number	Diff line number	Diff line change
`@@ -1122,7 +1122,7 @@ public function permute(...$axes): static`
`1122`	`1122`	`{`
`1123`	`1123`	`$permuted = self::mo()->transpose($this, $axes);`
`1124`	`1124`
`1125`		`- return Tensor::fromArray($permuted);`
	`1125`	`+ return new static($permuted->buffer(), $permuted->dtype(), $permuted->shape(), $permuted->offset());`
`1126`	`1126`	`}`
`1127`	`1127`
`1128`	`1128`	`/**`