Fix audio classification window data type

CodeWithKyrian · CodeWithKyrian · commit cfee9eb76595 · 2024-08-01T22:59:16.000+01:00
diff --git a/examples/pipelines/asr.php b/examples/pipelines/asr.php
@@ -16,13 +16,13 @@
 $audioUrl = __DIR__ . '/../sounds/kyrian-dev.wav';
 $audioUrl = __DIR__ . '/../sounds/jfk.wav';
 $audioUrl = __DIR__ . '/../sounds/preamble.wav';
-$audioUrl = __DIR__ . '/../sounds/taunt.wav';
-$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
-$audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
+//$audioUrl = __DIR__ . '/../sounds/taunt.wav';
+//$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
+//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
 $audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
 //$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
 
 $streamer = StdOutStreamer::make();
-$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30, returnTimestamps: true);
+$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30,  returnTimestamps: 'word');
 
 dd($output, timeUsage(), memoryUsage());
diff --git a/src/FeatureExtractors/ASTFeatureExtractor.php b/src/FeatureExtractors/ASTFeatureExtractor.php
@@ -12,7 +12,7 @@
 class ASTFeatureExtractor extends FeatureExtractor
 {
     protected array $melFilters;
-    protected \SplFixedArray $window;
+    protected Tensor $window;
     protected mixed $mean;
     protected mixed $std;
 
diff --git a/src/Pipelines/AutomaticSpeechRecognitionPipeline.php b/src/Pipelines/AutomaticSpeechRecognitionPipeline.php
@@ -228,7 +228,6 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
                 returnTimestamps: $returnTimestamps,
                 forceFullSequences: $forceFullSequences
             );
-
             $toReturn[] = ['text' => $fullText, ...$optional];
         }
 
diff --git a/src/PretrainedTokenizers/WhisperTokenizer.php b/src/PretrainedTokenizers/WhisperTokenizer.php
@@ -177,6 +177,7 @@ public function decodeASR(
         // - Lots of complexity comes from stride and timestamps
 
         $lastLanguage = null;
+
         $returnWordTimestamps = $returnTimestamps === "word";
 
         $newChunk = fn() => ["language" => null, "timestamp" => [null, null], "text" => ""];
@@ -275,11 +276,6 @@ public function decodeASR(
                     } else {
                         // This is the end of the timestamp chunk
                         if ($roundedTime !== $chunk['timestamp'][0]) {
-                            // This is a bug in timestamp token output
-                            // where we're taking the duplicate token
-                            // as a stop where it should be a start.
-                            // This is an issue in the underlying model output
-                            // Let's just skip it so it becomes de-factor a start agin
                             $chunk['timestamp'][1] = $roundedTime;
                             $previousTokens[] = $currentTokens;
 
@@ -308,6 +304,13 @@ public function decodeASR(
                             $currentTokenTimestamps = [];
                             $chunk = $newChunk();
                         }
+                        else {
+                            // This is a bug in timestamp token output
+                            // where we're taking the duplicate token
+                            // as a stop where it should be a start.
+                            // This is an issue in the underlying model output
+                            // Let's just skip it so it becomes de facto a start again
+                        }
                     }
                 } else {
                     // 4/ Regular token
@@ -419,7 +422,6 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
             $rightSequence = $sequences[$i];
             $max = 0.0;
             $maxIndices = [$leftLength, $leftLength, 0, 0];
-//            dd($this->decode($leftSequence), $this->decode($rightSequence));
 
             $rightLength = count($rightSequence);
             for ($j = 1; $j < $leftLength + $rightLength; ++$j) {
@@ -436,9 +438,8 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
                     throw new Exception("There is a bug within whisper `decodeASR` function, please report it. Dropping to prevent bad inference.");
                 }
 
-                $matches = count(array_filter(
-                        array_map(fn($elem, $idx) => $elem === $right[$idx], $left, array_keys($left))
-                    )
+                $matches = count(
+                    array_filter($left, fn($elem, $idx) => $elem === $right[$idx], ARRAY_FILTER_USE_BOTH)
                 );
 
                 $matching = $matches / $j + $eps;
@@ -494,8 +495,6 @@ public function collateWordTimestamps($tokens, $token_timestamps, $language): ar
      * and a list of `token_id` sequences with the tokens making up each word.
      * @param array $tokens
      * @param string|null $language
-     * @param string $prependPunctuations
-     * @param string $appendPunctuations
      * @return array
      * @private
      */
@@ -573,34 +572,34 @@ private function decodeWithTimestamps(
      */
     private function splitTokensOnUnicode(array $tokens): array
     {
-        $decoded_full = $this->decode($tokens, decodeWithTimestamps: true);
+        $decodedFull = $this->decode($tokens, decodeWithTimestamps: true);
 
-        $replacement_char = "\u{FFFD}";
+        $replacementChar = "\u{FFFD}";
 
         $words = [];
-        $word_tokens = [];
-        $token_indices = [];
-        $current_tokens = [];
-        $current_indices = [];
-        $unicode_offset = 0;
+        $wordTokens = [];
+        $tokenIndices = [];
+        $currentTokens = [];
+        $currentIndices = [];
+        $unicodeOffset = 0;
 
         foreach ($tokens as $token_idx => $token) {
-            $current_tokens[] = $token;
-            $current_indices[] = $token_idx;
+            $currentTokens[] = $token;
+            $currentIndices[] = $token_idx;
 
-            $decoded = $this->decode($tokens, decodeWithTimestamps: true);
+            $decoded = $this->decode($currentTokens, decodeWithTimestamps: true);
 
-            if (!str_contains($decoded, $replacement_char) || $decoded_full[$unicode_offset + strpos($decoded, $replacement_char)] === $replacement_char) {
+            if (!str_contains($decoded, $replacementChar) || $decodedFull[$unicodeOffset + strpos($decoded, $replacementChar)] === $replacementChar) {
                 $words[] = $decoded;
-                $word_tokens[] = $current_tokens;
-                $token_indices[] = $current_indices;
-                $current_tokens = [];
-                $current_indices = [];
-                $unicode_offset += strlen($decoded);
+                $wordTokens[] = $currentTokens;
+                $tokenIndices[] = $currentIndices;
+                $currentTokens = [];
+                $currentIndices = [];
+                $unicodeOffset += strlen($decoded);
             }
         }
 
-        return [$words, $word_tokens, $token_indices];
+        return [$words, $wordTokens, $tokenIndices];
     }
 
     /**
@@ -617,7 +616,9 @@ private function splitTokensOnSpaces(array $tokens): array
         $word_tokens = [];
         $token_indices = [];
 
-        $punctuationRegex = '/^\p{P}+$/u';
+//        $punctuationRegex = '/^\p{P}+$/u';
+        $punctuationRegex = '\p{P}\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E';
+        $punctuationRegex = "/\s+|([$punctuationRegex])+/u";
 
         foreach ($subwords as $i => $subword) {
             $subwordTokens = $subwordTokensList[$i];
diff --git a/src/Tensor/TensorBuffer.php b/src/Tensor/TensorBuffer.php
@@ -210,4 +210,9 @@ public function load(string $string): void
 
         FFI::memcpy($this->data, $string, $strlen);
     }
+
+    public function __clone()
+    {
+        $this->data = clone $this->data;
+    }
 }

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@`
`12`	`12`	`class ASTFeatureExtractor extends FeatureExtractor`
`13`	`13`	`{`
`14`	`14`	`protected array $melFilters;`
`15`		`- protected \SplFixedArray $window;`
	`15`	`+ protected Tensor $window;`
`16`	`16`	`protected mixed $mean;`
`17`	`17`	`protected mixed $std;`
`18`	`18`
Original file line number	Diff line number	Diff line change
`@@ -228,7 +228,6 @@ private function __invokeWhisper(array\|string $inputs, ...$args): array\|Tensor\|I`
`228`	`228`	`returnTimestamps: $returnTimestamps,`
`229`	`229`	`forceFullSequences: $forceFullSequences`
`230`	`230`	`);`
`231`		`-`
`232`	`231`	`$toReturn[] = ['text' => $fullText, ...$optional];`
`233`	`232`	`}`
`234`	`233`
Original file line number	Diff line number	Diff line change
`@@ -210,4 +210,9 @@ public function load(string $string): void`
`210`	`210`
`211`	`211`	`FFI::memcpy($this->data, $string, $strlen);`
`212`	`212`	`}`
	`213`	`+`
	`214`	`+ public function __clone()`
	`215`	`+ {`
	`216`	`+ $this->data = clone $this->data;`
	`217`	`+ }`
`213`	`218`	`}`