fix: Window and stride calculation error for whisper

CodeWithKyrian · CodeWithKyrian · commit 241f9b0e0078 · 2024-08-01T23:02:02.000+01:00
diff --git a/examples/pipelines/asr.php b/examples/pipelines/asr.php
@@ -18,23 +18,18 @@
 
 $audioUrl = __DIR__ . '/../sounds/kyrian-dev.wav';
 $audioUrl = __DIR__ . '/../sounds/jfk.wav';
-//$audioUrl = __DIR__ . '/../sounds/preamble.wav';
-//$audioUrl = __DIR__ . '/../sounds/taunt.wav';
-//$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
-//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
+$audioUrl = __DIR__ . '/../sounds/preamble.wav';
+$audioUrl = __DIR__ . '/../sounds/taunt.wav';
+$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
+$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
 $audioUrl = __DIR__ . '/../sounds/ted_60.wav';
 //$audioUrl = __DIR__ . '/../sounds/french-audio.wav';
 
-$streamer = WhisperTextStreamer::make()
-//    ->onTimestampStart(fn($time) => print("$time: "))
-//    ->onTimestampEnd(fn($time) => print("\n"))
-;
 
 $output = $transcriber($audioUrl,
     maxNewTokens: 256,
     chunkLengthSecs: 24,
 //    returnTimestamps: true,
-    streamer: $streamer
 );
 
-//dd($output, timeUsage(), memoryUsage());
+dd($output, timeUsage(), memoryUsage());
diff --git a/examples/pipelines/text-generation.php b/examples/pipelines/text-generation.php
@@ -11,12 +11,12 @@
 use function Codewithkyrian\Transformers\Utils\timeUsage;
 
 ini_set('memory_limit', -1);
-//
+
 //$generator = pipeline('text-generation', 'Xenova/gpt2');
 //$generator = pipeline('text-generation', 'Xenova/Qwen1.5-0.5B-Chat');
 $generator = pipeline('text-generation', 'Xenova/TinyLlama-1.1B-Chat-v1.0');
 
-$streamer = TextStreamer::make();
+$streamer = TextStreamer::make()->shouldSkipPrompt();
 
 $messages = [
     ['role' => 'system', 'content' => 'You are a helpful assistant.'],
@@ -36,14 +36,14 @@
 );
 
 //$generator = pipeline('text-generation', 'Xenova/codegen-350M-mono');
-//$streamer = StdOutStreamer::make();
-//
+//$streamer = TextStreamer::make();
+
 //$output = $generator(
 //    'def fib(n):',
 //    streamer: $streamer,
 //    maxNewTokens: 100,
 //    doSample: true,
-//    returnFullText: false,
+//    returnFullText: true,
 //);
-//
+
 dd($output[0]['generated_text'], timeUsage(), memoryUsage());
diff --git a/src/Generation/Streamers/Streamer.php b/src/Generation/Streamers/Streamer.php
@@ -13,7 +13,7 @@
 abstract class Streamer
 {
     protected array $promptTokens = [];
-    protected bool $skipPrompt;
+    protected bool $skipPrompt = false;
     protected bool $nextTokensArePrompt;
 
     protected PretrainedTokenizer $tokenizer;
diff --git a/src/Pipelines/AutomaticSpeechRecognitionPipeline.php b/src/Pipelines/AutomaticSpeechRecognitionPipeline.php
@@ -11,7 +11,7 @@
 use Codewithkyrian\Transformers\Utils\GenerationConfig;
 use Codewithkyrian\Transformers\Utils\Image;
 use function Codewithkyrian\Transformers\Utils\array_pop_key;
-use function Codewithkyrian\Transformers\Utils\array_to_snake_case;
+use function Codewithkyrian\Transformers\Utils\array_keys_to_snake_case;
 
 /**
  * Pipeline that aims at extracting spoken text contained within some audio.
@@ -111,11 +111,13 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
         $task = array_pop_key($args, 'task');
         $streamer = array_pop_key($args, 'streamer');
 
-        if (!is_null($streamer) && !is_a($streamer, WhisperTextStreamer::class)) {
-            throw new \InvalidArgumentException('`streamer` must be an instance of `WhisperTextStreamer`');
-        }
+//        if (!is_null($streamer) && !is_a($streamer, WhisperTextStreamer::class)) {
+//            throw new \InvalidArgumentException('`streamer` must be an instance of `WhisperTextStreamer`');
+//        }
+
+        if (!is_null($streamer)) trigger_error('`streamer` is not supported yet for Whisper', E_USER_WARNING);
 
-        $kwargs = array_to_snake_case($args);
+        $kwargs = array_keys_to_snake_case($args);
 
         $generationConfig = new GenerationConfig($kwargs);
 
@@ -139,14 +141,12 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
         $timePrecision = $this->processor->featureExtractor->config['chunk_length'] / $this->model->config['max_source_positions'];
         $hopLength = $this->processor->featureExtractor->config['hop_length'];
         $samplingRate = $this->processor->featureExtractor->config['sampling_rate'];
-        $timestampBegin = $this->tokenizer->tokenizer->convertTokensToIds(["<|notimestamps|>"])[0] + 1;
 
         $toReturn = [];
 
-        $streamer?->setTokenizer($this->tokenizer)
-            ?->shouldSkipPrompt(false)
-            ?->setTimePrecision($timePrecision)
-            ?->setTimestampBegin($timestampBegin);
+//        $streamer?->setTokenizer($this->tokenizer)
+//            ?->setTimePrecision($timePrecision)
+//            ?->setTimestampBegin($timestampBegin);
 
         foreach ($inputs as $input) {
             $audio = Audio::read($input);
@@ -168,9 +168,9 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
                 $offset = 0;
 
                 while ($offset < $audioTensor->size()) {
-
                     if ($offset + $window > $audioTensor->size()) {
                         $window = $audioTensor->size() - $offset;
+                        $jump = $window;
                     }
 
                     $subAudio = $audioTensor->sliceWithBounds([$offset], [$window]);
@@ -206,7 +206,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
             foreach ($chunks as &$chunk) {
                 $generationConfig['num_frames'] = (int)floor($chunk['stride'][0] / $hopLength);
 
-                $data = $this->model->generate($chunk['input_features'], generationConfig: $generationConfig, streamer: $streamer);
+                $data = $this->model->generate($chunk['input_features'], generationConfig: $generationConfig);
 
                 // TODO: Right now we only get top beam
                 if ($returnTimestamps === 'word') {
@@ -219,7 +219,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
                 // convert stride to seconds
                 $chunk['stride'] = array_map(fn($x) => $x / $samplingRate, $chunk['stride']);
 
-                $streamer?->notifyChunkEnd($chunk['stride'][0]);
+//                $streamer?->notifyChunkEnd($chunk['stride'][0]);
             }
 
             // Merge text chunks
diff --git a/src/Pipelines/Text2TextGenerationPipeline.php b/src/Pipelines/Text2TextGenerationPipeline.php
@@ -8,7 +8,7 @@
 use Codewithkyrian\Transformers\Generation\Streamers\Streamer;
 use Codewithkyrian\Transformers\Utils\GenerationConfig;
 use function Codewithkyrian\Transformers\Utils\array_pop_key;
-use function Codewithkyrian\Transformers\Utils\array_to_snake_case;
+use function Codewithkyrian\Transformers\Utils\array_keys_to_snake_case;
 
 /**
  * A pipeline for generating text using a model that performs text-to-text generation tasks.
@@ -33,7 +33,7 @@ public function __invoke(array|string $inputs, ...$args): array
         /** @var Streamer $streamer */
         $streamer = array_pop_key($args, 'streamer');
 
-        $kwargs = array_to_snake_case($args);
+        $kwargs = array_keys_to_snake_case($args);
 
         $generateKwargs = new GenerationConfig($kwargs);
 
diff --git a/src/Pipelines/TextGenerationPipeline.php b/src/Pipelines/TextGenerationPipeline.php
@@ -9,7 +9,7 @@
 use Codewithkyrian\Transformers\Utils\GenerationConfig;
 use function Codewithkyrian\Transformers\Utils\array_every;
 use function Codewithkyrian\Transformers\Utils\array_pop_key;
-use function Codewithkyrian\Transformers\Utils\array_to_snake_case;
+use function Codewithkyrian\Transformers\Utils\array_keys_to_snake_case;
 use function Codewithkyrian\Transformers\Utils\camelCaseToSnakeCase;
 use function Codewithkyrian\Transformers\Utils\timeUsage;
 
@@ -62,7 +62,7 @@ public function __invoke(array|string $inputs, ...$args): array
 
         $returnFullText = array_pop_key($args, 'returnFullText', true);
 
-        $kwargs = array_to_snake_case($args);
+        $kwargs = array_keys_to_snake_case($args);
 
         $generationConfig = new GenerationConfig($kwargs);
 
@@ -104,9 +104,7 @@ public function __invoke(array|string $inputs, ...$args): array
             truncation: true
         );
 
-        $streamer?->setTokenizer($this->tokenizer)
-            ?->shouldSkipPrompt()
-            ?->setPromptTokens($inputIds[0]->toArray());
+        $streamer?->setTokenizer($this->tokenizer)?->setPromptTokens($inputIds[0]->toArray());
 
         $outputTokenIds = $this->model->generate($inputIds,
             generationConfig: $generationConfig,
diff --git a/src/Tensor/Tensor.php b/src/Tensor/Tensor.php
@@ -1096,56 +1096,6 @@ public function slice(...$slices): Tensor
         }
 
         return new Tensor($buffer, $this->dtype(), $newShape, $this->offset());
-
-        $newTensorShape = [];
-        $newOffsets = [];
-
-        for ($sliceIndex = 0; $sliceIndex < $this->ndim(); ++$sliceIndex) {
-            $slice = $slices[$sliceIndex] ?? null;
-
-            if ($slice === null) {
-                $newOffsets[] = [0, $this->shape()[$sliceIndex]];
-                $newTensorShape[] = $this->shape()[$sliceIndex];
-
-            } elseif (is_int($slice)) {
-                $slice = $this->safeIndex($slice, $this->shape()[$sliceIndex], $sliceIndex);
-                $newOffsets[] = [$slice, $slice + 1];
-
-            } elseif (is_array($slice) && count($slice) === 2) {
-                if ($slice[0] > $slice[1]) {
-                    throw new Exception("Invalid slice: " . json_encode($slice));
-                }
-                $offsets = [
-                    max($slice[0], 0),
-                    min($slice[1], $this->shape()[$sliceIndex])
-                ];
-                $newOffsets[] = $offsets;
-                $newTensorShape[] = $offsets[1] - $offsets[0];
-
-            } else {
-                throw new Exception("Invalid slice: " . json_encode($slice));
-            }
-        }
-
-        $newShape = array_map(fn($offsets) => $offsets[1] - $offsets[0], $newOffsets);
-
-        $newBufferSize = array_reduce($newShape, fn($a, $b) => $a * $b, 1);
-
-        $buffer = self::newBuffer($newBufferSize, $this->dtype());
-        $stride = $this->stride();
-
-        for ($i = 0; $i < $newBufferSize; ++$i) {
-            $originalIndex = 0;
-            for ($j = count($newShape) - 1, $num = $i; $j >= 0; --$j) {
-                $size = $newShape[$j];
-                $originalIndex += (($num % $size) + $newOffsets[$j][0]) * $stride[$j];
-                $num = floor($num / $size);
-            }
-            $buffer[$i] = $this->buffer[$originalIndex];
-        }
-
-        return new Tensor($buffer, $this->dtype(), $newTensorShape, $this->offset());
-
     }
 
     /**
diff --git a/src/Utils/Helpers.php b/src/Utils/Helpers.php
@@ -71,7 +71,7 @@ function array_pop_key(array &$array, string|int $key, mixed $default = null)
     return $default;
 }
 
-function array_to_snake_case(array $array): array
+function array_keys_to_snake_case(array $array): array
 {
     $snakeCasedArray = [];
 

Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@`
`13`	`13`	`abstract class Streamer`
`14`	`14`	`{`
`15`	`15`	`protected array $promptTokens = [];`
`16`		`- protected bool $skipPrompt;`
	`16`	`+ protected bool $skipPrompt = false;`
`17`	`17`	`protected bool $nextTokensArePrompt;`
`18`	`18`
`19`	`19`	`protected PretrainedTokenizer $tokenizer;`
Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,7 @@ function array_pop_key(array &$array, string\|int $key, mixed $default = null)`
`71`	`71`	`return $default;`
`72`	`72`	`}`
`73`	`73`
`74`		`-function array_to_snake_case(array $array): array`
	`74`	`+function array_keys_to_snake_case(array $array): array`
`75`	`75`	`{`
`76`	`76`	`$snakeCasedArray = [];`
`77`	`77`