Refactor Tensor slicing and unsqueeze for speed, improve generation config serialization, add new Tensor methods (sum and maximum), and improve Whisper support

CodeWithKyrian · CodeWithKyrian · commit eb342ec0dadd · 2024-08-01T22:59:16.000+01:00
diff --git a/examples/pipelines/asr.php b/examples/pipelines/asr.php
@@ -18,8 +18,11 @@
 $audioUrl = __DIR__ . '/../sounds/preamble.wav';
 $audioUrl = __DIR__ . '/../sounds/taunt.wav';
 $audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
+$audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
+$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
+$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
 
 $streamer = StdOutStreamer::make();
-$output = $transcriber($audioUrl, maxNewTokens: 256, streamer: $streamer);
+$output = $transcriber($audioUrl, maxNewTokens: 256, returnTimestamps: 'word');
 
-dd( timeUsage(), memoryUsage());
+dd($output, timeUsage(), memoryUsage());
diff --git a/src/FeatureExtractors/WhisperFeatureExtractor.php b/src/FeatureExtractors/WhisperFeatureExtractor.php
@@ -42,15 +42,13 @@ public function __invoke(Tensor $waveform): array
                 'If using a pipeline to extract transcript from a long audio clip,' .
                 'remember to specify `chunkLengthSecs` and/or `strideLengthSecs` in the pipeline options.', E_USER_WARNING);
 
-            $waveform = $waveform->slice(0, $this->config['n_samples']);
-        } else {
-            $padding = $this->config['n_samples'] - $waveform->size();
-            // create a new Tensor with the same data type as the input waveform
-            $padding = Tensor::zeros([$padding], dtype: $waveform->dtype());
+            $waveform = $waveform->sliceWithBounds([0], [$this->config['n_samples']]);
+        } else if ($waveform->size() < $this->config['n_samples']) {
+            $padLength = $this->config['n_samples'] - $waveform->size();
+            $padding = Tensor::zeros([$padLength], dtype: $waveform->dtype());
             $waveform = Tensor::concat([$waveform, $padding]);
         }
 
-        timeUsage();
         $features = Audio::spectrogram(
             $waveform,
             $this->window,
@@ -59,13 +57,15 @@ public function __invoke(Tensor $waveform): array
             power: 2.0,
             melFilters: $this->config['mel_filters'],
             logMel: 'log10',
-
             maxNumFrames: $this->config['nb_max_frames'],
         );
 
         $maxValue = $features->max();
 
-        $features->u(fn($x) => (max($x, $maxValue - 8.0) + 4.0) / 4.0);
+        $features = $features
+            ->maximum($maxValue - 8.0)
+            ->add(4.0)
+            ->multiply(1.0 / 4.0);
 
         return [
             'input_features' => $features->unsqueeze(0)
diff --git a/src/Generation/LogitsProcessors/LogitsProcessorList.php b/src/Generation/LogitsProcessors/LogitsProcessorList.php
@@ -43,11 +43,6 @@ public function extend(traversable $items): void
      */
     public function __invoke(array $inputIds, Tensor &$batchedLogits): void
     {
-//        foreach ($batchedLogits as &$logits) {
-//            foreach ($this->processors as $processor) {
-//                $processor($inputIds, $logits); // Apply processors in-place
-//            }
-//        }
         for ($i = 0; $i < count($batchedLogits); $i++) {
             foreach ($this->processors as $processor) {
                 $processor($inputIds, $batchedLogits[$i]); // Apply processors in-place
@@ -62,7 +57,6 @@ public function __invoke(array $inputIds, Tensor &$batchedLogits): void
      */
     public function getIterator(): Traversable
     {
-//        return new \ArrayIterator($this->processors);
         yield from $this->processors;
     }
 }
diff --git a/src/Generation/LogitsProcessors/WhisperTimeStampLogitsProcessor.php b/src/Generation/LogitsProcessors/WhisperTimeStampLogitsProcessor.php
@@ -45,9 +45,12 @@ public function __construct(GenerationConfig $generateConfig)
         $this->timestampBegin = $this->noTimestampsTokenId + 1;
 
         $this->beginIndex = count($generateConfig['forced_decoder_ids'] ?? []) + 2;
-        if (end($generateConfig['forced_decoder_ids'])[1] === $this->noTimestampsTokenId) {
+
+        $forcedDecoderIds = $generateConfig['forced_decoder_ids'] ?? [];
+        if (count($forcedDecoderIds) > 0 && end($forcedDecoderIds)[1] === $this->noTimestampsTokenId) {
             $this->beginIndex -= 1;
         }
+
         $this->maxInitialTimestampIndex = $generateConfig['max_initial_timestamp_index'] ?? null;
     }
 
@@ -94,10 +97,10 @@ public function __invoke(array $inputIds, Tensor $logits): Tensor
         }
 
         // if sum of probability over timestamps is above any other token, sample timestamp
-//        $logProbs = log_softmax($logitsData);
         $logProbs = $logits->softmax()->log();
-        $timestampLogProb = log(array_sum(array_map('exp', array_slice($logProbs, $this->timestampBegin))));
-        $maxTextTokenLogProb = max(array_slice($logProbs, 0, $this->timestampBegin));
+        $a = $logProbs->sliceWithBounds([0, $this->timestampBegin], [1, $logProbs->size() - $this->timestampBegin]);
+        $timestampLogProb = log($a->exp()->sum());
+        $maxTextTokenLogProb = $logProbs->sliceWithBounds([0, 0], [1, $this->timestampBegin])->max();
 
         if ($timestampLogProb > $maxTextTokenLogProb) {
             for ($i = 0; $i < $this->timestampBegin; $i++) {
diff --git a/src/Generation/Samplers/Sampler.php b/src/Generation/Samplers/Sampler.php
@@ -46,31 +46,24 @@ abstract public function sample(Tensor $logits, int $index);
      */
     public function getLogits(Tensor $logits, int $index): Tensor
     {
-        $vocabSize = $logits->shape()[$logits->ndim() - 1];
+//        $vocabSize = $logits->shape()[$logits->ndim() - 1];
 
-//        $logs = $logits->buffer()->toArray();
+//        $start = array_fill(0, $logits->ndim(), 0);
+//        $size = array_fill(0, $logits->ndim(), 1);
 //
-//        if ($index === -1) {
-//            $logs = array_slice($logs, -$vocabSize);
-//        } else {
-//            $startIndex = $index * $vocabSize;
-//            $logs = array_slice($logs, $startIndex, $startIndex + $vocabSize);
-//        }
-
-        $start = array_fill(0, $logits->ndim(), 0);
-        $size = array_fill(0, $logits->ndim(), 1);
-
-        array_splice($start, -2, replacement: [$index, 0]);
-        array_splice($size, -2, replacement: [1, $vocabSize]);
+//        array_splice($start, -2, replacement: [$index, 0]);
+//        array_splice($size, -2, replacement: [1, $vocabSize]);
+//
+//        $logs = $logits->sliceWithBounds($start, $size);
 
-        $logs = $logits->newSlice($start, $size);
+        $logits = $logits->slice($index);
 
         if ($this->generationConfig->temperature > 0) {
-            $logs = $logs->multiply(1 / $this->generationConfig->temperature);
+            $logits = $logits->multiply(1 / $this->generationConfig->temperature);
         }
 
         // Remove all dimensions of 1, leaving a flat 1D array of vocab_size
-        return $logs->squeeze();
+        return $logits->squeeze();
     }
 
     /**
diff --git a/src/Models/Pretrained/PretrainedModel.php b/src/Models/Pretrained/PretrainedModel.php
@@ -770,7 +770,6 @@ protected function getGenerationConfig(?GenerationConfig $generationConfig): Gen
             $genConfigArray = array_merge($genConfigArray, $this->generationConfig->toArray());
         }
 
-
         // Finally, use any generation config specified by the user
         // when calling `generate`
         if ($generationConfig !== null) {
diff --git a/src/Models/Pretrained/WhisperForConditionalGeneration.php b/src/Models/Pretrained/WhisperForConditionalGeneration.php
@@ -59,15 +59,19 @@ public function generate(
         // Whisper has additional options for returning timestamps
         $generationConfig['return_timestamps'] ??= false;
 
+
         if ($generationConfig['return_timestamps']) {
-            $logitsProcessor = [new WhisperTimeStampLogitsProcessor($generationConfig)];
+            $logitsProcessor = new LogitsProcessorList();
+            $logitsProcessor->push(new WhisperTimeStampLogitsProcessor($generationConfig));
         }
 
+
+
         if (isset($generationConfig['return_token_timestamps'])) {
-            $generationConfig->output_attentions = true;
-            $generationConfig->return_dict_in_generate = true;
+            $generationConfig['output_attentions'] = true;
+            $generationConfig['return_dict_in_generate'] = true;
 
-            if ($generationConfig['task'] === 'translate') {
+            if ($generationConfig['task'] ?? '' === 'translate') {
                 trigger_error("Token-level timestamps may not be reliable for task 'translate'.", E_USER_WARNING);
             }
 
@@ -79,13 +83,14 @@ public function generate(
             }
         }
 
+
         $outputs = parent::generate($inputs, $generationConfig, $logitsProcessor, $inputsAttentionMask, $streamer);
 
         if (isset($generationConfig['return_token_timestamps']) && isset($generationConfig['alignment_heads'])) {
             $outputs['token_timestamps'] = $this->extractTokenTimestamps(
                 $outputs,
                 $generationConfig['alignment_heads'],
-                $generationConfig['num_frames']
+                $generationConfig['num_frames'] ?? null,
             );
         }
 
@@ -106,9 +111,10 @@ public function generate(
     public function extractTokenTimestamps(
         array $generateOutputs,
         array $alignmentHeads,
-        ?int $numFrames = null,
+        int|float|null $numFrames = null,
         float $timePrecision = 0.02
     ): Tensor {
+        $numFrames = (int) $numFrames;
         if (!isset($generateOutputs['cross_attentions'])) {
             throw new Exception(
                 "Model outputs must contain cross attentions to extract timestamps. " .
@@ -125,18 +131,22 @@ public function extractTokenTimestamps(
         $batchedMatrices = array_map(function($batch) use ($numFrames, $alignmentHeads, $medianFilterWidth) {
             // Create a list with `decoder_layers` elements, each a tensor of shape
             // (batch size, attention_heads, output length, input length).
+            /** @var Tensor[] $crossAttentions */
             $crossAttentions = [];
             for ($i = 0; $i < $this->config['decoder_layers']; $i++) {
-                $crossAttentions[] = cat(array_map(fn($x) => $x[$i], $batch), 2);
+                $crossAttentions[] = Tensor::concat(array_map(fn($x) => $x[$i], $batch), 2);
             }
 
-            $weights = stack(array_map(function($alignmentHead) use ($crossAttentions, $numFrames) {
-                list($l, $h) = $alignmentHead;
+            $weights = Tensor::stack(array_map(function($alignmentHead) use ($crossAttentions, $numFrames) {
+                [$l, $h] = $alignmentHead;
                 return $numFrames
                     ? $crossAttentions[$l]->slice(null, $h, null, [0, $numFrames])
                     : $crossAttentions[$l]->slice(null, $h);
             }, $alignmentHeads));
-            $weights = $weights->transpose(1, 0, 2, 3);
+            dd($weights->shape());
+
+            $weights = $weights->permute( 1, 0, 2, 3);
+
 
             list($std, $calculatedMean) = std_mean($weights, -2, 0, true);
 
diff --git a/src/Pipelines/AutomaticSpeechRecognitionPipeline.php b/src/Pipelines/AutomaticSpeechRecognitionPipeline.php
@@ -105,7 +105,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
         $strideLengthSecs = $args['strideLengthSecs'] ?? null;
 
         if ($returnTimestamps == 'word') {
-            $args['returnTimestamps'] = true;
+            $args['return_token_timestamps'] = true;
         }
 
         $language = array_pop_key($args, 'language');
@@ -150,7 +150,6 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
 
             $chunks = [];
 
-
             if ($chunkLengthSecs > 0) {
 
                 if ($strideLengthSecs === null) {
@@ -164,12 +163,18 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
                 $jump = $window - 2 * $stride;
                 $offset = 0;
 
+
                 while ($offset < $audioTensor->size()) {
-                    $subAudio = $audioTensor->slice($offset, $offset + $window);
+
+                    if ($offset + $window > $audioTensor->size()) {
+                        $window = $audioTensor->size() - $offset;
+                    }
+
+                    $subAudio = $audioTensor->sliceWithBounds([$offset], [$window]);
                     $feature = ($this->processor)($subAudio);
 
                     $isFirstChunk = $offset === 0;
-                    $isLastChunk = $offset + $window >= $audioTensor->size();
+                    $isLastChunk = $offset + $jump >= $audioTensor->size();
 
                     $chunks[] = [
                         'stride' => [
@@ -194,7 +199,6 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
 
             }
 
-
             // Generate for each set of input features
             foreach ($chunks as &$chunk) {
                 $generationConfig['num_frames'] = floor($chunk['stride'][0] / $hopLength);
@@ -203,7 +207,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
                 $data = $this->model->generate($chunk['input_features'], generationConfig: $generationConfig, streamer: $streamer);
 
                 // TODO: Right now we only get top beam
-                if ($returnTimestamps == 'word') {
+                if ($returnTimestamps === 'word') {
                     $chunk['tokens'] = $data['sequences'][0];
                     $chunk['token_timestamps'] = array_map(fn($x) => round($x, 2), $data['token_timestamps'][0]);
                 } else {
diff --git a/src/PretrainedTokenizers/PretrainedTokenizer.php b/src/PretrainedTokenizers/PretrainedTokenizer.php
@@ -539,6 +539,7 @@ public function batchDecode(array $batch, bool $skipSpecialTokens = false, ?bool
     public function decode(array $tokenIds, bool $skipSpecialTokens = false, ?bool $cleanUpTokenizationSpaces = null): string
     {
         if (empty($tokenIds) || !is_int($tokenIds[0])) {
+            dd($tokenIds);
             throw new Exception("token_ids must be a non-empty array of integers.");
         }
 
diff --git a/src/PretrainedTokenizers/WhisperTokenizer.php b/src/PretrainedTokenizers/WhisperTokenizer.php
@@ -157,7 +157,7 @@ public function decodeASR(
         bool        $forceFullSequences = true
     ): array
     {
-        // Set force_full_sequences=false if you want streaming
+        // Set forceFullSequences=false if you want streaming
         // TODO add support for `returnLanguage`
 
         // Internal method meant to only be used by ASR pipeline.
@@ -323,6 +323,7 @@ public function decodeASR(
                 }
             }
 
+//            dump($this->decode($currentTokens), empty($previousTokens) ? '': $this->decode($previousTokens[0]));
             if (isset($output['stride'])) {
                 [$chunkLen, $strideLeft, $strideRight] = $output['stride'];
                 $timeOffset += $chunkLen - $strideRight;
@@ -418,9 +419,11 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
             $rightSequence = $sequences[$i];
             $max = 0.0;
             $maxIndices = [$leftLength, $leftLength, 0, 0];
+//            dd($this->decode($leftSequence), $this->decode($rightSequence));
 
             $rightLength = count($rightSequence);
             for ($j = 1; $j < $leftLength + $rightLength; ++$j) {
+                // epsilon to favor long perfect matches
                 $eps = $j / 10000.0;
                 $leftStart = max(0, $leftLength - $j);
                 $leftStop = min($leftLength, $leftLength + $rightLength - $j);
@@ -430,10 +433,13 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
                 $right = array_slice($rightSequence, $rightStart, $rightStop - $rightStart);
 
                 if (count($left) !== count($right)) {
-                    throw new Exception("There is a bug within whisper `decode_asr` function, please report it. Dropping to prevent bad inference.");
+                    throw new Exception("There is a bug within whisper `decodeASR` function, please report it. Dropping to prevent bad inference.");
                 }
 
-                $matches = count(array_filter(array_map(fn($elem, $idx) => $elem === $right[$idx], $left, array_keys($left))));
+                $matches = count(array_filter(
+                        array_map(fn($elem, $idx) => $elem === $right[$idx], $left, array_keys($left))
+                    )
+                );
 
                 $matching = $matches / $j + $eps;
                 if ($matches > 1 && $matching > $max) {
@@ -443,8 +449,8 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
             }
 
             [$leftStart, $leftStop, $rightStart, $rightStop] = $maxIndices;
-            $leftMid = intval(($leftStop + $leftStart) / 2);
-            $rightMid = intval(($rightStop + $rightStart) / 2);
+            $leftMid = (int)floor(($leftStop + $leftStart) / 2);
+            $rightMid = (int)floor(($rightStop + $rightStart) / 2);
             $totalSequence = array_merge($totalSequence, array_slice($leftSequence, 0, $leftMid));
             $leftSequence = array_slice($rightSequence, $rightMid);
             $leftLength = count($leftSequence);
diff --git a/src/Tensor/Tensor.php b/src/Tensor/Tensor.php
diff --git a/src/Utils/GenerationConfig.php b/src/Utils/GenerationConfig.php

Original file line number	Diff line number	Diff line change
`@@ -43,11 +43,6 @@ public function extend(traversable $items): void`
`43`	`43`	`*/`
`44`	`44`	`public function __invoke(array $inputIds, Tensor &$batchedLogits): void`
`45`	`45`	`{`
`46`		`-// foreach ($batchedLogits as &$logits) {`
`47`		`-// foreach ($this->processors as $processor) {`
`48`		`-// $processor($inputIds, $logits); // Apply processors in-place`
`49`		`-// }`
`50`		`-// }`
`51`	`46`	`for ($i = 0; $i < count($batchedLogits); $i++) {`
`52`	`47`	`foreach ($this->processors as $processor) {`
`53`	`48`	`$processor($inputIds, $batchedLogits[$i]); // Apply processors in-place`
`@@ -62,7 +57,6 @@ public function __invoke(array $inputIds, Tensor &$batchedLogits): void`
`62`	`57`	`*/`
`63`	`58`	`public function getIterator(): Traversable`
`64`	`59`	`{`
`65`		`-// return new \ArrayIterator($this->processors);`
`66`	`60`	`yield from $this->processors;`
`67`	`61`	`}`
`68`	`62`	`}`
Original file line number	Diff line number	Diff line change
`@@ -770,7 +770,6 @@ protected function getGenerationConfig(?GenerationConfig $generationConfig): Gen`
`770`	`770`	`$genConfigArray = array_merge($genConfigArray, $this->generationConfig->toArray());`
`771`	`771`	`}`
`772`	`772`
`773`		`-`
`774`	`773`	`// Finally, use any generation config specified by the user`
`775`	`774`	`` // when calling `generate` ``
`776`	`775`	`if ($generationConfig !== null) {`
Original file line number	Diff line number	Diff line change
`@@ -539,6 +539,7 @@ public function batchDecode(array $batch, bool $skipSpecialTokens = false, ?bool`
`539`	`539`	`public function decode(array $tokenIds, bool $skipSpecialTokens = false, ?bool $cleanUpTokenizationSpaces = null): string`
`540`	`540`	`{`
`541`	`541`	`if (empty($tokenIds) || !is_int($tokenIds[0])) {`
	`542`	`+ dd($tokenIds);`
`542`	`543`	`throw new Exception("token_ids must be a non-empty array of integers.");`
`543`	`544`	`}`
`544`	`545`