CodeWithKyrian
diff --git a/‎examples/pipelines/asr.php‎
Lines changed: 11 additions & 5 deletions b/‎examples/pipelines/asr.php‎
Lines changed: 11 additions & 5 deletions
diff --git a/‎examples/sounds/french-audio.mp3‎
97.6 KB b/‎examples/sounds/french-audio.mp3‎
97.6 KB
diff --git a/‎examples/sounds/french-audio.wav‎
538 KB b/‎examples/sounds/french-audio.wav‎
538 KB
diff --git a/‎examples/sounds/kyrian-dev.wav‎
616 KB b/‎examples/sounds/kyrian-dev.wav‎
616 KB
diff --git a/‎examples/sounds/kyrian-speaking.wav‎
4.97 MB b/‎examples/sounds/kyrian-speaking.wav‎
4.97 MB
diff --git a/‎examples/sounds/ted_60.wav‎
11 MB b/‎examples/sounds/ted_60.wav‎
11 MB
diff --git a/‎src/Generation/LogitsProcessors/ForceTokensLogitsProcessor.php‎
Lines changed: 2 additions & 3 deletions b/‎src/Generation/LogitsProcessors/ForceTokensLogitsProcessor.php‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎src/Generation/LogitsProcessors/WhisperTimeStampLogitsProcessor.php‎
Lines changed: 8 additions & 7 deletions b/‎src/Generation/LogitsProcessors/WhisperTimeStampLogitsProcessor.php‎
Lines changed: 8 additions & 7 deletions
diff --git a/‎src/Generation/Streamers/TextStreamer.php‎
Lines changed: 5 additions & 0 deletions b/‎src/Generation/Streamers/TextStreamer.php‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎src/Models/Pretrained/WhisperForConditionalGeneration.php‎
Lines changed: 41 additions & 23 deletions b/‎src/Models/Pretrained/WhisperForConditionalGeneration.php‎
Lines changed: 41 additions & 23 deletions
@@ -12,20 +12,26 @@
 ini_set('memory_limit', '-1');
 
 //$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
-$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-base');
+$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny');
+//$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-base');
 //$transcriber = pipeline('automatic-speech-recognition', 'Xenova/wav2vec2-large-xlsr-53-english');
 
 $audioUrl = __DIR__ . '/../sounds/kyrian-dev.wav';
 //$audioUrl = __DIR__ . '/../sounds/jfk.wav';
 //$audioUrl = __DIR__ . '/../sounds/preamble.wav';
 //$audioUrl = __DIR__ . '/../sounds/taunt.wav';
 //$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
-//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
 //$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
-//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking2.wav';
-//$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
+//$audioUrl = __DIR__ . '/../sounds/ted_60.wav';
+$audioUrl = __DIR__ . '/../sounds/french-audio.wav';
 
 $streamer = StdOutStreamer::make();
-$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 20, returnTimestamps: 'word');
+$output = $transcriber($audioUrl,
+    maxNewTokens: 256,
+    chunkLengthSecs: 24,
+    task: 'translate'
+//    returnTimestamps: true,
+//    streamer: $streamer
+);
 
 dd($output, timeUsage(), memoryUsage());
@@ -6,6 +6,7 @@
 namespace Codewithkyrian\Transformers\Generation\LogitsProcessors;
 
 use Codewithkyrian\Transformers\Tensor\Tensor;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
 
 class ForceTokensLogitsProcessor extends LogitsProcessor
 {
@@ -15,9 +16,7 @@ class ForceTokensLogitsProcessor extends LogitsProcessor
 
     public function __construct(array $forcedDecoderIds)
     {
-        foreach ($forcedDecoderIds[0] as $inputLength => $forcedId) {
-            $this->forceTokenMap[$inputLength] = $forcedId;
-        }
+        $this->forceTokenMap = array_column($forcedDecoderIds, 1, 0);
     }
 
     /**
 
@@ -7,6 +7,7 @@
 
 use Codewithkyrian\Transformers\Tensor\Tensor;
 use Codewithkyrian\Transformers\Utils\GenerationConfig;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
 
 class WhisperTimeStampLogitsProcessor extends LogitsProcessor
 {
@@ -40,7 +41,7 @@ class WhisperTimeStampLogitsProcessor extends LogitsProcessor
      */
     public function __construct(GenerationConfig $generateConfig)
     {
-        $this->eosTokenId = $generateConfig->eos_token_id;
+        $this->eosTokenId = $generateConfig['eos_token_id'];
         $this->noTimestampsTokenId = $generateConfig['no_timestamps_token_id'];
         $this->timestampBegin = $this->noTimestampsTokenId + 1;
 
@@ -79,11 +80,11 @@ public function __invoke(array $inputIds, Tensor $logits): Tensor
         if ($lastWasTimestamp) {
             if ($penultimateWasTimestamp) { // has to be non-timestamp
                 for ($i = $this->timestampBegin; $i < $logits->size(); $i++) {
-                    $logitsData[$i] = -INF;
+                    $logits->buffer()[$i] = -INF;
                 }
             } else { // cannot be normal text tokens
                 for ($i = 0; $i < $this->eosTokenId; $i++) {
-                    $logitsData[$i] = -INF;
+                    $logits->buffer()[$i] = -INF;
                 }
             }
         }
@@ -92,19 +93,19 @@ public function __invoke(array $inputIds, Tensor $logits): Tensor
         if (count($inputIds) === $this->beginIndex && $this->maxInitialTimestampIndex !== null) {
             $lastAllowed = $this->timestampBegin + $this->maxInitialTimestampIndex;
             for ($i = $lastAllowed + 1; $i < $logits->size(); $i++) {
-                $logitsData[$i] = -INF;
+                $logits->buffer()[$i] = -INF;
             }
         }
 
         // if sum of probability over timestamps is above any other token, sample timestamp
         $logProbs = $logits->softmax()->log();
-        $a = $logProbs->sliceWithBounds([0, $this->timestampBegin], [1, $logProbs->size() - $this->timestampBegin]);
-        $timestampLogProb = log($a->exp()->sum());
+        $timestampProbs = $logProbs->sliceWithBounds([0, $this->timestampBegin], [1, $logProbs->size() - $this->timestampBegin]);
+        $timestampLogProb = log($timestampProbs->exp()->sum());
         $maxTextTokenLogProb = $logProbs->sliceWithBounds([0, 0], [1, $this->timestampBegin])->max();
 
         if ($timestampLogProb > $maxTextTokenLogProb) {
             for ($i = 0; $i < $this->timestampBegin; $i++) {
-                $logitsData[$i] = -INF;
+                $logits->buffer()[$i] = -INF;
             }
         }
 
 
@@ -148,6 +148,11 @@ public function put(mixed $value): void
 
         $tokensToDecode = array_slice($value[0]['output_token_ids'], $this->lastDecodedCheckpointForToken);
 
+        if (empty($tokensToDecode))
+        {
+            return;
+        }
+
         $decodedText = $this->tokenizer->decode($tokensToDecode, skipSpecialTokens: true);
 
         // Check for punctuation marks indicating the end of a word or sentence
 
@@ -60,9 +60,8 @@ public function generate(
         // Whisper has additional options for returning timestamps
         $generationConfig['return_timestamps'] ??= false;
 
-
         if ($generationConfig['return_timestamps']) {
-            $logitsProcessor = new LogitsProcessorList();
+            $logitsProcessor ??= new LogitsProcessorList();
             $logitsProcessor->push(new WhisperTimeStampLogitsProcessor($generationConfig));
         }
 
@@ -83,14 +82,13 @@ public function generate(
             }
         }
 
-
-        $outputs = parent::generate($inputs, $generationConfig, $logitsProcessor, $inputsAttentionMask, $streamer);
+        $outputs = parent::generate($inputs, $generationConfig, $logitsProcessor, streamer: $streamer);
 
         if (isset($generationConfig['return_token_timestamps']) && isset($generationConfig['alignment_heads'])) {
             $outputs['token_timestamps'] = $this->extractTokenTimestamps(
                 $outputs,
                 $generationConfig['alignment_heads'],
-                (int)$generationConfig['num_frames'] ?? null,
+                $generationConfig['num_frames'] ?? null,
             );
         }
 
@@ -109,10 +107,10 @@ public function generate(
      * @throws Exception If the model outputs do not contain cross attentions
      */
     public function extractTokenTimestamps(
-        array          $generateOutputs,
-        array          $alignmentHeads,
+        array    $generateOutputs,
+        array    $alignmentHeads,
         int|null $numFrames = null,
-        float          $timePrecision = 0.02
+        float    $timePrecision = 0.02
     ): Tensor
     {
         if (!isset($generateOutputs['cross_attentions'])) {
@@ -128,6 +126,7 @@ public function extractTokenTimestamps(
             $medianFilterWidth = 7;
         }
 
+
         $batchedMatrices = array_map(function ($batch) use ($numFrames, $alignmentHeads, $medianFilterWidth) {
             // Create a list with `decoder_layers` elements, each a tensor of shape
             // (batch size, attention_heads, output length, input length).
@@ -164,13 +163,18 @@ public function extractTokenTimestamps(
                         /** @var Tensor $cTensor */
                         $cTensor = $bTensor[$c]; // [1500]
 
-                        $cTensor
-                            ->add($meanTensor->multiply(-1))
-                            ->multiply($stdTensor->reciprocal())
-                            ->copyTo($cTensor);
+                        for ($d = 0; $d < $cTensor->count(); ++$d) {
+                            $cTensor[$d] = ($cTensor[$d] - $meanTensor[$d]) / $stdTensor[$d];
+                        }
 
                         // Apply median filter.
                         $this->medianFilter($cTensor, $medianFilterWidth)->copyTo($cTensor);
+//                        $filtered = $this->medianFilter($cTensor, $medianFilterWidth);
+//                        for ($e = 0; $e < $filtered->count(); ++$e) {
+//                            $cTensor[$e] = $filtered[$e];
+//                        }
+
+
                     }
                 }
             }
@@ -181,7 +185,6 @@ public function extractTokenTimestamps(
 
         $timestampsShape = [count($generateOutputs['sequences']), count($generateOutputs['sequences'][0])];
 
-
         $timestamps = Tensor::zeros($timestampsShape, Tensor::float32);
 
         // Perform dynamic time warping on each element of the batch.
@@ -194,14 +197,13 @@ public function extractTokenTimestamps(
             $diffs = array_map(fn($i) => $textIndices[$i + 1] - $textIndices[$i], range(0, count($textIndices) - 2));
             $jumps = array_map(fn($x) => (bool)$x, array_merge([1], $diffs));
 
-            dd($timeIndices);
             $jumpTimes = [];
             for ($i = 0; $i < count($jumps); ++$i) {
                 if ($jumps[$i]) {
                     $jumpTimes[] = $timeIndices[$i] * $timePrecision;
                 }
             }
-            dd($jumpTimes);
+
             for ($i = 1; $i < count($jumpTimes); ++$i) {
                 $timestamps[$batchIdx][$i] = $jumpTimes[$i];
             }
@@ -210,38 +212,54 @@ public function extractTokenTimestamps(
         return $timestamps;
     }
 
-    function medianFilter(Tensor $tensor, int $windowSize): Tensor
+    /**
+     * Applies a median filter of width `$windowSize` along the last dimension of the input.
+     *
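+     * NOTE(review): originally documented as 3- or 4-dimensional, but the body indexes `$input` as a flat sequence (`count($input)`, `$input[$index]`) and the caller passes a single row — confirm the supported rank.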
+     * @param Tensor $input
+     * @param int $windowSize
+     * @return Tensor
+     */
+    function medianFilter(Tensor $input, int $windowSize): Tensor
     {
         if ($windowSize % 2 === 0 || $windowSize <= 0) {
             throw new InvalidArgumentException('Window size must be a positive odd number');
         }
 
-        $outputArray = array_fill(0, count($tensor), 0);
+        $output = Tensor::fill($input->shape(), 0, $input->dtype());
         $buffer = array_fill(0, $windowSize, 0);
 
         $halfWindowSize = (int)floor($windowSize / 2);
 
-        for ($i = 0; $i < count($tensor); ++$i) {
+        for ($i = 0; $i < count($input); ++$i) {
             $valuesIndex = 0;
 
             for ($j = -$halfWindowSize; $j <= $halfWindowSize; ++$j) {
                 $index = $i + $j;
                 if ($index < 0) {
                     $index = abs($index);
-                } else if ($index >= count($tensor)) {
-                    $index = 2 * (count($tensor) - 1) - $index;
+                } else if ($index >= count($input)) {
+                    $index = 2 * (count($input) - 1) - $index;
                 }
 
-                $buffer[$valuesIndex++] = $tensor->buffer()[$index];
+                $buffer[$valuesIndex++] = $input[$index];
             }
 
             sort($buffer);
-            $outputArray[$i] = $buffer[$halfWindowSize];
+
+            $output->buffer()[$i] = $buffer[$halfWindowSize];
         }
 
-        return Tensor::fromArray($outputArray, $tensor->dtype());
+        return $output;
     }
 
+    /**
+     * Measures similarity between two temporal sequences (the input audio and the
+     * output tokens) using dynamic time warping. Used to generate token-level
+     * timestamps.
+     * @param Tensor $tensor
+     * @return array
+     */
     private function dynamicTimeWarping(Tensor $tensor): array
     {
         [$outputLength, $inputLength] = $tensor->shape();
Original file line number	Diff line number	Diff line change
`@@ -6,6 +6,7 @@`
`6`	`6`	`namespace Codewithkyrian\Transformers\Generation\LogitsProcessors;`
`7`	`7`
`8`	`8`	`use Codewithkyrian\Transformers\Tensor\Tensor;`
	`9`	`+use function Codewithkyrian\Transformers\Utils\timeUsage;`
`9`	`10`
`10`	`11`	`class ForceTokensLogitsProcessor extends LogitsProcessor`
`11`	`12`	`{`
`@@ -15,9 +16,7 @@ class ForceTokensLogitsProcessor extends LogitsProcessor`
`15`	`16`
`16`	`17`	`public function __construct(array $forcedDecoderIds)`
`17`	`18`	`{`
`18`		`- foreach ($forcedDecoderIds[0] as $inputLength => $forcedId) {`
`19`		`- $this->forceTokenMap[$inputLength] = $forcedId;`
`20`		`- }`
	`19`	`+ $this->forceTokenMap = array_column($forcedDecoderIds, 1, 0);`
`21`	`20`	`}`
`22`	`21`
`23`	`22`	`/**`
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@`
`7`	`7`
`8`	`8`	`use Codewithkyrian\Transformers\Tensor\Tensor;`
`9`	`9`	`use Codewithkyrian\Transformers\Utils\GenerationConfig;`
	`10`	`+use function Codewithkyrian\Transformers\Utils\timeUsage;`
`10`	`11`
`11`	`12`	`class WhisperTimeStampLogitsProcessor extends LogitsProcessor`
`12`	`13`	`{`
`@@ -40,7 +41,7 @@ class WhisperTimeStampLogitsProcessor extends LogitsProcessor`
`40`	`41`	`*/`
`41`	`42`	`public function __construct(GenerationConfig $generateConfig)`
`42`	`43`	`{`
`43`		`- $this->eosTokenId = $generateConfig->eos_token_id;`
	`44`	`+ $this->eosTokenId = $generateConfig['eos_token_id'];`
`44`	`45`	`$this->noTimestampsTokenId = $generateConfig['no_timestamps_token_id'];`
`45`	`46`	`$this->timestampBegin = $this->noTimestampsTokenId + 1;`
`46`	`47`
`@@ -79,11 +80,11 @@ public function __invoke(array $inputIds, Tensor $logits): Tensor`
`79`	`80`	`if ($lastWasTimestamp) {`
`80`	`81`	`if ($penultimateWasTimestamp) { // has to be non-timestamp`
`81`	`82`	`for ($i = $this->timestampBegin; $i < $logits->size(); $i++) {`
`82`		`- $logitsData[$i] = -INF;`
	`83`	`+ $logits->buffer()[$i] = -INF;`
`83`	`84`	`}`
`84`	`85`	`} else { // cannot be normal text tokens`
`85`	`86`	`for ($i = 0; $i < $this->eosTokenId; $i++) {`
`86`		`- $logitsData[$i] = -INF;`
	`87`	`+ $logits->buffer()[$i] = -INF;`
`87`	`88`	`}`
`88`	`89`	`}`
`89`	`90`	`}`
`@@ -92,19 +93,19 @@ public function __invoke(array $inputIds, Tensor $logits): Tensor`
`92`	`93`	`if (count($inputIds) === $this->beginIndex && $this->maxInitialTimestampIndex !== null) {`
`93`	`94`	`$lastAllowed = $this->timestampBegin + $this->maxInitialTimestampIndex;`
`94`	`95`	`for ($i = $lastAllowed + 1; $i < $logits->size(); $i++) {`
`95`		`- $logitsData[$i] = -INF;`
	`96`	`+ $logits->buffer()[$i] = -INF;`
`96`	`97`	`}`
`97`	`98`	`}`
`98`	`99`
`99`	`100`	`// if sum of probability over timestamps is above any other token, sample timestamp`
`100`	`101`	`$logProbs = $logits->softmax()->log();`
`101`		`- $a = $logProbs->sliceWithBounds([0, $this->timestampBegin], [1, $logProbs->size() - $this->timestampBegin]);`
`102`		`- $timestampLogProb = log($a->exp()->sum());`
	`102`	`+ $timestampProbs = $logProbs->sliceWithBounds([0, $this->timestampBegin], [1, $logProbs->size() - $this->timestampBegin]);`
	`103`	`+ $timestampLogProb = log($timestampProbs->exp()->sum());`
`103`	`104`	`$maxTextTokenLogProb = $logProbs->sliceWithBounds([0, 0], [1, $this->timestampBegin])->max();`
`104`	`105`
`105`	`106`	`if ($timestampLogProb > $maxTextTokenLogProb) {`
`106`	`107`	`for ($i = 0; $i < $this->timestampBegin; $i++) {`
`107`		`- $logitsData[$i] = -INF;`
	`108`	`+ $logits->buffer()[$i] = -INF;`
`108`	`109`	`}`
`109`	`110`	`}`
`110`	`111`