Separate AutoModel class for CTC and bug fixes for WhisperTokenizer

CodeWithKyrian · CodeWithKyrian · commit cbfd75899f3a · 2024-08-01T22:59:16.000+01:00
diff --git a/examples/pipelines/asr.php b/examples/pipelines/asr.php
@@ -11,7 +11,9 @@
 
 ini_set('memory_limit', '-1');
 
-$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
+//$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
+$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-base');
+//$transcriber = pipeline('automatic-speech-recognition', 'Xenova/wav2vec2-large-xlsr-53-english');
 
 $audioUrl = __DIR__ . '/../sounds/kyrian-dev.wav';
 //$audioUrl = __DIR__ . '/../sounds/jfk.wav';
@@ -24,6 +26,6 @@
 //$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
 
 $streamer = StdOutStreamer::make();
-$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 18,  returnTimestamps: 'word');
+$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 20, returnTimestamps: 'word');
 
 dd($output, timeUsage(), memoryUsage());
diff --git a/src/FeatureExtractors/Wav2Vec2FeatureExtractor.php b/src/FeatureExtractors/Wav2Vec2FeatureExtractor.php
@@ -0,0 +1,50 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\FeatureExtractors;
+
+use Codewithkyrian\Transformers\Tensor\Tensor;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
+
+/**
+ * Prepares a raw audio waveform as model input: optional zero-mean/unit-variance
+ * normalization, then a batch dimension of one plus an all-ones attention mask.
+ */
+class Wav2Vec2FeatureExtractor extends FeatureExtractor
+{
+    /**
+     * Extracts features from a given audio using the provided configuration.
+     *
+     * @param Tensor $waveform The audio tensor to extract features from; it is read as a flat run of samples.
+     * @return array{input_values: Tensor, attention_mask: Tensor} Both entries have shape [1, sample count].
+     */
+    public function __invoke(Tensor $waveform): array
+    {
+        // Hoisted so the element count is not re-queried on every loop iteration.
+        $sampleCount = $waveform->size();
+
+        // Zero-mean and unit-variance normalization.
+        if ($this->config['do_normalize']) {
+            $mean = $waveform->mean();
+
+            // Population variance: mean of the squared deviations from the mean.
+            $variance = 0;
+            for ($i = 0; $i < $sampleCount; $i++) {
+                $variance += pow($waveform[$i] - $mean, 2);
+            }
+            $variance /= $sampleCount;
+
+            // The 1e-7 term keeps the scale factor finite when the variance is zero.
+            $waveform = $waveform->add(-$mean)->multiply(1.0 / sqrt($variance + 1e-7));
+        }
+
+        $shape = [1, $sampleCount];
+
+        return [
+            'input_values' => $waveform->reshape($shape),
+            'attention_mask' => Tensor::ones($shape, dtype: Tensor::int64),
+        ];
+    }
+}
diff --git a/src/Models/Auto/AutoModel.php b/src/Models/Auto/AutoModel.php
@@ -21,6 +21,7 @@ class AutoModel extends PretrainedMixin
         "siglip" => \Codewithkyrian\Transformers\Models\Pretrained\SigLipModel::class,
 
         "audio-spectrogram-transformer" => \Codewithkyrian\Transformers\Models\Pretrained\ASTModel::class,
+        "wav2vec2" => \Codewithkyrian\Transformers\Models\Pretrained\Wav2Vec2Model::class,
 
         'detr' => \Codewithkyrian\Transformers\Models\Pretrained\DETRModel::class,
         'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YOLOSModel::class,
diff --git a/src/Models/Auto/AutoModelForAudioClassification.php b/src/Models/Auto/AutoModelForAudioClassification.php
@@ -9,6 +9,7 @@ class AutoModelForAudioClassification  extends PretrainedMixin
 {
     const MODEL_CLASS_MAPPING = [
         'audio-spectrogram-transformer' => \Codewithkyrian\Transformers\Models\Pretrained\ASTForAudioClassification::class,
+        'wav2vec2' => \Codewithkyrian\Transformers\Models\Pretrained\Wav2Vec2ForSequenceClassification::class,
     ];
 
     const MODEL_CLASS_MAPPINGS = [
diff --git a/src/Models/Auto/AutoModelForCTC.php b/src/Models/Auto/AutoModelForCTC.php
@@ -0,0 +1,21 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Auto;
+
+/**
+ * Auto class listing the model classes available for CTC
+ * (Connectionist Temporal Classification) heads.
+ */
+class AutoModelForCTC extends PretrainedMixin
+{
+    /** Model type => concrete class carrying a CTC head. */
+    const MODEL_CLASS_MAPPING = [
+        'wav2vec2' => \Codewithkyrian\Transformers\Models\Pretrained\Wav2Vec2ForCTC::class,
+    ];
+
+    /** Every mapping table this auto class exposes (only the one above). */
+    const MODEL_CLASS_MAPPINGS = [self::MODEL_CLASS_MAPPING];
+}
diff --git a/src/Models/Auto/PretrainedMixin.php b/src/Models/Auto/PretrainedMixin.php
@@ -104,6 +104,7 @@ protected static function getModelArchitecture($modelClass): ModelArchitecture
             in_array($modelClass, AutoModelForImageClassification::MODEL_CLASS_MAPPING) => ModelArchitecture::EncoderOnly,
             in_array($modelClass, AutoModelForAudioClassification::MODEL_CLASS_MAPPING) => ModelArchitecture::EncoderOnly,
             in_array($modelClass, AutoModelForSpeechSeq2Seq::MODEL_CLASS_MAPPING) => ModelArchitecture::Seq2SeqLM,
+            in_array($modelClass, AutoModelForCTC::MODEL_CLASS_MAPPING) => ModelArchitecture::EncoderOnly,
 
             default => ModelArchitecture::EncoderOnly,
         };
diff --git a/src/Models/Output/CasualLMOutput.php b/src/Models/Output/CasualLMOutput.php
@@ -0,0 +1,37 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Output;
+
+use Codewithkyrian\Transformers\Tensor\Tensor;
+
+/**
+ * Immutable model output that carries only a logits tensor.
+ *
+ * NOTE(review): "Casual" looks like a misspelling of "Causal"; the name is kept
+ * because the class is already imported elsewhere under this spelling.
+ */
+class CasualLMOutput implements ModelOutput
+{
+    /** Logits produced by the model. */
+    public readonly Tensor $logits;
+
+    public function __construct(Tensor $logits)
+    {
+        $this->logits = $logits;
+    }
+
+    /**
+     * Builds the output object from a raw output array.
+     *
+     * @param array $array Raw model outputs; must contain a Tensor under the 'logits' key.
+     */
+    public static function fromOutput(array $array): self
+    {
+        $logits = $array['logits'];
+
+        return new self($logits);
+    }
+}
diff --git a/src/Models/Pretrained/Wav2Vec2ForAudioFrameClassification.php b/src/Models/Pretrained/Wav2Vec2ForAudioFrameClassification.php
@@ -0,0 +1,27 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+use Codewithkyrian\Transformers\Models\Output\ModelOutput;
+use Codewithkyrian\Transformers\Models\Output\TokenClassifierOutput;
+
+/**
+ * Wav2Vec2 Model with a frame classification head on top for tasks like Speaker Diarization.
+ */
+class Wav2Vec2ForAudioFrameClassification extends Wav2Vec2PretrainedModel
+{
+    /**
+     * Runs the parent model and wraps its raw outputs in a TokenClassifierOutput.
+     *
+     * @param array $modelInputs Inputs forwarded unchanged to the parent model.
+     */
+    public function __invoke(array $modelInputs): array|ModelOutput
+    {
+        $rawOutputs = parent::__invoke($modelInputs);
+
+        return TokenClassifierOutput::fromOutput($rawOutputs);
+    }
+}
diff --git a/src/Models/Pretrained/Wav2Vec2ForCTC.php b/src/Models/Pretrained/Wav2Vec2ForCTC.php
@@ -0,0 +1,17 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+use Codewithkyrian\Transformers\Models\Output\CasualLMOutput;
+use Codewithkyrian\Transformers\Models\Output\ModelOutput;
+
+class Wav2Vec2ForCTC extends Wav2Vec2PretrainedModel
+{
+    // No __invoke() override: calls fall through to the parent class, so the
+    // result keeps whatever form the parent's __invoke() returns.
+    // NOTE(review): an override wrapping that result in CasualLMOutput was left
+    // disabled here, which is why the imports above are currently unused.
+}
diff --git a/src/Models/Pretrained/Wav2Vec2ForSequenceClassification.php b/src/Models/Pretrained/Wav2Vec2ForSequenceClassification.php
@@ -0,0 +1,27 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+use Codewithkyrian\Transformers\Models\Output\ModelOutput;
+use Codewithkyrian\Transformers\Models\Output\SequenceClassifierOutput;
+
+/**
+ * Wav2Vec2 variant whose outputs are returned as a SequenceClassifierOutput.
+ */
+class Wav2Vec2ForSequenceClassification extends Wav2Vec2PretrainedModel
+{
+    /**
+     * Runs the parent model and wraps its raw outputs in a SequenceClassifierOutput.
+     *
+     * @param array $modelInputs Inputs forwarded unchanged to the parent model.
+     */
+    public function __invoke(array $modelInputs): array|ModelOutput
+    {
+        $rawOutputs = parent::__invoke($modelInputs);
+
+        return SequenceClassifierOutput::fromOutput($rawOutputs);
+    }
+}
diff --git a/src/Models/Pretrained/Wav2Vec2Model.php b/src/Models/Pretrained/Wav2Vec2Model.php
@@ -0,0 +1,37 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+/**
+ * The bare Wav2Vec2 Model transformer outputting raw hidden-states without any specific head on top.
+ *
+ * **Example:** Load and run a `Wav2Vec2Model` for feature extraction.
+ *
+ * ```php
+ *
+ * // Read and preprocess audio
+ * $processor = AutoProcessor::fromPretrained('Xenova/mms-300m');
+ * $audio = Audio::read('https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac');
+ * $audioTensor = $audio->toTensor(samplerate: 16000);
+ * $inputs = $processor($audioTensor);
+ *
+ * // Run model with inputs
+ * $model = AutoModel::fromPretrained('Xenova/mms-300m');
+ * $output = $model($inputs);
+ * // {
+ * //   last_hidden_state: Tensor {
+ * //     shape: [ 1, 1144, 1024 ],
+ * //     dtype: 'float32',
+ * //     buffer: (1171456) [ ... ],
+ * //     size: 1171456
+ * //   }
+ * // }
+ * ```
+ */
+class Wav2Vec2Model extends Wav2Vec2PretrainedModel
+{
+
+}
diff --git a/src/Models/Pretrained/Wav2Vec2PretrainedModel.php b/src/Models/Pretrained/Wav2Vec2PretrainedModel.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+class Wav2Vec2PretrainedModel extends PretrainedModel
+{
+    // Empty body: common parent of the Wav2Vec2* model classes; everything is inherited from PretrainedModel.
+}
diff --git a/src/Models/Pretrained/WhisperForConditionalGeneration.php b/src/Models/Pretrained/WhisperForConditionalGeneration.php
@@ -90,7 +90,7 @@ public function generate(
             $outputs['token_timestamps'] = $this->extractTokenTimestamps(
                 $outputs,
                 $generationConfig['alignment_heads'],
-                $generationConfig['num_frames'] ?? null,
+                (int)$generationConfig['num_frames'] ?? null,
             );
         }
 
@@ -111,11 +111,10 @@ public function generate(
     public function extractTokenTimestamps(
         array          $generateOutputs,
         array          $alignmentHeads,
-        int|float|null $numFrames = null,
+        int|null $numFrames = null,
         float          $timePrecision = 0.02
     ): Tensor
     {
-        $numFrames = (int)$numFrames;
         if (!isset($generateOutputs['cross_attentions'])) {
             throw new Exception(
                 "Model outputs must contain cross attentions to extract timestamps. " .
@@ -145,9 +144,7 @@ public function extractTokenTimestamps(
                     : $crossAttentions[$l]->slice(null, $h); // experimental
             }, $alignmentHeads));
 
-            $weights = $weights
-                ->squeeze(1)
-                ->permute(1, 0, 2, 3);
+            $weights = $weights->squeeze(1)->permute(1, 0, 2, 3);
 
             [$std, $calculatedMean] = $weights->stdMean(-2, 0, true);
 
@@ -157,11 +154,9 @@ public function extractTokenTimestamps(
             for ($a = 0; $a < $smoothedWeights->shape()[0]; ++$a) {
                 $aTensor = $smoothedWeights[$a]; // [8, seqLength, 1500]
 
-
                 for ($b = 0; $b < $aTensor->shape()[0]; ++$b) {
                     $bTensor = $aTensor[$b]; // [seqLength, 1500]
 
-
                     $stdTensor = $std[$a][$b][0]; // [1500]
                     $meanTensor = $calculatedMean[$a][$b][0]; // [1500]
 
@@ -187,7 +182,7 @@ public function extractTokenTimestamps(
         $timestampsShape = [count($generateOutputs['sequences']), count($generateOutputs['sequences'][0])];
 
 
-        $timestamps = new Tensor(null, Tensor::float32, $timestampsShape);
+        $timestamps = Tensor::zeros($timestampsShape, Tensor::float32);
 
         // Perform dynamic time warping on each element of the batch.
         for ($batchIdx = 0; $batchIdx < $timestampsShape[0]; ++$batchIdx) {
@@ -199,14 +194,15 @@ public function extractTokenTimestamps(
             $diffs = array_map(fn($i) => $textIndices[$i + 1] - $textIndices[$i], range(0, count($textIndices) - 2));
             $jumps = array_map(fn($x) => (bool)$x, array_merge([1], $diffs));
 
+            dd($timeIndices);
             $jumpTimes = [];
             for ($i = 0; $i < count($jumps); ++$i) {
                 if ($jumps[$i]) {
                     $jumpTimes[] = $timeIndices[$i] * $timePrecision;
                 }
             }
-//            $timestamps->buffer()[$batchIdx] = array_merge([0.0], $jumpTimes);
-            for ($i = 0; $i < count($jumpTimes); ++$i) {
+            dd($jumpTimes);
+            for ($i = 1; $i < count($jumpTimes); ++$i) {
                 $timestamps[$batchIdx][$i] = $jumpTimes[$i];
             }
         }
@@ -248,25 +244,25 @@ function medianFilter(Tensor $tensor, int $windowSize): Tensor
 
     private function dynamicTimeWarping(Tensor $tensor): array
     {
-        [$rows, $cols] = $tensor->shape();
+        [$outputLength, $inputLength] = $tensor->shape();
 
-        $outputShape = [$rows + 1, $cols + 1];
+        $outputShape = [$outputLength + 1, $inputLength + 1];
 
-        $cost = Tensor::fill($outputShape, -INF, Tensor::float32);
+        $cost = Tensor::fill($outputShape, INF, Tensor::float32);
         $traceback = Tensor::fill($outputShape, -1, Tensor::int32);
 
         $cost[0][0] = 0;
 
-        for ($i = 1; $i < $rows + 1; ++$i) {
-            for ($j = 1; $j < $cols + 1; ++$j) {
+        for ($j = 1; $j < $inputLength + 1; ++$j) {
+            for ($i = 1; $i < $outputLength + 1; ++$i) {
                 $c0 = $cost[$i - 1][$j - 1];
                 $c1 = $cost[$i - 1][$j];
                 $c2 = $cost[$i][$j - 1];
 
-                if ($c0 <= $c1 && $c0 <= $c2) {
+                if ($c0 < $c1 && $c0 < $c2) {
                     $c = $c0;
                     $t = 0;
-                } else if ($c1 <= $c0 && $c1 <= $c2) {
+                } else if ($c1 < $c0 && $c1 < $c2) {
                     $c = $c1;
                     $t = 1;
                 } else {
@@ -280,15 +276,15 @@ private function dynamicTimeWarping(Tensor $tensor): array
         }
 
         // Traceback
-        $i = $rows;
-        $j = $cols;
+        $i = $outputLength;
+        $j = $inputLength;
 
         for ($k = 0; $k < $outputShape[1]; ++$k) {
-            $traceback[0][$k] = 2;
+            $traceback[0][$k] = 2; // trace[0, :] = 2
         }
 
         for ($k = 0; $k < $outputShape[0]; ++$k) {
-            $traceback[$k][0] = 1;
+            $traceback[$k][0] = 1; // trace[:, 0] = 1
         }
 
         $textIndices = [];
@@ -310,7 +306,6 @@ private function dynamicTimeWarping(Tensor $tensor): array
             }
         }
 
-
         $textIndices = array_reverse($textIndices);
         $timeIndices = array_reverse($timeIndices);
 
diff --git a/src/Pipelines/AudioClassificationPipeline.php b/src/Pipelines/AudioClassificationPipeline.php
@@ -55,8 +55,8 @@ public function __invoke(array|string $inputs, ...$args): array
             $audio = Audio::read($input);
             $audioTensor = $audio->toTensor(samplerate: $sampleRate);
 
-            $inputs = ($this->processor)($audioTensor);
-            $outputs = ($this->model)($inputs);
+            $processedInputs = ($this->processor)($audioTensor);
+            $outputs = ($this->model)($processedInputs);
 
             $logits = $outputs['logits'][0];
 
diff --git a/src/Pipelines/AutomaticSpeechRecognitionPipeline.php b/src/Pipelines/AutomaticSpeechRecognitionPipeline.php
diff --git a/src/Pipelines/Task.php b/src/Pipelines/Task.php
diff --git a/src/PretrainedTokenizers/PretrainedTokenizer.php b/src/PretrainedTokenizers/PretrainedTokenizer.php
diff --git a/src/PretrainedTokenizers/Wav2Vec2CTCTokenizer.php b/src/PretrainedTokenizers/Wav2Vec2CTCTokenizer.php
diff --git a/src/PretrainedTokenizers/WhisperTokenizer.php b/src/PretrainedTokenizers/WhisperTokenizer.php
diff --git a/src/Processors/Wav2Vec2ProcessorWithLM.php b/src/Processors/Wav2Vec2ProcessorWithLM.php
diff --git a/src/Tensor/Tensor.php b/src/Tensor/Tensor.php

Original file line number	Diff line number	Diff line change
`@@ -9,6 +9,7 @@ class AutoModelForAudioClassification extends PretrainedMixin`
`9`	`9`	`{`
`10`	`10`	`const MODEL_CLASS_MAPPING = [`
`11`	`11`	`'audio-spectrogram-transformer' => \Codewithkyrian\Transformers\Models\Pretrained\ASTForAudioClassification::class,`
	`12`	`+ 'wav2vec2' => \Codewithkyrian\Transformers\Models\Pretrained\Wav2Vec2ForSequenceClassification::class,`
`12`	`13`	`];`
`13`	`14`
`14`	`15`	`const MODEL_CLASS_MAPPINGS = [`