Skip to content

Commit cbfd758

Browse files
Separate AutoModel class for CTC and bug fixes for WhisperTokenizer
1 parent 124c7f0 commit cbfd758

21 files changed

Lines changed: 269 additions & 38 deletions

examples/pipelines/asr.php

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111

1212
ini_set('memory_limit', '-1');
1313

14-
$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
14+
//$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
15+
$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-base');
16+
//$transcriber = pipeline('automatic-speech-recognition', 'Xenova/wav2vec2-large-xlsr-53-english');
1517

1618
$audioUrl = __DIR__ . '/../sounds/kyrian-dev.wav';
1719
//$audioUrl = __DIR__ . '/../sounds/jfk.wav';
@@ -24,6 +26,6 @@
2426
//$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
2527

2628
$streamer = StdOutStreamer::make();
27-
$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 18, returnTimestamps: 'word');
29+
$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 20, returnTimestamps: 'word');
2830

2931
dd($output, timeUsage(), memoryUsage());
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
6+
namespace Codewithkyrian\Transformers\FeatureExtractors;
7+
8+
use Codewithkyrian\Transformers\Tensor\Tensor;
9+
use function Codewithkyrian\Transformers\Utils\timeUsage;
10+
11+
/**
 * Feature extractor for Wav2Vec2 checkpoints.
 *
 * Optionally applies zero-mean / unit-variance normalization to a raw waveform and
 * packages it, together with an all-ones attention mask, as model inputs.
 */
class Wav2Vec2FeatureExtractor extends FeatureExtractor
{
    /**
     * Extracts features from a given audio using the provided configuration.
     *
     * @param Tensor $waveform The audio tensor to extract features from. Samples are read
     *                         through a single index, so this assumes a 1-D tensor — TODO confirm.
     *
     * @return array{input_values: Tensor, attention_mask: Tensor} The waveform reshaped to
     *         [1, size] and an int64 mask of ones with the same shape.
     */
    public function __invoke(Tensor $waveform): array
    {
        // Read once: the loop bound and the output shape both depend on it, and
        // element-wise add/multiply are not expected to change the element count.
        $length = $waveform->size();

        // A missing `do_normalize` key is treated as "off" instead of raising an
        // undefined-key warning. An empty waveform has nothing to normalize, and
        // dividing by its size would throw DivisionByZeroError.
        $shouldNormalize = (bool)($this->config['do_normalize'] ?? false);

        if ($shouldNormalize && $length > 0) {
            $mean = $waveform->mean();

            // Population variance: mean of squared deviations from the mean.
            $sumOfSquares = 0.0;
            for ($i = 0; $i < $length; $i++) {
                $deviation = $waveform[$i] - $mean;
                $sumOfSquares += $deviation * $deviation;
            }
            $variance = $sumOfSquares / $length;

            // Small epsilon keeps the scale finite when the signal is constant (variance 0).
            $epsilon = 1e-7;
            $scale = 1.0 / sqrt($variance + $epsilon);

            $waveform = $waveform->add(-$mean)->multiply($scale);
        }

        $shape = [1, $length];

        return [
            'input_values' => $waveform->reshape($shape),
            'attention_mask' => Tensor::ones($shape, dtype: Tensor::int64),
        ];
    }
}

src/Models/Auto/AutoModel.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ class AutoModel extends PretrainedMixin
2121
"siglip" => \Codewithkyrian\Transformers\Models\Pretrained\SigLipModel::class,
2222

2323
"audio-spectrogram-transformer" => \Codewithkyrian\Transformers\Models\Pretrained\ASTModel::class,
24+
"wav2vec2" => \Codewithkyrian\Transformers\Models\Pretrained\Wav2Vec2Model::class,
2425

2526
'detr' => \Codewithkyrian\Transformers\Models\Pretrained\DETRModel::class,
2627
'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YOLOSModel::class,

src/Models/Auto/AutoModelForAudioClassification.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ class AutoModelForAudioClassification extends PretrainedMixin
99
{
1010
const MODEL_CLASS_MAPPING = [
1111
'audio-spectrogram-transformer' => \Codewithkyrian\Transformers\Models\Pretrained\ASTForAudioClassification::class,
12+
'wav2vec2' => \Codewithkyrian\Transformers\Models\Pretrained\Wav2Vec2ForSequenceClassification::class,
1213
];
1314

1415
const MODEL_CLASS_MAPPINGS = [
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
6+
namespace Codewithkyrian\Transformers\Models\Auto;
7+
8+
/**
 * Auto-model registry for CTC (connectionist temporal classification) heads.
 *
 * Maps a model-type string to the concrete pretrained class that should be
 * instantiated for it.
 */
class AutoModelForCTC extends PretrainedMixin
{
    /** Model type => CTC model class. */
    public const MODEL_CLASS_MAPPING = [
        'wav2vec2' => \Codewithkyrian\Transformers\Models\Pretrained\Wav2Vec2ForCTC::class,
    ];

    /** Every mapping table this auto-class exposes. */
    public const MODEL_CLASS_MAPPINGS = [self::MODEL_CLASS_MAPPING];
}

src/Models/Auto/PretrainedMixin.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ protected static function getModelArchitecture($modelClass): ModelArchitecture
104104
in_array($modelClass, AutoModelForImageClassification::MODEL_CLASS_MAPPING) => ModelArchitecture::EncoderOnly,
105105
in_array($modelClass, AutoModelForAudioClassification::MODEL_CLASS_MAPPING) => ModelArchitecture::EncoderOnly,
106106
in_array($modelClass, AutoModelForSpeechSeq2Seq::MODEL_CLASS_MAPPING) => ModelArchitecture::Seq2SeqLM,
107+
in_array($modelClass, AutoModelForCTC::MODEL_CLASS_MAPPING) => ModelArchitecture::EncoderOnly,
107108

108109
default => ModelArchitecture::EncoderOnly,
109110
};
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
6+
namespace Codewithkyrian\Transformers\Models\Output;
7+
8+
use Codewithkyrian\Transformers\Tensor\Tensor;
9+
10+
/**
 * Model output that carries only a logits tensor.
 *
 * NOTE(review): the class name reads "Casual"; "Causal" was presumably intended.
 * It is left unchanged here because callers reference this name.
 */
class CasualLMOutput implements ModelOutput
{
    /**
     * @param Tensor $logits The logits tensor produced by the model.
     */
    public function __construct(
        public readonly Tensor $logits,
    ) {
    }

    /**
     * Builds the output object from a raw output array.
     *
     * @param array $array Raw model outputs; the 'logits' entry is read.
     *
     * @return self
     */
    public static function fromOutput(array $array): self
    {
        $logits = $array['logits'];

        return new self(logits: $logits);
    }
}
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
6+
namespace Codewithkyrian\Transformers\Models\Pretrained;
7+
8+
use Codewithkyrian\Transformers\Models\Output\ModelOutput;
9+
use Codewithkyrian\Transformers\Models\Output\TokenClassifierOutput;
10+
11+
/**
 * Wav2Vec2 model with a frame classification head on top, for tasks such as
 * speaker diarization.
 */
class Wav2Vec2ForAudioFrameClassification extends Wav2Vec2PretrainedModel
{
    /**
     * Runs the parent forward pass and wraps its result as a token-classifier output.
     *
     * @param array $modelInputs Inputs forwarded unchanged to the parent model.
     *
     * @return array|ModelOutput A TokenClassifierOutput built from the parent's output.
     */
    public function __invoke(array $modelInputs): array|ModelOutput
    {
        $rawOutput = parent::__invoke($modelInputs);

        return TokenClassifierOutput::fromOutput($rawOutput);
    }
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
6+
namespace Codewithkyrian\Transformers\Models\Pretrained;
7+
8+
use Codewithkyrian\Transformers\Models\Output\CasualLMOutput;
9+
use Codewithkyrian\Transformers\Models\Output\ModelOutput;
10+
11+
/**
 * Wav2Vec2 model used for CTC (connectionist temporal classification).
 *
 * The class adds nothing of its own: invocation is inherited from
 * Wav2Vec2PretrainedModel, so callers receive whatever the parent returns.
 * An override that wrapped the parent's result in CasualLMOutput was previously
 * sketched here but is intentionally not active.
 */
class Wav2Vec2ForCTC extends Wav2Vec2PretrainedModel
{
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
6+
namespace Codewithkyrian\Transformers\Models\Pretrained;
7+
8+
use Codewithkyrian\Transformers\Models\Output\ModelOutput;
9+
use Codewithkyrian\Transformers\Models\Output\SequenceClassifierOutput;
10+
11+
/**
 * Wav2Vec2 model whose output is exposed as a sequence-classifier result.
 */
class Wav2Vec2ForSequenceClassification extends Wav2Vec2PretrainedModel
{
    /**
     * Runs the parent forward pass and wraps its result as a sequence-classifier output.
     *
     * @param array $modelInputs Inputs forwarded unchanged to the parent model.
     *
     * @return array|ModelOutput A SequenceClassifierOutput built from the parent's output.
     */
    public function __invoke(array $modelInputs): array|ModelOutput
    {
        $rawOutput = parent::__invoke($modelInputs);

        return SequenceClassifierOutput::fromOutput($rawOutput);
    }
}

0 commit comments

Comments
 (0)