Add Audio Classification Pipeline and support for processing stereo audio

CodeWithKyrian · CodeWithKyrian · commit 52d896c2b3b5 · 2024-08-01T22:59:16.000+01:00
diff --git a/examples/pipelines/audio-classification.php b/examples/pipelines/audio-classification.php
@@ -0,0 +1,18 @@
+<?php
+
+declare(strict_types=1);
+
+use function Codewithkyrian\Transformers\Pipelines\pipeline;
+use function Codewithkyrian\Transformers\Utils\memoryUsage;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
+
+require_once './bootstrap.php';
+
+// Point this at '/../sounds/dog_barking.wav' instead to classify the alternative clip.
+$audioPath = __DIR__ . '/../sounds/cat_meow.wav';
+$classify = pipeline('audio-classification', 'Xenova/ast-finetuned-audioset-10-10-0.4593');
+
+// Request four labels, then dump them together with the time and memory helpers' output.
+$predictions = $classify($audioPath, topK: 4);
+
+dd($predictions, timeUsage(), memoryUsage());
diff --git a/src/Models/Auto/AutoModel.php b/src/Models/Auto/AutoModel.php
@@ -20,6 +20,8 @@ class AutoModel extends PretrainedMixin
         "deit" => \Codewithkyrian\Transformers\Models\Pretrained\DeiTModel::class,
         "siglip" => \Codewithkyrian\Transformers\Models\Pretrained\SigLipModel::class,
 
+        "audio-spectrogram-transformer" => \Codewithkyrian\Transformers\Models\Pretrained\ASTModel::class,
+
         'detr' => \Codewithkyrian\Transformers\Models\Pretrained\DETRModel::class,
         'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YOLOSModel::class,
         'owlvit' => \Codewithkyrian\Transformers\Models\Pretrained\OwlVitModel::class,
diff --git a/src/Models/Auto/AutoModelForAudioClassification.php b/src/Models/Auto/AutoModelForAudioClassification.php
@@ -0,0 +1,17 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Auto;
+
+class AutoModelForAudioClassification extends PretrainedMixin
+{
+    // Maps an architecture identifier to the model class registered for it here.
+    public const MODEL_CLASS_MAPPING = [
+        'audio-spectrogram-transformer' => \Codewithkyrian\Transformers\Models\Pretrained\ASTForAudioClassification::class,
+    ];
+
+    // List of the mapping tables declared by this class (currently just the one above).
+    public const MODEL_CLASS_MAPPINGS = [self::MODEL_CLASS_MAPPING];
+}
diff --git a/src/Models/Pretrained/ASTForAudioClassification.php b/src/Models/Pretrained/ASTForAudioClassification.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+class ASTForAudioClassification extends ASTPretrainedModel
+{
+    // Declares no members of its own; everything is inherited from ASTPretrainedModel.
+}
diff --git a/src/Models/Pretrained/ASTModel.php b/src/Models/Pretrained/ASTModel.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+class ASTModel extends ASTPretrainedModel
+{
+    // Declares no members of its own; everything is inherited from ASTPretrainedModel.
+}
diff --git a/src/Models/Pretrained/ASTPretrainedModel.php b/src/Models/Pretrained/ASTPretrainedModel.php
@@ -0,0 +1,14 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+/**
+ * Common parent class of the Audio Spectrogram Transformer (AST) models.
+ */
+class ASTPretrainedModel extends PretrainedModel
+{
+    // Declares no members of its own; everything is inherited from PretrainedModel.
+}
diff --git a/src/Pipelines/AudioClassificationPipeline.php b/src/Pipelines/AudioClassificationPipeline.php
@@ -0,0 +1,80 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Pipelines;
+
+use Codewithkyrian\Transformers\Utils\Audio;
+
+/**
+ * Audio classification pipeline using any `AutoModelForAudioClassification`.
+ * This pipeline predicts the class of a raw waveform or an audio file.
+ *
+ * **Example:** Perform audio classification with `Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech`.
+ *  ```php
+ *  $classifier = pipeline('audio-classification', 'Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech');
+ *  $url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jfk.wav';
+ *  $output = $classifier($url, topK: 2);
+ *  // [
+ *  //   ['label' => 'male', 'score' => 0.9981542229652405],
+ *  //   ['label' => 'female', 'score' => 0.001845747814513743],
+ *  // ]
+ *  ```
+ *
+ * **Example:** Perform audio classification with `Xenova/ast-finetuned-audioset-10-10-0.4593` and return the top 4 results.
+ *  ```php
+ *  $classifier = pipeline('audio-classification', 'Xenova/ast-finetuned-audioset-10-10-0.4593');
+ *  $url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/cat_meow.wav';
+ *  $output = $classifier($url, topK: 4);
+ *  // [
+ *  //   ['label' => 'Meow', 'score' => 0.5617874264717102],
+ *  //   ['label' => 'Cat', 'score' => 0.22365376353263855],
+ *  //   ['label' => 'Domestic animals, pets', 'score' => 0.1141069084405899],
+ *  //   ['label' => 'Animal', 'score' => 0.08985692262649536],
+ *  // ]
+ *  ```
+ */
+class AudioClassificationPipeline extends Pipeline
+{
+    /**
+     * Classify one audio source (string) or a batch of sources (array) into
+     * `['label' => ..., 'score' => ...]` entries. Only `topK` is read from $args (default 1).
+     */
+    public function __invoke(array|string $inputs, ...$args): array
+    {
+        $topK = $args["topK"] ?? 1;
+        $isBatched = is_array($inputs);
+        $sources = $isBatched ? $inputs : [$inputs];
+        // Loop-invariant configuration, looked up once before any audio is read.
+        $sampleRate = $this->processor->featureExtractor->config['sampling_rate'];
+        $id2label = $this->model->config['id2label'];
+        $results = [];
+
+        foreach ($sources as $source) {
+            // Read the audio and hand toTensor() the feature extractor's sampling rate.
+            $waveform = Audio::read($source)->toTensor(samplerate: $sampleRate);
+            $features = ($this->processor)($waveform);
+            $outputs = ($this->model)($features);
+            // Softmax entry 0 of the logits; topk() yields the scores and indices paired up below.
+            [$scores, $indices] = $outputs['logits'][0]->softmax()->topk($topK, true);
+
+            $ranked = [];
+            foreach ($indices as $position => $classId) {
+                $ranked[] = ['label' => $id2label[$classId], 'score' => $scores[$position]];
+            }
+            // topK of 1 appends entries flat; any other topK nests one list per input.
+            if ($topK !== 1) {
+                $results[] = $ranked;
+            } else {
+                $results = array_merge($results, $ranked);
+            }
+        }
+
+        // A lone (non-batched) input with topK other than 1 is unwrapped from its batch of one.
+        if ($isBatched || $topK === 1) {
+            return $results;
+        }
+        return $results[0];
+    }
+}
diff --git a/src/Pipelines/Task.php b/src/Pipelines/Task.php
@@ -5,6 +5,7 @@
 namespace Codewithkyrian\Transformers\Pipelines;
 
 use Codewithkyrian\Transformers\Models\Auto\AutoModel;
+use Codewithkyrian\Transformers\Models\Auto\AutoModelForAudioClassification;
 use Codewithkyrian\Transformers\Models\Auto\AutoModelForCausalLM;
 use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageClassification;
 use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageFeatureExtraction;
@@ -49,6 +50,8 @@ enum Task: string
     case ObjectDetection = 'object-detection';
     case ZeroShotObjectDetection = 'zero-shot-object-detection';
 
+    case AudioClassification = 'audio-classification';
+
 
     public function pipeline(PretrainedModel $model, ?PretrainedTokenizer $tokenizer, ?Processor $processor): Pipeline
     {
@@ -89,6 +92,8 @@ public function pipeline(PretrainedModel $model, ?PretrainedTokenizer $tokenizer
             self::ObjectDetection => new ObjectDetectionPipeline($this, $model, $tokenizer, $processor),
 
             self::ZeroShotObjectDetection => new ZeroShotObjectDetectionPipeline($this, $model, $tokenizer, $processor),
+
+            self::AudioClassification => new AudioClassificationPipeline($this, $model, processor: $processor),
         };
     }
 
@@ -129,16 +134,18 @@ public function defaultModelName(): string
             self::ObjectDetection => 'Xenova/detr-resnet-50', // Original: 'facebook/detr-resnet-50',
 
             self::ZeroShotObjectDetection => 'Xenova/owlvit-base-patch32', // Original: 'google/owlvit-base-patch32',
+
+            self::AudioClassification => 'Xenova/wav2vec2-base-superb-ks', // Original: 'superb/wav2vec2-base-superb-ks',
         };
     }
 
     public function autoModel(
-        string           $modelNameOrPath,
-        bool             $quantized = true,
-        ?array           $config = null,
-        ?string          $cacheDir = null,
-        string           $revision = 'main',
-        ?string          $modelFilename = null,
+        string    $modelNameOrPath,
+        bool      $quantized = true,
+        ?array    $config = null,
+        ?string   $cacheDir = null,
+        string    $revision = 'main',
+        ?string   $modelFilename = null,
         ?callable $onProgress = null
     ): PretrainedModel
     {
@@ -176,13 +183,15 @@ public function autoModel(
             self::ObjectDetection => AutoModelForObjectDetection::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $onProgress),
 
             self::ZeroShotObjectDetection => AutoModelForZeroShotObjectDetection::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $onProgress),
+
+            self::AudioClassification => AutoModelForAudioClassification::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $onProgress),
         };
     }
 
     public function autoTokenizer(
-        string           $modelNameOrPath,
-        ?string          $cacheDir = null,
-        string           $revision = 'main',
+        string    $modelNameOrPath,
+        ?string   $cacheDir = null,
+        string    $revision = 'main',
         ?callable $onProgress = null
     ): ?PretrainedTokenizer
     {
@@ -191,7 +200,8 @@ public function autoTokenizer(
             self::ImageClassification,
             self::ImageToImage,
             self::ImageFeatureExtraction,
-            self::ObjectDetection => null,
+            self::ObjectDetection,
+            self::AudioClassification => null,
 
 
             self::SentimentAnalysis,
@@ -214,10 +224,10 @@ public function autoTokenizer(
     }
 
     public function autoProcessor(
-        string           $modelNameOrPath,
-        ?array           $config = null,
-        ?string          $cacheDir = null,
-        string           $revision = 'main',
+        string    $modelNameOrPath,
+        ?array    $config = null,
+        ?string   $cacheDir = null,
+        string    $revision = 'main',
         ?callable $onProgress = null
     ): ?Processor
     {
@@ -229,7 +239,8 @@ public function autoProcessor(
             self::ZeroShotImageClassification,
             self::ImageToImage,
             self::ObjectDetection,
-            self::ZeroShotObjectDetection => AutoProcessor::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $onProgress),
+            self::ZeroShotObjectDetection,
+            self::AudioClassification => AutoProcessor::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $onProgress),
 
 
             self::SentimentAnalysis,
diff --git a/src/Tensor/Tensor.php b/src/Tensor/Tensor.php
@@ -44,15 +44,15 @@ class Tensor implements NDArray, Countable, Serializable, IteratorAggregate
     protected Buffer $buffer;
 
     protected static $pack = [
-        NDArray::bool    => 'C',
-        NDArray::int8    => 'c',
-        NDArray::int16   => 's',
-        NDArray::int32   => 'l',
-        NDArray::int64   => 'q',
-        NDArray::uint8   => 'C',
-        NDArray::uint16  => 'S',
-        NDArray::uint32  => 'L',
-        NDArray::uint64  => 'Q',
+        NDArray::bool => 'C',
+        NDArray::int8 => 'c',
+        NDArray::int16 => 's',
+        NDArray::int32 => 'l',
+        NDArray::int64 => 'q',
+        NDArray::uint8 => 'C',
+        NDArray::uint16 => 'S',
+        NDArray::uint32 => 'L',
+        NDArray::uint64 => 'Q',
         //NDArray::float8  => 'N/A',
         //NDArray::float16 => 'N/A',
         NDArray::float32 => 'g',
@@ -414,7 +414,7 @@ public function toString(): string
      */
     public function toBufferArray(): array
     {
-        $fmt = self::$pack[$this->dtype].'*';
+        $fmt = self::$pack[$this->dtype] . '*';
 
         return array_values(unpack($fmt, $this->buffer->dump()));
     }
@@ -848,10 +848,16 @@ public function mean(?int $axis = null, bool $keepShape = false): static|float|i
     {
         $mo = self::mo();
 
+        if ($axis !== null) {
+            $axis = $this->safeIndex($axis, $this->ndim());
+        }
+
         $mean = $mo->mean($this, $axis);
 
         if ($mean instanceof NDArray) {
-            $shape = $mean->shape();
+            $shape = $this->shape();
+
+            $shape[$axis] = 1;
 
             if (!$keepShape) {
                 array_splice($shape, $axis, 1);
diff --git a/src/Utils/Audio.php b/src/Utils/Audio.php