Skip to content

Commit 124c7f0

Browse files
Fix some errors for Whisper and add a new copyTo method for Tensor
1 parent cfee9eb commit 124c7f0

4 files changed

Lines changed: 29 additions & 16 deletions

File tree

examples/pipelines/asr.php

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,16 @@
1414
$transcriber = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en');
1515

1616
$audioUrl = __DIR__ . '/../sounds/kyrian-dev.wav';
17-
$audioUrl = __DIR__ . '/../sounds/jfk.wav';
18-
$audioUrl = __DIR__ . '/../sounds/preamble.wav';
17+
//$audioUrl = __DIR__ . '/../sounds/jfk.wav';
18+
//$audioUrl = __DIR__ . '/../sounds/preamble.wav';
1919
//$audioUrl = __DIR__ . '/../sounds/taunt.wav';
2020
//$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
2121
//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
22-
$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
22+
//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
23+
//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking2.wav';
2324
//$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
2425

2526
$streamer = StdOutStreamer::make();
26-
$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30, returnTimestamps: 'word');
27+
$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 18, returnTimestamps: 'word');
2728

2829
dd($output, timeUsage(), memoryUsage());

src/Models/Pretrained/WhisperForConditionalGeneration.php

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,13 @@ public function extractTokenTimestamps(
141141
$weights = Tensor::stack(array_map(function ($alignmentHead) use ($crossAttentions, $numFrames) {
142142
[$l, $h] = $alignmentHead;
143143
return $numFrames
144-
? $crossAttentions[$l]->slice(null, $h, null, [0, $numFrames])->squeeze(1)
145-
: $crossAttentions[$l]->slice(null, $h)->squeeze(1); // experimental
144+
? $crossAttentions[$l]->slice(null, $h, null, [0, $numFrames])
145+
: $crossAttentions[$l]->slice(null, $h); // experimental
146146
}, $alignmentHeads));
147147

148-
$weights = $weights->permute(1, 0, 2, 3);
148+
$weights = $weights
149+
->squeeze(1)
150+
->permute(1, 0, 2, 3);
149151

150152
[$std, $calculatedMean] = $weights->stdMean(-2, 0, true);
151153

@@ -155,22 +157,25 @@ public function extractTokenTimestamps(
155157
for ($a = 0; $a < $smoothedWeights->shape()[0]; ++$a) {
156158
$aTensor = $smoothedWeights[$a]; // [8, seqLength, 1500]
157159

160+
158161
for ($b = 0; $b < $aTensor->shape()[0]; ++$b) {
159162
$bTensor = $aTensor[$b]; // [seqLength, 1500]
160163

164+
161165
$stdTensor = $std[$a][$b][0]; // [1500]
162166
$meanTensor = $calculatedMean[$a][$b][0]; // [1500]
163167

164168
for ($c = 0; $c < $bTensor->shape()[0]; ++$c) {
165169
/** @var Tensor $cTensor */
166170
$cTensor = $bTensor[$c]; // [1500]
167-
// for ($d = 0; $d < count($cTensor->buffer()); ++$d) {
168-
// $cTensor->buffer()[$d] = ($cTensor->buffer()[$d] - $meanTensor->buffer()[$d]) / $stdTensor->buffer()[$d];
169-
// }
170-
$cTensor = $cTensor->add($meanTensor->multiply(-1))->multiply($stdTensor->reciprocal());
171+
172+
$cTensor
173+
->add($meanTensor->multiply(-1))
174+
->multiply($stdTensor->reciprocal())
175+
->copyTo($cTensor);
171176

172177
// Apply median filter.
173-
$cTensor = $this->medianFilter($cTensor, $medianFilterWidth);
178+
$this->medianFilter($cTensor, $medianFilterWidth)->copyTo($cTensor);
174179
}
175180
}
176181
}
@@ -189,7 +194,7 @@ public function extractTokenTimestamps(
189194
// NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions
190195
// as the python implementation
191196
$matrix = $batchedMatrices[$batchIdx]->multiply(-1)->squeeze(0);
192-
list($textIndices, $timeIndices) = $this->dynamicTimeWarping($matrix);
197+
[$textIndices, $timeIndices] = $this->dynamicTimeWarping($matrix);
193198

194199
$diffs = array_map(fn($i) => $textIndices[$i + 1] - $textIndices[$i], range(0, count($textIndices) - 2));
195200
$jumps = array_map(fn($x) => (bool)$x, array_merge([1], $diffs));
@@ -198,10 +203,12 @@ public function extractTokenTimestamps(
198203
for ($i = 0; $i < count($jumps); ++$i) {
199204
if ($jumps[$i]) {
200205
$jumpTimes[] = $timeIndices[$i] * $timePrecision;
201-
// NOTE: No point in rounding here, since we set to Float32Array later
202206
}
203207
}
204-
$timestamps->buffer()[$batchIdx] = array_merge([0], $jumpTimes);
208+
// $timestamps->buffer()[$batchIdx] = array_merge([0.0], $jumpTimes);
209+
for ($i = 0; $i < count($jumpTimes); ++$i) {
210+
$timestamps[$batchIdx][$i] = $jumpTimes[$i];
211+
}
205212
}
206213

207214
return $timestamps;

src/Pipelines/AutomaticSpeechRecognitionPipeline.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
160160

161161
$window = $chunkLengthSecs * $samplingRate;
162162
$stride = $strideLengthSecs * $samplingRate;
163-
$jump = $window - 2 * $stride;
163+
$jump = (int)floor($window - 2 * $stride);
164164
$offset = 0;
165165

166166

src/Tensor/Tensor.php

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,11 @@ public static function zerosLike(Tensor $other): static
500500
return new static($ndArray->buffer(), $ndArray->dtype(), $ndArray->shape(), $ndArray->offset());
501501
}
502502

503+
public function copyTo(Tensor $other): void
504+
{
505+
self::mo()->la()->copy($this, $other);
506+
}
507+
503508

504509
/**
505510
* Stack an array of tensors along a specified axis.

0 commit comments

Comments
 (0)