Skip to content

Commit cfee9eb

Browse files
Fix audio classification window data type
1 parent 0e4b0e2 commit cfee9eb

5 files changed

Lines changed: 40 additions & 35 deletions

File tree

examples/pipelines/asr.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@
1616
$audioUrl = __DIR__ . '/../sounds/kyrian-dev.wav';
1717
$audioUrl = __DIR__ . '/../sounds/jfk.wav';
1818
$audioUrl = __DIR__ . '/../sounds/preamble.wav';
19-
$audioUrl = __DIR__ . '/../sounds/taunt.wav';
20-
$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
21-
$audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
19+
//$audioUrl = __DIR__ . '/../sounds/taunt.wav';
20+
//$audioUrl = __DIR__ . '/../sounds/gettysburg.wav';
21+
//$audioUrl = __DIR__ . '/../sounds/kyrian-speaking-30.wav';
2222
$audioUrl = __DIR__ . '/../sounds/kyrian-speaking.wav';
2323
//$audioUrl = __DIR__ . '/../sounds/dataset1.wav';
2424

2525
$streamer = StdOutStreamer::make();
26-
$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30, returnTimestamps: true);
26+
$output = $transcriber($audioUrl, maxNewTokens: 256, chunkLengthSecs: 30, returnTimestamps: 'word');
2727

2828
dd($output, timeUsage(), memoryUsage());

src/FeatureExtractors/ASTFeatureExtractor.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
class ASTFeatureExtractor extends FeatureExtractor
1313
{
1414
protected array $melFilters;
15-
protected \SplFixedArray $window;
15+
protected Tensor $window;
1616
protected mixed $mean;
1717
protected mixed $std;
1818

src/Pipelines/AutomaticSpeechRecognitionPipeline.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,6 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
228228
returnTimestamps: $returnTimestamps,
229229
forceFullSequences: $forceFullSequences
230230
);
231-
232231
$toReturn[] = ['text' => $fullText, ...$optional];
233232
}
234233

src/PretrainedTokenizers/WhisperTokenizer.php

Lines changed: 30 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,7 @@ public function decodeASR(
177177
// - Lots of complexity comes from stride and timestamps
178178

179179
$lastLanguage = null;
180+
180181
$returnWordTimestamps = $returnTimestamps === "word";
181182

182183
$newChunk = fn() => ["language" => null, "timestamp" => [null, null], "text" => ""];
@@ -275,11 +276,6 @@ public function decodeASR(
275276
} else {
276277
// This is the end of the timestamp chunk
277278
if ($roundedTime !== $chunk['timestamp'][0]) {
278-
// This is a bug in timestamp token output
279-
// where we're taking the duplicate token
280-
// as a stop where it should be a start.
281-
// This is an issue in the underlying model output
282-
// Let's just skip it so it becomes de facto a start again
283279
$chunk['timestamp'][1] = $roundedTime;
284280
$previousTokens[] = $currentTokens;
285281

@@ -308,6 +304,13 @@ public function decodeASR(
308304
$currentTokenTimestamps = [];
309305
$chunk = $newChunk();
310306
}
307+
else {
308+
// This is a bug in timestamp token output
309+
// where we're taking the duplicate token
310+
// as a stop where it should be a start.
311+
// This is an issue in the underlying model output
312+
// Let's just skip it so it becomes de facto a start again
313+
}
311314
}
312315
} else {
313316
// 4/ Regular token
@@ -419,7 +422,6 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
419422
$rightSequence = $sequences[$i];
420423
$max = 0.0;
421424
$maxIndices = [$leftLength, $leftLength, 0, 0];
422-
// dd($this->decode($leftSequence), $this->decode($rightSequence));
423425

424426
$rightLength = count($rightSequence);
425427
for ($j = 1; $j < $leftLength + $rightLength; ++$j) {
@@ -436,9 +438,8 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
436438
throw new Exception("There is a bug within whisper `decodeASR` function, please report it. Dropping to prevent bad inference.");
437439
}
438440

439-
$matches = count(array_filter(
440-
array_map(fn($elem, $idx) => $elem === $right[$idx], $left, array_keys($left))
441-
)
441+
$matches = count(
442+
array_filter($left, fn($elem, $idx) => $elem === $right[$idx], ARRAY_FILTER_USE_BOTH)
442443
);
443444

444445
$matching = $matches / $j + $eps;
@@ -494,8 +495,6 @@ public function collateWordTimestamps($tokens, $token_timestamps, $language): ar
494495
* and a list of `token_id` sequences with the tokens making up each word.
495496
* @param array $tokens
496497
* @param string|null $language
497-
* @param string $prependPunctuations
498-
* @param string $appendPunctuations
499498
* @return array
500499
* @private
501500
*/
@@ -573,34 +572,34 @@ private function decodeWithTimestamps(
573572
*/
574573
private function splitTokensOnUnicode(array $tokens): array
575574
{
576-
$decoded_full = $this->decode($tokens, decodeWithTimestamps: true);
575+
$decodedFull = $this->decode($tokens, decodeWithTimestamps: true);
577576

578-
$replacement_char = "\u{FFFD}";
577+
$replacementChar = "\u{FFFD}";
579578

580579
$words = [];
581-
$word_tokens = [];
582-
$token_indices = [];
583-
$current_tokens = [];
584-
$current_indices = [];
585-
$unicode_offset = 0;
580+
$wordTokens = [];
581+
$tokenIndices = [];
582+
$currentTokens = [];
583+
$currentIndices = [];
584+
$unicodeOffset = 0;
586585

587586
foreach ($tokens as $token_idx => $token) {
588-
$current_tokens[] = $token;
589-
$current_indices[] = $token_idx;
587+
$currentTokens[] = $token;
588+
$currentIndices[] = $token_idx;
590589

591-
$decoded = $this->decode($tokens, decodeWithTimestamps: true);
590+
$decoded = $this->decode($currentTokens, decodeWithTimestamps: true);
592591

593-
if (!str_contains($decoded, $replacement_char) || $decoded_full[$unicode_offset + strpos($decoded, $replacement_char)] === $replacement_char) {
592+
if (!str_contains($decoded, $replacementChar) || $decodedFull[$unicodeOffset + strpos($decoded, $replacementChar)] === $replacementChar) {
594593
$words[] = $decoded;
595-
$word_tokens[] = $current_tokens;
596-
$token_indices[] = $current_indices;
597-
$current_tokens = [];
598-
$current_indices = [];
599-
$unicode_offset += strlen($decoded);
594+
$wordTokens[] = $currentTokens;
595+
$tokenIndices[] = $currentIndices;
596+
$currentTokens = [];
597+
$currentIndices = [];
598+
$unicodeOffset += strlen($decoded);
600599
}
601600
}
602601

603-
return [$words, $word_tokens, $token_indices];
602+
return [$words, $wordTokens, $tokenIndices];
604603
}
605604

606605
/**
@@ -617,7 +616,9 @@ private function splitTokensOnSpaces(array $tokens): array
617616
$word_tokens = [];
618617
$token_indices = [];
619618

620-
$punctuationRegex = '/^\p{P}+$/u';
619+
// $punctuationRegex = '/^\p{P}+$/u';
620+
$punctuationRegex = '\p{P}\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E';
621+
$punctuationRegex = "/\s+|([$punctuationRegex])+/u";
621622

622623
foreach ($subwords as $i => $subword) {
623624
$subwordTokens = $subwordTokensList[$i];

src/Tensor/TensorBuffer.php

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,4 +210,9 @@ public function load(string $string): void
210210

211211
FFI::memcpy($this->data, $string, $strlen);
212212
}
213+
214+
public function __clone()
215+
{
216+
$this->data = clone $this->data;
217+
}
213218
}

0 commit comments

Comments
 (0)