@@ -177,6 +177,7 @@ public function decodeASR(
177177 // - Lots of complexity comes from stride and timestamps
178178
179179 $ lastLanguage = null ;
180+
180181 $ returnWordTimestamps = $ returnTimestamps === "word " ;
181182
182183 $ newChunk = fn () => ["language " => null , "timestamp " => [null , null ], "text " => "" ];
@@ -275,11 +276,6 @@ public function decodeASR(
275276 } else {
276277 // This is the end of the timestamp chunk
277278 if ($ roundedTime !== $ chunk ['timestamp ' ][0 ]) {
278- // This is a bug in timestamp token output
279- // where we're taking the duplicate token
280- // as a stop where it should be a start.
281- // This is an issue in the underlying model output
282- // Let's just skip it so it becomes de-factor a start agin
283279 $ chunk ['timestamp ' ][1 ] = $ roundedTime ;
284280 $ previousTokens [] = $ currentTokens ;
285281
@@ -308,6 +304,13 @@ public function decodeASR(
308304 $ currentTokenTimestamps = [];
309305 $ chunk = $ newChunk ();
310306 }
307+ else {
308+ // This is a bug in timestamp token output
309+ // where we're taking the duplicate token
310+ // as a stop where it should be a start.
311+ // This is an issue in the underlying model output
313 312+ // Let's just skip it so it becomes de facto a start again
313+ }
311314 }
312315 } else {
313316 // 4/ Regular token
@@ -419,7 +422,6 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
419422 $ rightSequence = $ sequences [$ i ];
420423 $ max = 0.0 ;
421424 $ maxIndices = [$ leftLength , $ leftLength , 0 , 0 ];
422- // dd($this->decode($leftSequence), $this->decode($rightSequence));
423425
424426 $ rightLength = count ($ rightSequence );
425427 for ($ j = 1 ; $ j < $ leftLength + $ rightLength ; ++$ j ) {
@@ -436,9 +438,8 @@ private function findLongestCommonSequence(array $sequences, array $tokenTimesta
436438 throw new Exception ("There is a bug within whisper `decodeASR` function, please report it. Dropping to prevent bad inference. " );
437439 }
438440
439- $ matches = count (array_filter (
440- array_map (fn ($ elem , $ idx ) => $ elem === $ right [$ idx ], $ left , array_keys ($ left ))
441- )
441+ $ matches = count (
442+ array_filter ($ left , fn ($ elem , $ idx ) => $ elem === $ right [$ idx ], ARRAY_FILTER_USE_BOTH )
442443 );
443444
444445 $ matching = $ matches / $ j + $ eps ;
@@ -494,8 +495,6 @@ public function collateWordTimestamps($tokens, $token_timestamps, $language): ar
494495 * and a list of `token_id` sequences with the tokens making up each word.
495496 * @param array $tokens
496497 * @param string|null $language
497- * @param string $prependPunctuations
498- * @param string $appendPunctuations
499498 * @return array
500499 * @private
501500 */
@@ -573,34 +572,34 @@ private function decodeWithTimestamps(
573572 */
574573 private function splitTokensOnUnicode (array $ tokens ): array
575574 {
576- $ decoded_full = $ this ->decode ($ tokens , decodeWithTimestamps: true );
575+ $ decodedFull = $ this ->decode ($ tokens , decodeWithTimestamps: true );
577576
578- $ replacement_char = "\u{FFFD}" ;
577+ $ replacementChar = "\u{FFFD}" ;
579578
580579 $ words = [];
581- $ word_tokens = [];
582- $ token_indices = [];
583- $ current_tokens = [];
584- $ current_indices = [];
585- $ unicode_offset = 0 ;
580+ $ wordTokens = [];
581+ $ tokenIndices = [];
582+ $ currentTokens = [];
583+ $ currentIndices = [];
584+ $ unicodeOffset = 0 ;
586585
587586 foreach ($ tokens as $ token_idx => $ token ) {
588- $ current_tokens [] = $ token ;
589- $ current_indices [] = $ token_idx ;
587+ $ currentTokens [] = $ token ;
588+ $ currentIndices [] = $ token_idx ;
590589
591- $ decoded = $ this ->decode ($ tokens , decodeWithTimestamps: true );
590+ $ decoded = $ this ->decode ($ currentTokens , decodeWithTimestamps: true );
592591
593- if (!str_contains ($ decoded , $ replacement_char ) || $ decoded_full [ $ unicode_offset + strpos ($ decoded , $ replacement_char )] === $ replacement_char ) {
592+ if (!str_contains ($ decoded , $ replacementChar ) || $ decodedFull [ $ unicodeOffset + strpos ($ decoded , $ replacementChar )] === $ replacementChar ) {
594593 $ words [] = $ decoded ;
595- $ word_tokens [] = $ current_tokens ;
596- $ token_indices [] = $ current_indices ;
597- $ current_tokens = [];
598- $ current_indices = [];
599- $ unicode_offset += strlen ($ decoded );
594+ $ wordTokens [] = $ currentTokens ;
595+ $ tokenIndices [] = $ currentIndices ;
596+ $ currentTokens = [];
597+ $ currentIndices = [];
598+ $ unicodeOffset += strlen ($ decoded );
600599 }
601600 }
602601
603- return [$ words , $ word_tokens , $ token_indices ];
602+ return [$ words , $ wordTokens , $ tokenIndices ];
604603 }
605604
606605 /**
@@ -617,7 +616,9 @@ private function splitTokensOnSpaces(array $tokens): array
617616 $ word_tokens = [];
618617 $ token_indices = [];
619618
620- $ punctuationRegex = '/^\p{P}+$/u ' ;
619+ // $punctuationRegex = '/^\p{P}+$/u';
620+ $ punctuationRegex = '\p{P}\x21-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E ' ;
621+ $ punctuationRegex = "/\s+|([ $ punctuationRegex])+/u " ;
621622
622623 foreach ($ subwords as $ i => $ subword ) {
623624 $ subwordTokens = $ subwordTokensList [$ i ];
0 commit comments