1111use Codewithkyrian \Transformers \Utils \GenerationConfig ;
1212use Codewithkyrian \Transformers \Utils \Image ;
1313use function Codewithkyrian \Transformers \Utils \array_pop_key ;
14- use function Codewithkyrian \Transformers \Utils \array_to_snake_case ;
14+ use function Codewithkyrian \Transformers \Utils \array_keys_to_snake_case ;
1515
1616/**
1717 * Pipeline that aims at extracting spoken text contained within some audio.
@@ -111,11 +111,13 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
111111 $ task = array_pop_key ($ args , 'task ' );
112112 $ streamer = array_pop_key ($ args , 'streamer ' );
113113
114- if (!is_null ($ streamer ) && !is_a ($ streamer , WhisperTextStreamer::class)) {
115- throw new \InvalidArgumentException ('`streamer` must be an instance of `WhisperTextStreamer` ' );
116- }
114+ // if (!is_null($streamer) && !is_a($streamer, WhisperTextStreamer::class)) {
115+ // throw new \InvalidArgumentException('`streamer` must be an instance of `WhisperTextStreamer`');
116+ // }
117+
118+ if (!is_null ($ streamer )) trigger_error ('`streamer` is not supported yet for Whisper ' , E_USER_WARNING );
117119
118- $ kwargs = array_to_snake_case ($ args );
120+ $ kwargs = array_keys_to_snake_case ($ args );
119121
120122 $ generationConfig = new GenerationConfig ($ kwargs );
121123
@@ -139,14 +141,12 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
139141 $ timePrecision = $ this ->processor ->featureExtractor ->config ['chunk_length ' ] / $ this ->model ->config ['max_source_positions ' ];
140142 $ hopLength = $ this ->processor ->featureExtractor ->config ['hop_length ' ];
141143 $ samplingRate = $ this ->processor ->featureExtractor ->config ['sampling_rate ' ];
142- $ timestampBegin = $ this ->tokenizer ->tokenizer ->convertTokensToIds (["<|notimestamps|> " ])[0 ] + 1 ;
143144
144145 $ toReturn = [];
145146
146- $ streamer ?->setTokenizer($ this ->tokenizer )
147- ?->shouldSkipPrompt(false )
148- ?->setTimePrecision($ timePrecision )
149- ?->setTimestampBegin($ timestampBegin );
147+ // $streamer?->setTokenizer($this->tokenizer)
148+ // ?->setTimePrecision($timePrecision)
149+ // ?->setTimestampBegin($timestampBegin);
150150
151151 foreach ($ inputs as $ input ) {
152152 $ audio = Audio::read ($ input );
@@ -168,9 +168,9 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
168168 $ offset = 0 ;
169169
170170 while ($ offset < $ audioTensor ->size ()) {
171-
172171 if ($ offset + $ window > $ audioTensor ->size ()) {
173172 $ window = $ audioTensor ->size () - $ offset ;
173+ $ jump = $ window ;
174174 }
175175
176176 $ subAudio = $ audioTensor ->sliceWithBounds ([$ offset ], [$ window ]);
@@ -206,7 +206,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
206206 foreach ($ chunks as &$ chunk ) {
207207 $ generationConfig ['num_frames ' ] = (int )floor ($ chunk ['stride ' ][0 ] / $ hopLength );
208208
209- $ data = $ this ->model ->generate ($ chunk ['input_features ' ], generationConfig: $ generationConfig, streamer: $ streamer );
209+ $ data = $ this ->model ->generate ($ chunk ['input_features ' ], generationConfig: $ generationConfig );
210210
211211 // TODO: Right now we only get top beam
212212 if ($ returnTimestamps === 'word ' ) {
@@ -219,7 +219,7 @@ private function __invokeWhisper(array|string $inputs, ...$args): array|Tensor|I
219219 // convert stride to seconds
220220 $ chunk ['stride ' ] = array_map (fn ($ x ) => $ x / $ samplingRate , $ chunk ['stride ' ]);
221221
222- $ streamer ?->notifyChunkEnd($ chunk ['stride ' ][0 ]);
222+ // $streamer?->notifyChunkEnd($chunk['stride'][0]);
223223 }
224224
225225 // Merge text chunks
0 commit comments