@@ -141,11 +141,13 @@ public function extractTokenTimestamps(
141141 $ weights = Tensor::stack (array_map (function ($ alignmentHead ) use ($ crossAttentions , $ numFrames ) {
142142 [$ l , $ h ] = $ alignmentHead ;
143143 return $ numFrames
144- ? $ crossAttentions [$ l ]->slice (null , $ h , null , [0 , $ numFrames ])-> squeeze ( 1 )
145- : $ crossAttentions [$ l ]->slice (null , $ h )-> squeeze ( 1 ) ; // experimental
144+ ? $ crossAttentions [$ l ]->slice (null , $ h , null , [0 , $ numFrames ])
145+ : $ crossAttentions [$ l ]->slice (null , $ h ); // experimental
146146 }, $ alignmentHeads ));
147147
148- $ weights = $ weights ->permute (1 , 0 , 2 , 3 );
148+ $ weights = $ weights
149+ ->squeeze (1 )
150+ ->permute (1 , 0 , 2 , 3 );
149151
150152 [$ std , $ calculatedMean ] = $ weights ->stdMean (-2 , 0 , true );
151153
@@ -155,22 +157,25 @@ public function extractTokenTimestamps(
155157 for ($ a = 0 ; $ a < $ smoothedWeights ->shape ()[0 ]; ++$ a ) {
156158 $ aTensor = $ smoothedWeights [$ a ]; // [8, seqLength, 1500]
157159
160+
158161 for ($ b = 0 ; $ b < $ aTensor ->shape ()[0 ]; ++$ b ) {
159162 $ bTensor = $ aTensor [$ b ]; // [seqLength, 1500]
160163
164+
161165 $ stdTensor = $ std [$ a ][$ b ][0 ]; // [1500]
162166 $ meanTensor = $ calculatedMean [$ a ][$ b ][0 ]; // [1500]
163167
164168 for ($ c = 0 ; $ c < $ bTensor ->shape ()[0 ]; ++$ c ) {
165169 /** @var Tensor $cTensor */
166170 $ cTensor = $ bTensor [$ c ]; // [1500]
167- // for ($d = 0; $d < count($cTensor->buffer()); ++$d) {
168- // $cTensor->buffer()[$d] = ($cTensor->buffer()[$d] - $meanTensor->buffer()[$d]) / $stdTensor->buffer()[$d];
169- // }
170- $ cTensor = $ cTensor ->add ($ meanTensor ->multiply (-1 ))->multiply ($ stdTensor ->reciprocal ());
171+
172+ $ cTensor
173+ ->add ($ meanTensor ->multiply (-1 ))
174+ ->multiply ($ stdTensor ->reciprocal ())
175+ ->copyTo ($ cTensor );
171176
172177 // Apply median filter.
173- $ cTensor = $ this ->medianFilter ($ cTensor , $ medianFilterWidth );
178+ $ this ->medianFilter ($ cTensor , $ medianFilterWidth)-> copyTo ( $ cTensor );
174179 }
175180 }
176181 }
@@ -189,7 +194,7 @@ public function extractTokenTimestamps(
189194 // NOTE: Since we run only one batch at a time, we can squeeze to get the same dimensions
190195 // as the python implementation
191196 $ matrix = $ batchedMatrices [$ batchIdx ]->multiply (-1 )->squeeze (0 );
192- list ( $ textIndices , $ timeIndices) = $ this ->dynamicTimeWarping ($ matrix );
197+ [ $ textIndices , $ timeIndices] = $ this ->dynamicTimeWarping ($ matrix );
193198
194199 $ diffs = array_map (fn ($ i ) => $ textIndices [$ i + 1 ] - $ textIndices [$ i ], range (0 , count ($ textIndices ) - 2 ));
195200 $ jumps = array_map (fn ($ x ) => (bool )$ x , array_merge ([1 ], $ diffs ));
@@ -198,10 +203,12 @@ public function extractTokenTimestamps(
198203 for ($ i = 0 ; $ i < count ($ jumps ); ++$ i ) {
199204 if ($ jumps [$ i ]) {
200205 $ jumpTimes [] = $ timeIndices [$ i ] * $ timePrecision ;
201- // NOTE: No point in rounding here, since we set to Float32Array later
202206 }
203207 }
204- $ timestamps ->buffer ()[$ batchIdx ] = array_merge ([0 ], $ jumpTimes );
208+ // $timestamps->buffer()[$batchIdx] = array_merge([0.0], $jumpTimes);
209+ for ($ i = 0 ; $ i < count ($ jumpTimes ); ++$ i ) {
210+ $ timestamps [$ batchIdx ][$ i ] = $ jumpTimes [$ i ];
211+ }
205212 }
206213
207214 return $ timestamps ;
0 commit comments