Fix bug with attention mask shape and refactor Tensor for more consistent naming

CodeWithKyrian · CodeWithKyrian · commit 3b2bcd608b81 · 2024-04-20T12:00:13.000+01:00
- Fixed the bug where the initial beam required the past key values to be tensors of size 0 in the 3rd dimension.
- Renamed methods and parameters in Tensor for more consistent naming: dims -> shape
diff --git a/examples/pipelines/text-generation.php b/examples/pipelines/text-generation.php
@@ -11,14 +11,14 @@
 
 ini_set('memory_limit', -1);
 //
+//$generator = pipeline('text-generation', 'Xenova/gpt2');
 $generator = pipeline('text-generation', 'Xenova/Qwen1.5-0.5B-Chat');
 
 $streamer = StdOutStreamer::make($generator->tokenizer);
 
 $messages = [
-    ['role' => 'user', 'content' => 'Hello!'],
-    ['role' => 'assistant', 'content' => 'Hi! How are you?'],
-    ['role' => 'user', 'content' => 'I am doing great. What about you?'],
+    ['role' => 'system', 'content' => 'You are a helpful assistant.'],
+    ['role' => 'user', 'content' => 'Who are you'],
 ];
 
 $input = $generator->tokenizer->applyChatTemplate($messages, addGenerationPrompt: true, tokenize: false);
@@ -27,9 +27,9 @@
     streamer: $streamer,
     maxNewTokens: 128,
     doSample: true,
-    temperature: 0.7,
-    repetitionPenalty: 1.3,
-    earlyStopping: true
+//    temperature: 0.7,
+//    repetitionPenalty: 1.3,
+//    earlyStopping: true
 );
 
 //$generator = pipeline('text-generation', 'Xenova/codegen-350M-mono');
diff --git a/src/Models/ModelArchitecture.php b/src/Models/ModelArchitecture.php
@@ -9,6 +9,7 @@
 use Codewithkyrian\Transformers\Models\Pretrained\PretrainedModel;
 use Codewithkyrian\Transformers\Utils\GenerationConfig;
 use Codewithkyrian\Transformers\Utils\Tensor;
+use Interop\Polite\Math\Matrix\NDArray;
 
 enum ModelArchitecture: string
 {
@@ -34,7 +35,7 @@ public function runBeam(PretrainedModel $model, array &$beam): array
     {
         return match ($this) {
             self::DecoderOnly => $this->decoderRunBeam($model, $beam),
-            self::Seq2SeqLM, self::Vision2Seq  => $this->seq2seqRunBeam($model, $beam),
+            self::Seq2SeqLM, self::Vision2Seq => $this->seq2seqRunBeam($model, $beam),
             default => throw new \Error('This model type does not support beam search'),
         };
     }
@@ -114,10 +115,11 @@ protected function decoderRunBeam(PretrainedModel $model, array &$beam): array
         // 1. Prepare
         $modelInputs = [
             'input_ids' => $beam['model_input_ids'],
-            'attention_mask' => new Tensor($attnMaskData, shape: [1, $attnMaskLength]),
+            'attention_mask' => new Tensor($attnMaskData, NDArray::int64, [1, $attnMaskLength]),
             'past_key_values' => $beam['prev_model_outputs']['past_key_values'] ?? null,
         ];
 
+
         // 2. Run
         $output = $model->forward($modelInputs);
 
@@ -155,7 +157,7 @@ protected function decoderStartBeams(
             $attnMask = null;
             if ($inputsAttentionMask !== null) {
                 $attnMask = $inputsAttentionMask[$beamId];
-                $attnMask->reshape([1, ...$attnMask->shape()]);
+                $attnMask = $attnMask->reshape([1, ...$attnMask->shape()]);
             } else {
                 $attnMask = $model->prepareAttentionMask($tokens);
             }
@@ -189,8 +191,7 @@ protected function decoderStartBeams(
     protected function decoderUpdatebeam(array &$beam, int $newTokenId): void
     {
         $beam['output_token_ids'][] = $newTokenId;
-
-        $beam['model_input_ids'] = new Tensor([$newTokenId], shape: [1, 1]);
+        $beam['model_input_ids'] = new Tensor([$newTokenId], NDArray::int64, [1, 1]);
     }
 
     /**
@@ -221,6 +222,14 @@ protected function decoderForward(PretrainedModel $model, array $modelInputs): a
         $model->preparePositionIds($inputNames, $decoderFeeds, $useCacheBranch);
         $model->addPastKeyValues($decoderFeeds, $pastKeyValues);
 
+        // The initial past key values should have size 0 along the sequence-length dimension.
+        // However, I haven't found a way to pass a tensor with a zero-sized dimension to the
+        // model, so a sequence length of 1 is used for the first step, and the attention mask
+        // length is offset by 1 for the subsequent steps. This is a workaround for now.
+        $prevSequenceLength = $decoderFeeds['past_key_values.0.key']->shape()[2];
+        $attnMaskLength = $prevSequenceLength == 1 ? 1 : $prevSequenceLength + 1;
+        $decoderFeeds['attention_mask'] = Tensor::ones([1, $attnMaskLength], dtype: NDArray::int64);
+
         $decoderResults = $model->runSession($model->session, $decoderFeeds);
 
         $logits = $decoderResults['logits'];
diff --git a/src/Models/Pretrained/PretrainedModel.php b/src/Models/Pretrained/PretrainedModel.php
@@ -263,7 +263,7 @@ public static function constructSession(
     }
 
     /**
-     * @param InferenceSession $session
+     * @param array $inputNames
      * @param Tensor[] $inputs
      * @return Tensor[]
      * @throws MissingModelInputException
@@ -318,8 +318,6 @@ public function runSession(InferenceSession $session, array $inputs): array
 
             $outputNames = array_column($session->outputs(), 'name');
 
-            file_put_contents('inputs.json', json_encode($inputs));
-
             $outputs = $session->run($outputNames, $inputs);
 
             return array_combine($outputNames, array_map([Tensor::class, 'fromArray'], $outputs));
@@ -495,7 +493,6 @@ public function preparePositionIds(array $inputNames, array &$feeds, bool $useCa
         $feeds['position_ids'] = new Tensor($data, shape: $feeds['attention_mask']->shape());
 
         if ($useCacheBranch) {
-            // TODO: Fix this
             $feeds['position_ids'] = $feeds['position_ids']->slice(null, -1)->unsqueeze(-1);
         }
     }
@@ -677,8 +674,10 @@ public function addPastKeyValues(array &$decoderFeeds, ?array $pastKeyValues): v
      * @param Tensor $inputs The input token ids.
      * @param GenerationConfig|null $generationConfig The generation configuration to use. If null, default configuration will be used.
      * @param LogitsProcessorList|null $logitsProcessor An optional logits processor to use. If null, a new LogitsProcessorList instance will be created.
-     * @param array|null $inputsAttentionMask An optional attention mask for the inputs.
+     * @param Tensor|null $inputsAttentionMask An optional attention mask for the inputs.
+     * @param Streamer|null $streamer
      * @return array An array of generated output sequences, where each sequence is an array of token IDs.
+     * @throws Exception
      */
     public function generate(
         Tensor               $inputs,
@@ -793,7 +792,6 @@ public function generate(
 
                     // update new beam
                     $this->updateBeam($newBeam, $newTokenId);
-
                     $newBeam['score'] += $logProb;
 
                     if ($eosTokenIds && in_array($newTokenId, $eosTokenIds, true)) {
@@ -812,16 +810,11 @@ public function generate(
             $newestBeams = array_merge(...array_map(
                 function ($group) use ($generationConfig) {
                     usort($group, fn($a, $b) => $b['score'] <=> $a['score']);
-                    return array_slice(
-                        $group,
-                        0,
-                        $generationConfig->num_beams
-                    );
+                    return array_slice($group, 0, $generationConfig->num_beams);
                 },
                 $this->groupBeams($newestBeams)
             ));
 
-
             // Flatten beams
             $beams = $newestBeams;
 
diff --git a/src/Pipelines/TextGenerationPipeline.php b/src/Pipelines/TextGenerationPipeline.php
@@ -90,7 +90,12 @@ public function __invoke(array|string $inputs, ...$args): array
             truncation: true
         );
 
-        $outputTokenIds = $this->model->generate($inputIds, generationConfig: $generationConfig, streamer: $streamer);
+        $outputTokenIds = $this->model->generate(
+            $inputIds,
+            generationConfig: $generationConfig,
+            inputsAttentionMask: $attentionMask,
+            streamer: $streamer
+        );
 
         $decoded = $this->tokenizer->batchDecode($outputTokenIds, skipSpecialTokens: true);
 
diff --git a/src/Utils/Tensor.php b/src/Utils/Tensor.php
@@ -261,10 +261,10 @@ public static function onesLike(Tensor $other): static
      * Return a one matrix with the given dimensions.
      *
      * @param array $shape The shape of the one matrix to return.
-     * @param string|null $dtype The data type of the one matrix to return. Eg: float32, int32, etc. If null, defaults to float32.
+     * @param ?int $dtype The NDArray dtype constant of the one matrix to return, e.g. NDArray::int64. If null, defaults to float32.
      * @return static
      */
-    public static function ones(array $shape, ?string $dtype = null): static
+    public static function ones(array $shape, ?int $dtype = null): static
     {
         $mo = self::getMo();
 
@@ -303,7 +303,7 @@ public static function fromArray(array|NDArray $array, ?string $dtype = null, $s
     /**
      * Reshape the tensor into the given shape.
      */
-    public function reshape(array $shape): NDArray
+    public function reshape(array $shape): static
     {
         $this->assertShape($shape);
 
@@ -443,37 +443,37 @@ public function count(): int
     }
 
     /**
-     * Returns a tensor with all specified dimensions of input of size 1 removed.
+     * Returns a new tensor over the same buffer, dtype and offset, with its shape recomputed by calcUnsqueezeShape() for the given axis.
      *
-     * @param ?int $dim If given, the input will be squeezed only in the specified dimensions.
+     * @param ?int $axis The axis to unsqueeze. NOTE(review): calcUnsqueezeShape() declares a non-nullable int $axis, so the null default looks unusable - confirm.
      *
      * @return static The squeezed tensor.
      */
-    public function unsqueeze(?int $dim = null): static
+    public function unsqueeze(?int $axis = null): static
     {
         return new Tensor(
             $this->buffer(),
             $this->dtype,
-            $this->calcUnsqueezeDims($this->shape(), $dim),
+            $this->calcUnsqueezeShape($this->shape(), $axis),
             $this->offset
         );
     }
 
     /**
-     * Helper function to calculate new dimensions when performing an unsqueeze operation.
-     * @param array $dims The dimensions of the tensor.
-     * @param int $dim The dimension to unsqueeze.
-     * @return array The new dimensions.
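+     * Helper function to calculate new shape when performing an unsqueeze operation. NOTE(review): this sets the entry at $axis - 1 to 1 instead of inserting a new axis - confirm this is intended.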
+     * @param array $shape The shape of the tensor.
+     * @param int $axis The axis to unsqueeze.
+     * @return array The new shape.
      */
-    protected function calcUnsqueezeDims(array $dims, int $dim): array
+    protected function calcUnsqueezeShape(array $shape, int $axis): array
     {
         // Dimension out of range (e.g., "expected to be in range of [-4, 3], but got 4")
         // + 1 since we allow inserting at the end (i.e. dim = -1)
-        $dim = self::safeIndex($dim, count($dims) + 1);
-        $newDims = $dims;
-        // Insert 1 into specified dimension
-        array_splice($newDims, $dim, 0, [1]);
-        return $newDims;
+        $axis = self::safeIndex($axis, count($shape) + 1);
+
+        $shape[$axis - 1] = 1;
+
+        return $shape;
     }
 
     /**
@@ -605,11 +605,11 @@ public function normalize(int $p = 2, ?int $dim = null): static
      *
      * @param int $ord Order of the norm. Supported values are 1, 2, Infinity.
      * @param int|null $axis The axis or axes along which to perform the reduction. If null (default), reduces all dimensions.
-     * @param bool $keepdims If true, retains reduced dimensions with length 1.
+     * @param bool $keepShape If true, retains reduced shape with length 1.
      *
      * @return static
      */
-    public function norm(int $ord = 2, ?int $axis = null, bool $keepdims = false): static
+    public function norm(int $ord = 2, ?int $axis = null, bool $keepShape = false): static
     {
         $mo = self::getMo();
 
@@ -623,8 +623,8 @@ public function norm(int $ord = 2, ?int $axis = null, bool $keepdims = false): s
         $axis = $this->safeIndex($axis, $this->ndim());
 
         // Calculate the shape of the resulting array after summation
-        $resultDims = $this->shape();
-        $resultDims[$axis] = 1; // Remove the specified axis
+        $resultShape = $this->shape();
+        $resultShape[$axis] = 1; // Remove the specified axis
 
         // Create a new array to store the accumulated values
         $result = $this->zeros([count($this->buffer) / $this->shape()[$axis]]);
@@ -642,7 +642,7 @@ public function norm(int $ord = 2, ?int $axis = null, bool $keepdims = false): s
                 if ($j !== $axis) {
                     $index = $num % $size;
                     $resultIndex += $index * $resultMultiplier;
-                    $resultMultiplier *= $resultDims[$j];
+                    $resultMultiplier *= $resultShape[$j];
                 }
 
                 $num = floor($num / $size);
@@ -656,11 +656,11 @@ public function norm(int $ord = 2, ?int $axis = null, bool $keepdims = false): s
             $result = $mo->op($result, '**', 1 / $ord);
         }
 
-        if (!$keepdims) {
-            array_splice($resultDims, $axis, 1);
+        if (!$keepShape) {
+            array_splice($resultShape, $axis, 1);
         }
 
-        return new static($result->buffer(), $result->dtype(), $resultDims, $result->offset());
+        return new static($result->buffer(), $result->dtype(), $resultShape, $result->offset());
     }
 
     /**
@@ -794,7 +794,7 @@ public function to(int $dtype): static
     /**
      * Returns the mean value of each row of the tensor in the given dimension dim.
      */
-    public function mean(?int $axis = null, bool $keepdims = false): static|float|int
+    public function mean(?int $axis = null, bool $keepShape = false): static|float|int
     {
         $mo = self::getMo();
 
@@ -803,7 +803,7 @@ public function mean(?int $axis = null, bool $keepdims = false): static|float|in
         if ($mean instanceof NDArray) {
             $shape = $mean->shape();
 
-            if (!$keepdims) {
+            if (!$keepShape) {
                 array_splice($shape, $axis, 1);
             }
 
@@ -858,15 +858,15 @@ public function meanPooling(Tensor $other): Tensor
 
     public function slice(...$slices): Tensor
     {
-        $newTensorDims = [];
+        $newTensorShape = [];
         $newOffsets = [];
 
         for ($sliceIndex = 0; $sliceIndex < $this->ndim(); ++$sliceIndex) {
             $slice = $slices[$sliceIndex] ?? null;
 
             if ($slice === null) {
                 $newOffsets[] = [0, $this->shape()[$sliceIndex]];
-                $newTensorDims[] = $this->shape()[$sliceIndex];
+                $newTensorShape[] = $this->shape()[$sliceIndex];
 
             } elseif (is_int($slice)) {
                 $slice = $this->safeIndex($slice, $this->shape()[$sliceIndex], $sliceIndex);
@@ -881,31 +881,31 @@ public function slice(...$slices): Tensor
                     min($slice[1], $this->shape()[$sliceIndex])
                 ];
                 $newOffsets[] = $offsets;
-                $newTensorDims[] = $offsets[1] - $offsets[0];
+                $newTensorShape[] = $offsets[1] - $offsets[0];
 
             } else {
                 throw new Exception("Invalid slice: " . json_encode($slice));
             }
         }
 
-        $newDims = array_map(fn($offsets) => $offsets[1] - $offsets[0], $newOffsets);
+        $newShape = array_map(fn($offsets) => $offsets[1] - $offsets[0], $newOffsets);
 
-        $newBufferSize = array_reduce($newDims, fn($a, $b) => $a * $b, 1);
+        $newBufferSize = array_reduce($newShape, fn($a, $b) => $a * $b, 1);
 
         $buffer = $this->newBuffer($newBufferSize, $this->dtype());
         $stride = $this->stride();
 
         for ($i = 0; $i < $newBufferSize; ++$i) {
             $originalIndex = 0;
-            for ($j = count($newDims) - 1, $num = $i; $j >= 0; --$j) {
-                $size = $newDims[$j];
+            for ($j = count($newShape) - 1, $num = $i; $j >= 0; --$j) {
+                $size = $newShape[$j];
                 $originalIndex += (($num % $size) + $newOffsets[$j][0]) * $stride[$j];
                 $num = floor($num / $size);
             }
             $buffer[$i] = $this->buffer[$originalIndex];
         }
 
-        return new Tensor($buffer, $this->dtype(), $newDims, $this->offset());
+        return new Tensor($buffer, $this->dtype(), $newShape, $this->offset());
     }
 
     /**