Skip to content

Commit 7f48433

Browse files
feat: enhance BPEModel to support new merge format and improve ranks mapping
- Added support for the new merge format in BPEModel, allowing for direct usage of merges as arrays. - Improved the creation of the ranks map by using JSON-encoded pairs as keys for better compatibility and performance.
1 parent 894bc90 commit 7f48433

3 files changed

Lines changed: 36 additions & 10 deletions

File tree

bin/transformers

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ $application = new Application();
1212
try {
1313
$application->setName('Transformers PHP CLI');
1414

15-
$application->add(new Codewithkyrian\Transformers\Commands\DownloadModelCommand());
15+
$application->addCommand(new Codewithkyrian\Transformers\Commands\DownloadModelCommand());
1616

1717
$application->run();
1818
} catch (Exception $e) {

src/PreTrainedTokenizers/PreTrainedTokenizer.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -653,7 +653,6 @@ public function applyChatTemplate(
653653
bool $returnTensor = true
654654
): string|array {
655655
$chatTemplate ??= $this->chatTemplate ?? $this->getDefaultChatTemplate();
656-
dd($chatTemplate);
657656

658657
// Compilation function uses a cache to avoid recompiling the same template
659658
$compiledTemplate = $this->compiledTemplateCache[$chatTemplate] ?? null;

src/Tokenizers/BPEModel.php

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,14 @@
88

99
/**
1010
* BPE class for encoding text into Byte-Pair-Encoding (BPE) tokens.
11+
*
12+
* Byte-Pair Encoding (BPE) is a subword tokenization technique that iteratively merges the most frequent pair of adjacent subwords in a vocabulary.
13+
* This process continues until a desired vocabulary size is reached.
14+
*
15+
* The algorithm works as follows:
16+
* 1. Initialize the vocabulary with the individual characters (or bytes) that occur in the training corpus.
17+
* 2. Iteratively merge the most frequent pair of adjacent subwords in the vocabulary.
18+
* 3. Continue until the desired vocabulary size is reached.
1119
*/
1220
class BPEModel extends TokenizerModel
1321
{
@@ -53,9 +61,24 @@ public function __construct(array $config)
5361

5462
$this->vocab = array_flip($vocab);
5563

56-
$this->bpeRanks = array_flip($config['merges']);
64+
// Check if using new merge format (Tokenizers >= 0.20.0)
65+
$useNewMergeFormat = is_array($config['merges'][0]);
5766

58-
$this->merges = array_map(fn($merge) => explode(' ', $merge), $config['merges']);
67+
if ($useNewMergeFormat) {
68+
$this->merges = $config['merges'];
69+
} else {
70+
$this->merges = array_map(
71+
fn($merge) => explode(' ', $merge, 2),
72+
$config['merges']
73+
);
74+
}
75+
76+
$this->bpeRanks = [];
77+
78+
foreach ($this->merges as $i => $pair) {
79+
$key = json_encode($pair);
80+
$this->bpeRanks[$key] = $i;
81+
}
5982

6083
$this->endOfWordSuffix = $config['end_of_word_suffix'] ?? null;
6184
$this->continuingSubwordSuffix = $config['continuing_subword_suffix'] ?? null;
@@ -186,15 +209,19 @@ protected function bpe(string $token): array
186209
return $result;
187210
}
188211

189-
212+
/**
213+
* Add a node to the priority queue.
214+
*
215+
* `score` is a measure of the merge priority: lower means higher priority.
216+
* We use the BPE rank as a measure of priority (i.e., the position of the merge in the merges list).
217+
* We also add a fractional component to the score to break ties (with the earlier character having higher priority)
218+
*/
190219
public function addNodeToQueue(SplPriorityQueue $queue, BPENode $node): void
191220
{
192-
// `score` is a measure of the merge priority: lower means higher priority.
193-
// We use the BPE rank as a measure of priority (i.e., the local of the merge in the merges list)
194-
// We also add a fractional component to the score to break ties (with the earlier character having higher priority)
195-
$rank = $this->bpeRanks[$node->token . self::BPE_SPLIT_TOKEN . $node->next?->token] ?? null;
221+
$pairKey = json_encode([$node->token, $node->next->token]);
222+
$rank = $this->bpeRanks[$pairKey] ?? null;
196223

197-
if ($rank !== null) {
224+
if ($rank) {
198225
$node->score = - ($rank + $node->bias);
199226
$queue->insert($node, $node->score);
200227
}

0 commit comments

Comments
 (0)