|
8 | 8 |
|
9 | 9 | /** |
10 | 10 | * BPE class for encoding text into Byte-Pair-Encoding (BPE) tokens. |
| 11 | + * |
| 12 | + * Byte-Pair Encoding (BPE) is a subword tokenization technique that iteratively merges the most frequent pair of adjacent subwords in a vocabulary. |
| 13 | + * This process continues until a desired vocabulary size is reached. |
| 14 | + * |
| 15 | + * The algorithm works as follows: |
| 16 | + * 1. Initialize the vocabulary with the individual characters (or bytes) that occur in the training data. |
| 17 | + * 2. Iteratively merge the most frequent pair of adjacent subwords in the vocabulary. |
| 18 | + * 3. Continue until the desired vocabulary size is reached. |
11 | 19 | */ |
12 | 20 | class BPEModel extends TokenizerModel |
13 | 21 | { |
@@ -53,9 +61,24 @@ public function __construct(array $config) |
53 | 61 |
|
54 | 62 | $this->vocab = array_flip($vocab); |
55 | 63 |
|
56 | | - $this->bpeRanks = array_flip($config['merges']); |
| 64 | + // Check if using new merge format (Tokenizers >= 0.20.0) |
| 65 | + $useNewMergeFormat = is_array($config['merges'][0]); |
57 | 66 |
|
58 | | - $this->merges = array_map(fn($merge) => explode(' ', $merge), $config['merges']); |
| 67 | + if ($useNewMergeFormat) { |
| 68 | + $this->merges = $config['merges']; |
| 69 | + } else { |
| 70 | + $this->merges = array_map( |
| 71 | + fn($merge) => explode(' ', $merge, 2), |
| 72 | + $config['merges'] |
| 73 | + ); |
| 74 | + } |
| 75 | + |
| 76 | + $this->bpeRanks = []; |
| 77 | + |
| 78 | + foreach ($this->merges as $i => $pair) { |
| 79 | + $key = json_encode($pair); |
| 80 | + $this->bpeRanks[$key] = $i; |
| 81 | + } |
59 | 82 |
|
60 | 83 | $this->endOfWordSuffix = $config['end_of_word_suffix'] ?? null; |
61 | 84 | $this->continuingSubwordSuffix = $config['continuing_subword_suffix'] ?? null; |
@@ -186,15 +209,19 @@ protected function bpe(string $token): array |
186 | 209 | return $result; |
187 | 210 | } |
188 | 211 |
|
189 | | - |
| 212 | + /** |
| 213 | + * Add a node to the priority queue. |
| 214 | + * |
| 215 | + * `score` is a measure of the merge priority: lower means higher priority. |
| 216 | + * We use the BPE rank as a measure of priority (i.e., the position of the merge in the merges list). |
| 217 | + * We also add a fractional component to the score to break ties (with the earlier character having higher priority) |
| 218 | + */ |
190 | 219 | public function addNodeToQueue(SplPriorityQueue $queue, BPENode $node): void |
191 | 220 | { |
192 | | - // `score` is a measure of the merge priority: lower means higher priority. |
193 | | - // We use the BPE rank as a measure of priority (i.e., the local of the merge in the merges list) |
194 | | - // We also add a fractional component to the score to break ties (with the earlier character having higher priority) |
195 | | - $rank = $this->bpeRanks[$node->token . self::BPE_SPLIT_TOKEN . $node->next?->token] ?? null; |
| 221 | + $pairKey = json_encode([$node->token, $node->next->token]); |
| 222 | + $rank = $this->bpeRanks[$pairKey] ?? null; |
196 | 223 |
|
197 | | - if ($rank !== null) { |
| 224 | + if ($rank) { |
198 | 225 | $node->score = - ($rank + $node->bias); |
199 | 226 | $queue->insert($node, $node->score); |
200 | 227 | } |
|
0 commit comments