@@ -8,6 +8,11 @@ use regex_automata::{
     Anchored, Input,
 };
 
+pub mod normalizer;
+
+pub use bpe::*;
+pub use normalizer::{Normalizable, NormalizedString};
+
 // Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
 // The look-ahead character is dropped from the match by the Pretokenizer iterator.
 // Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
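
The pseudo look-ahead is easiest to see on a concrete input. A minimal sketch, assuming the crate name `bpe_openai` and leftmost-first alternation over the three patterns; the exact piece boundaries follow from the reasoning in the comments, not from a documented guarantee:

```rust
use bpe_openai::cl100k_base;

fn main() {
    // For "  a", pat1 has no match at offset 0, so pat2 (`\s+\s`) matches
    // both spaces. Because pat2 is flagged as a look-ahead pattern, its
    // final character is dropped from the piece and re-matched, yielding a
    // single-space piece followed by " a" (matched by pat1).
    let pieces: Vec<&str> = cl100k_base().split("  a").collect();
    assert_eq!(pieces, vec![" ", " a"]);
}
```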
@@ -18,7 +23,7 @@ static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], false)
         .expect("valid regex")
 });
 
@@ -35,11 +40,19 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
     ].join("|");
     let pat2 = "\\s+\\s";
     let pat3 = "\\s+";
-    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
+    Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)], false)
         .expect("valid regex")
 });
 
-pub use bpe::*;
+static BPE_VOYAGE3_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_voyage3_base.dict"));
+    let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
+    let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
+    let pat2 = "\\s+\\s";
+    let pat3 = "\\s+";
+    Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], true)
+        .expect("valid regex")
+});
 
 /// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
 /// The direct methods on this type pre-tokenize the input text and should
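
The new trailing `bool` on the constructors selects NFC normalization; the Voyage tokenizer is the only built-in that enables it. A hedged sketch of using the extended `Tokenizer::new` signature for a custom tokenizer, assuming `BytePairEncoding` is reachable at the path the tests below use:

```rust
use bpe::byte_pair_encoding::BytePairEncoding;
use bpe_openai::Tokenizer;

// Builds a tokenizer without a pre-tokenization regex that NFC-normalizes
// its input; with `pat == None` there is no regex compilation to fail.
fn nfc_tokenizer(bpe: BytePairEncoding) -> Tokenizer {
    Tokenizer::new(bpe, None, /* nfc */ true).expect("no pattern to compile")
}
```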
@@ -52,6 +65,8 @@ pub struct Tokenizer {
     pub bpe: BytePairEncoding,
     /// The pattern regex used to split the input.
     pub pre: Option<Pretokenizer>,
+    /// Indicates whether the input should be normalized with NFC.
+    nfc: bool,
 }
 
 pub struct Pretokenizer {
@@ -64,9 +79,9 @@ pub struct Pretokenizer {
 impl Tokenizer {
     /// Build a tokenizer with an optional pretokenization regex pattern.
     #[allow(clippy::result_large_err)]
-    pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
+    pub fn new(bpe: BytePairEncoding, pat: Option<&str>, nfc: bool) -> Result<Self, BuildError> {
         let pre = pat.map(Pretokenizer::new).transpose()?;
-        Ok(Self { bpe, pre })
+        Ok(Self { nfc, bpe, pre })
     }
 
     /// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
@@ -75,34 +90,41 @@ impl Tokenizer {
     pub fn new_lookahead(
         bpe: BytePairEncoding,
         patterns: &[(&str, bool)],
+        nfc: bool,
     ) -> Result<Self, BuildError> {
         let pre = Some(Pretokenizer::new_lookahead(patterns)?);
-        Ok(Self { bpe, pre })
+        Ok(Self { nfc, bpe, pre })
     }
 
     /// Count the number of tokens produced when encoding the text. Applies pre-tokenization
     /// before counting.
-    pub fn count(&self, text: &str) -> usize {
-        self.split(text)
+    pub fn count<'a, I: Normalizable<'a>>(&self, text: I) -> usize {
+        let text = self.normalize(text);
+        self.split(text.as_str())
             .map(|piece| self.bpe.count(piece.as_bytes()))
             .sum()
     }
 
     /// Returns the token count iff the total token count stays below the specified token_limit.
     /// Otherwise, it returns none. This function can be faster than [`Self::count`] when the
     /// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
-    pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
-        self.split(text).try_fold(0, |consumed, piece| {
+    ///
+    /// Note: This function assumes that the text is already normalized, so that this function can run
+    /// in roughly O(token_limit) time.
+    pub fn count_till_limit(&self, text: &NormalizedString, token_limit: usize) -> Option<usize> {
+        let res: Option<usize> = self.split(text.as_str()).try_fold(0, |consumed, piece| {
             self.bpe
                 .count_till_limit(piece.as_bytes(), token_limit - consumed)
                 .map(|piece_count| consumed + piece_count)
-        })
+        });
+        res
     }
 
     /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
     /// encoding.
-    pub fn encode(&self, text: &str) -> Vec<u32> {
-        self.split(text)
+    pub fn encode<'a, I: Normalizable<'a>>(&self, text: I) -> Vec<u32> {
+        let text: NormalizedString<'_> = self.normalize(text);
+        self.split(text.as_str())
             .flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
             .collect()
     }
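
With the `Normalizable` bound, callers can pass a plain `&str` (normalized on the fly) or normalize once and reuse the result for the limit-bounded count. A usage sketch, assuming `Normalizable` is implemented for `&str` as the tests below suggest:

```rust
use bpe_openai::cl100k_base;

fn main() {
    let tok = cl100k_base();
    // Plain &str: normalization happens internally (a no-op here, since
    // cl100k_base is constructed with nfc = false).
    let total = tok.count("Hello, world!");
    // Normalize once up front, then run the O(token_limit) bounded count.
    let text = tok.normalize("Hello, world!");
    assert_eq!(tok.count_till_limit(&text, total), Some(total));
}
```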
@@ -114,12 +136,18 @@ impl Tokenizer {
 
     /// Returns an iterator with the text pieces resulting from pre-tokenization. If this
     /// tokenizer does not have pre-tokenization, the iterator returns the full text.
-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
         match &self.pre {
             Some(pre) => Either::Left(pre.split(text)),
             None => Either::Right(std::iter::once(text)),
         }
     }
+
+    /// Returns the normalized text if the tokenizer requires normalization.
+    /// If the input was already normalized, this function is a no-op.
+    pub fn normalize<'a, I: Normalizable<'a>>(&self, text: I) -> NormalizedString<'a> {
+        text.normalize(self.nfc)
+    }
 }
 
 impl Pretokenizer {
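
The `normalizer` module itself is not part of this diff. For orientation only, a minimal sketch of what `Normalizable`/`NormalizedString` could look like, assuming a `Cow`-based design and the `unicode-normalization` crate; the real definitions may differ:

```rust
use std::borrow::Cow;
use unicode_normalization::{is_nfc_quick, IsNormalized, UnicodeNormalization};

/// Wrapper witnessing that the contained text is already normalized
/// (or needs no normalization), so downstream calls can skip the check.
pub struct NormalizedString<'a>(Cow<'a, str>);

impl NormalizedString<'_> {
    pub fn as_str(&self) -> &str {
        &self.0
    }
}

pub trait Normalizable<'a> {
    fn normalize(self, nfc: bool) -> NormalizedString<'a>;
}

impl<'a> Normalizable<'a> for &'a str {
    fn normalize(self, nfc: bool) -> NormalizedString<'a> {
        // Borrow when the input is already NFC (or NFC is not required);
        // allocate only when recomposition is actually needed.
        if !nfc || is_nfc_quick(self.chars()) == IsNormalized::Yes {
            NormalizedString(Cow::Borrowed(self))
        } else {
            NormalizedString(Cow::Owned(self.nfc().collect()))
        }
    }
}
```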
@@ -143,7 +171,7 @@ impl Pretokenizer {
     }
 
     /// Returns an iterator with the text pieces after splitting with the regular expression.
-    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
+    pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
         Splits {
             pat: &self.pat,
             lookahead: &self.lookahead,
@@ -201,6 +229,10 @@ pub fn o200k_base() -> &'static Tokenizer {
     &BPE_O200K_BASE
 }
 
+pub fn voyage3_base() -> &'static Tokenizer {
+    &BPE_VOYAGE3_BASE
+}
+
 #[cfg(test)]
 mod tests {
     use bpe::byte_pair_encoding::{create_test_string, select_test_string};
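
Since the Voyage tokenizer is built with `nfc = true`, decomposed and precomposed input should encode identically. A hedged sketch (crate name `bpe_openai` assumed):

```rust
use bpe_openai::voyage3_base;

fn main() {
    let tok = voyage3_base();
    let decomposed = "e\u{0301}"; // 'e' + COMBINING ACUTE ACCENT
    let precomposed = "\u{00e9}"; // precomposed 'é'
    // NFC normalization maps both spellings to the same byte sequence
    // before pre-tokenization and encoding.
    assert_eq!(tok.encode(decomposed), tok.encode(precomposed));
}
```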
@@ -233,9 +265,21 @@ mod tests {
 
     #[test]
     fn test_count_till_limit() {
-        assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1));
-        assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2));
-        assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3));
-        assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None);
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abc"), 3),
+            Some(1)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabc"), 3),
+            Some(2)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabc"), 3),
+            Some(3)
+        );
+        assert_eq!(
+            cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabcabc"), 3),
+            None
+        );
     }
 }