Skip to content

Commit 970b529

Browse files
authored
Fix whitespace replacement bug (#24)
1 parent 9376d50 commit 970b529

2 files changed

Lines changed: 11 additions & 2 deletions

File tree

jetstream/engine/token_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ def mix_decode(vocab: Vocabulary, tok_id: int):
3636
utilize IdToPiece to convert it into a string, likely with a space placeholder (' ') for the corresponding tokens.
3737
"""
3838
p_token = vocab.tokenizer.IdToPiece(tok_id)
39-
p_token = p_token.replace('▁', ' ').replace('_', ' ')
39+
# SentencePiece escapes the whitespace with a meta symbol "▁" (U+2581)
40+
p_token = p_token.replace('▁', ' ')
4041
d_token = vocab.tokenizer.decode([tok_id])
4142
return p_token if p_token.lstrip() == d_token else d_token
4243

jetstream/tests/engine/test_token_utils.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def __init__(self, tokenizer_path: str):
2828

2929
def decode(self, t: int) -> str:
3030
token = self.vocab.tokenizer.IdToPiece(t)
31-
token = token.replace('▁', ' ').replace('_', ' ')
31+
token = token.replace('▁', ' ')
3232
return token
3333

3434

@@ -77,6 +77,14 @@ def test_sp_vs_seqio(self):
7777
seqio_t = self.jt_tokenizer.vocab.tokenizer.decode([n])
7878
self.assertEqual(sp_t, seqio_t)
7979

80+
def test_underscore_in_output(self):
81+
self.setup()
82+
n = 21326
83+
mix_output = token_utils.mix_decode(vocab = self.jt_tokenizer.vocab, tok_id = n)
84+
decode_output = self.sp_tokenizer.decode([n])
85+
self.assertEqual(mix_output, " `__")
86+
self.assertEqual(mix_output.lstrip(), decode_output)
87+
8088

8189
if __name__ == '__main__':
8290
unittest.main()

0 commit comments

Comments
 (0)