Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 08:45:38 +00:00)
Decode stream python (#1678)
* Python binding for decode stream. Different API because Python cannot handle lifetimes properly.
* Clippy.
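For orientation, a minimal usage sketch of the new Python API (not part of this commit; it rebuilds the toy added-tokens tokenizer used by the tests below): a DecodeStream is created once, and step() is fed one token id at a time, returning the newly decoded chunk of text, or None when no complete text is available yet.

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.decoders import DecodeStream

    # Toy tokenizer mirroring the one built in the tests below.
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    stream = DecodeStream(skip_special_tokens=False)
    text = ""
    for token_id in [0, 1, 2, 3]:
        chunk = stream.step(tokenizer, token_id)  # str or None
        if chunk is not None:
            text += chunk
    print(text)  # "my name is john"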
@@ -9,6 +9,7 @@ from tokenizers.models import BPE, Model, Unigram
 from tokenizers.pre_tokenizers import ByteLevel, Metaspace
 from tokenizers.processors import RobertaProcessing, TemplateProcessing
 from tokenizers.normalizers import Strip, Lowercase, Sequence
+from tokenizers.decoders import ByteFallback, DecodeStream, Metaspace as DecoderMetaspace


 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files
@@ -365,6 +366,37 @@ class TestTokenizer:
         output = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
         assert output == ["my name is john", "pair"]

+        # Can decode stream
+        stream = DecodeStream(skip_special_tokens=False)
+        assert stream.step(tokenizer, 0) == "my"
+        assert stream.step(tokenizer, 1) == " name"
+        assert stream.step(tokenizer, 2) == " is"
+        assert stream.step(tokenizer, 3) == " john"
+
+    def test_decode_stream(self):
+        vocab = [
+            ("<unk>", 0.0),
+            ("<0x20>", -0.1),
+            ("<0xC3>", -0.2),
+            ("<0xA9>", -0.3),
+        ]
+        tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=True))
+        tokenizer.decoder = ByteFallback()
+        stream = DecodeStream(skip_special_tokens=False)
+        assert stream.step(tokenizer, 1) == " "
+        assert stream.step(tokenizer, 2) == None
+        assert stream.step(tokenizer, 3) == "é"
+
+        vocab = [
+            ("<unk>", 0.0),
+            ("▁This", -0.1),
+        ]
+        tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=False))
+        tokenizer.decoder = DecoderMetaspace()
+        stream = DecodeStream(skip_special_tokens=False)
+        assert stream.step(tokenizer, 1) == "This"
+        assert stream.step(tokenizer, 1) == " This"
+
     def test_get_vocab(self):
         tokenizer = Tokenizer(BPE())
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
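The byte-fallback test above pins down the contract a streaming caller has to handle: step() returns None while the bytes seen so far (here the lone 0xC3) do not yet form a valid UTF-8 character, and returns "é" once the trailing 0xA9 arrives. A hedged sketch of consuming that contract in a generation loop; generated_ids is a hypothetical iterable of model-produced token ids, not part of the tokenizers API:

    from tokenizers.decoders import DecodeStream

    def stream_decode(tokenizer, generated_ids):
        # generated_ids: hypothetical iterable of token ids from a model.
        stream = DecodeStream(skip_special_tokens=False)
        for token_id in generated_ids:
            chunk = stream.step(tokenizer, token_id)
            if chunk is not None:  # None: incomplete UTF-8 so far, keep feeding
                yield chunk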