Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Using serde (serde_pyo3) to get __str__ and __repr__ easily. (#1588)
* Using serde (serde_pyo3) to get __str__ and __repr__ easily.
* Putting it within tokenizers, it needs to be too specific.
* Clippy is our friend.
* Ruff.
* Update the tests.
* Pretty sure this is wrong (#1589)
* Adding support for ellipsis.
* Fmt.
* Ruff.
* Fixing tokenizer.

---------

Co-authored-by: Eric Buehler <65165915+EricLBuehler@users.noreply.github.com>
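For context, a minimal sketch of how the new __str__/__repr__ behave from the Python side, assuming a `tokenizers` build that includes this change (the exact output strings are pinned by the tests in the diff below): repr() serializes every field of the pipeline, while str() elides long collections with "...".

# Minimal usage sketch, assuming a tokenizers build that includes this change.
from tokenizers import Tokenizer
from tokenizers.models import BPE, Unigram

# repr() now serializes every field of the tokenizer pipeline.
tokenizer = Tokenizer(BPE())
print(repr(tokenizer))  # Tokenizer(version="1.0", truncation=None, padding=None, ..., model=BPE(...))

# str() elides collections longer than five elements; repr() keeps them whole.
vocab = [("A", 0.0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04), ("F", -0.04)]
model = Unigram(vocab, 0, byte_fallback=False)
print(repr(model))  # all six vocab entries are printed
print(str(model))   # the sixth entry is replaced by "..."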
@@ -7,7 +7,8 @@ from tokenizers import AddedToken, Encoding, Tokenizer
 from tokenizers.implementations import BertWordPieceTokenizer
 from tokenizers.models import BPE, Model, Unigram
 from tokenizers.pre_tokenizers import ByteLevel
-from tokenizers.processors import RobertaProcessing
+from tokenizers.processors import RobertaProcessing, TemplateProcessing
+from tokenizers.normalizers import Strip, Lowercase, Sequence

 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files

@@ -549,3 +550,28 @@ class TestTokenizer:
         output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True)
         assert output == "name is john"
         assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)
+
+
+class TestTokenizerRepr:
+    def test_repr(self):
+        tokenizer = Tokenizer(BPE())
+        out = repr(tokenizer)
+        assert (
+            out
+            == 'Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))'
+        )
+
+    def test_repr_complete(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
+        tokenizer.post_processor = TemplateProcessing(
+            single=["[CLS]", "$0", "[SEP]"],
+            pair=["[CLS]:0", "$A", "[SEP]:0", "$B:1", "[SEP]:1"],
+            special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+        )
+        tokenizer.normalizer = Sequence([Lowercase(), Strip()])
+        out = repr(tokenizer)
+        assert (
+            out
+            == 'Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=Sequence(normalizers=[Lowercase(), Strip(strip_left=True, strip_right=True)]), pre_tokenizer=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), post_processor=TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0), Sequence(id=B, type_id=1), SpecialToken(id="[SEP]", type_id=1)], special_tokens={"[CLS]":SpecialToken(id="[CLS]", ids=[1], tokens=["[CLS]"]), "[SEP]":SpecialToken(id="[SEP]", ids=[0], tokens=["[SEP]"])}), decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))'
+        )
@@ -5,6 +5,7 @@ import unittest
 import tqdm
 from huggingface_hub import hf_hub_download
 from tokenizers import Tokenizer
+from tokenizers.models import BPE, Unigram

 from .utils import albert_base, data_dir

@@ -16,6 +17,73 @@ class TestSerialization:
         # file exceeds the buffer capacity
         Tokenizer.from_file(albert_base)

+    def test_str_big(self, albert_base):
+        tokenizer = Tokenizer.from_file(albert_base)
+        assert (
+            str(tokenizer)
+            == """Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"<pad>", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":1, "content":"<unk>", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":2, "content":"[CLS]", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":3, "content":"[SEP]", "single_word":False, "lstrip":False, "rstrip":False, ...}, {"id":4, "content":"[MASK]", "single_word":False, "lstrip":False, "rstrip":False, ...}], normalizer=Sequence(normalizers=[Replace(pattern=String("``"), content="\""), Replace(pattern=String("''"), content="\""), NFKD(), StripAccents(), Lowercase(), ...]), pre_tokenizer=Sequence(pretokenizers=[WhitespaceSplit(), Metaspace(replacement="▁", prepend_scheme=always, split=True)]), post_processor=TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0), Sequence(id=B, type_id=1), SpecialToken(id="[SEP]", type_id=1)], special_tokens={"[CLS]":SpecialToken(id="[CLS]", ids=[2], tokens=["[CLS]"]), "[SEP]":SpecialToken(id="[SEP]", ids=[3], tokens=["[SEP]"])}), decoder=Metaspace(replacement="▁", prepend_scheme=always, split=True), model=Unigram(unk_id=1, vocab=[("<pad>", 0), ("<unk>", 0), ("[CLS]", 0), ("[SEP]", 0), ("[MASK]", 0), ...], byte_fallback=False))"""
+        )
+
+    def test_repr_str(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_tokens(["my"])
+        assert (
+            repr(tokenizer)
+            == """Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"my", "single_word":False, "lstrip":False, "rstrip":False, "normalized":True, "special":False}], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))"""
+        )
+        assert (
+            str(tokenizer)
+            == """Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[{"id":0, "content":"my", "single_word":False, "lstrip":False, "rstrip":False, ...}], normalizer=None, pre_tokenizer=None, post_processor=None, decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))"""
+        )
+
+    def test_repr_str_ellipsis(self):
+        model = BPE()
+        assert (
+            repr(model)
+            == """BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[])"""
+        )
+        assert (
+            str(model)
+            == """BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[])"""
+        )
+
+        vocab = [
+            ("A", 0.0),
+            ("B", -0.01),
+            ("C", -0.02),
+            ("D", -0.03),
+            ("E", -0.04),
+        ]
+        # No ellipsis yet
+        model = Unigram(vocab, 0, byte_fallback=False)
+        assert (
+            repr(model)
+            == """Unigram(unk_id=0, vocab=[("A", 0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04)], byte_fallback=False)"""
+        )
+        assert (
+            str(model)
+            == """Unigram(unk_id=0, vocab=[("A", 0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04)], byte_fallback=False)"""
+        )
+
+        # Ellipsis for longer than 5 elements only on `str`.
+        vocab = [
+            ("A", 0.0),
+            ("B", -0.01),
+            ("C", -0.02),
+            ("D", -0.03),
+            ("E", -0.04),
+            ("F", -0.04),
+        ]
+        model = Unigram(vocab, 0, byte_fallback=False)
+        assert (
+            repr(model)
+            == """Unigram(unk_id=0, vocab=[("A", 0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04), ("F", -0.04)], byte_fallback=False)"""
+        )
+        assert (
+            str(model)
+            == """Unigram(unk_id=0, vocab=[("A", 0), ("B", -0.01), ("C", -0.02), ("D", -0.03), ("E", -0.04), ...], byte_fallback=False)"""
+        )
+

 def check(tokenizer_file) -> bool:
     with open(tokenizer_file, "r") as f: