style: fix keyword-argument spacing and reflow a long dict literal in the Python tests
@@ -376,7 +376,13 @@ class TestTokenizer:

         # Can retrieve added token decoder
         vocab = tokenizer.get_added_tokens_decoder()
-        assert vocab == {0: AddedToken("my", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),1: AddedToken("name", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),2: AddedToken("is", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),3: AddedToken("john", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),4: AddedToken("pair", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False)}
+        assert vocab == {
+            0: AddedToken("my", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
+            1: AddedToken("name", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
+            2: AddedToken("is", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
+            3: AddedToken("john", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
+            4: AddedToken("pair", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
+        }

     def test_get_vocab_size(self):
         tokenizer = Tokenizer(BPE())
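The reflowed assert above documents the defaults that get_added_tokens_decoder() reports for tokens added as plain strings. A minimal sketch of that round trip, assuming the huggingface/tokenizers Python bindings (the token ids here are illustrative, not guaranteed):

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # Fresh tokenizer with an empty BPE model; added tokens take the next free ids.
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # get_added_tokens_decoder() maps each id back to its AddedToken; tokens added
    # as plain strings come back with the defaults asserted above
    # (rstrip/lstrip/single_word=False, normalized=True, special=False).
    for token_id, token in tokenizer.get_added_tokens_decoder().items():
        print(token_id, token.content)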
@@ -34,8 +34,8 @@ class TestBpeTrainer:
         assert trainer.min_frequency == 12
         assert trainer.show_progress == False
         assert trainer.special_tokens == [
-            AddedToken("1", special = True),
-            AddedToken("2", special = True),
+            AddedToken("1", special=True),
+            AddedToken("2", special=True),
         ]
         assert trainer.limit_alphabet == 13
         assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]
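For reference, a hedged sketch of the trainer construction these asserts exercise, assuming the huggingface/tokenizers Python bindings; the values simply mirror the test:

    from tokenizers import AddedToken
    from tokenizers.trainers import BpeTrainer

    # Trainer options round-trip: what goes into the constructor is what the
    # corresponding properties report back.
    trainer = BpeTrainer(
        min_frequency=12,
        show_progress=False,
        special_tokens=[AddedToken("1", special=True), AddedToken("2", special=True)],
        limit_alphabet=13,
        initial_alphabet=["a", "b", "c"],
    )
    assert trainer.min_frequency == 12

The same keyword style (special=True, no spaces around =) is what the formatter enforces in the WordPiece and WordLevel trainer tests below.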
@@ -91,8 +91,8 @@ class TestWordPieceTrainer:
         assert trainer.min_frequency == 12
         assert trainer.show_progress == False
         assert trainer.special_tokens == [
-            AddedToken("1", special = True),
-            AddedToken("2", special = True),
+            AddedToken("1", special=True),
+            AddedToken("2", special=True),
         ]
         assert trainer.limit_alphabet == 13
         assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]
@@ -131,8 +131,8 @@ class TestWordLevelTrainer:
         assert trainer.min_frequency == 12
         assert trainer.show_progress == False
         assert trainer.special_tokens == [
-            AddedToken("1", special = True),
-            AddedToken("2", special = True),
+            AddedToken("1", special=True),
+            AddedToken("2", special=True),
         ]

         # Modify these
@@ -272,8 +272,8 @@ class TestUnigram:
         assert trainer.vocab_size == 12345
         assert trainer.show_progress == False
         assert trainer.special_tokens == [
-            AddedToken("1", normalized=False, special = True),
-            AddedToken("2", lstrip=True, normalized=False, special = True),
+            AddedToken("1", normalized=False, special=True),
+            AddedToken("2", lstrip=True, normalized=False, special=True),
         ]
         assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]

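And the Unigram variant, again a hedged sketch assuming the same bindings; note the per-token flags (normalized=False, lstrip=True) that the test asserts survive construction:

    from tokenizers import AddedToken
    from tokenizers.trainers import UnigramTrainer

    # Unlike the BPE/WordPiece trainers above, the special tokens here carry
    # non-default AddedToken flags, which the equality assert checks as well.
    trainer = UnigramTrainer(
        vocab_size=12345,
        show_progress=False,
        special_tokens=[
            AddedToken("1", normalized=False, special=True),
            AddedToken("2", lstrip=True, normalized=False, special=True),
        ],
        initial_alphabet=["a", "b", "c"],
    )
    assert trainer.vocab_size == 12345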