Mirror of https://github.com/mii443/tokenizers.git, synced 2025-09-01 14:59:20 +00:00
Add unigram bytefallback (#1217)
* current updates will go red
* cargo fmt
* npm install
* refactor train for unigram to allow bytefallback (breaking)
* fmt
* nits
* update
* add a proper test
* fix encode optimised fallback + add trainer arg
* fixes
* fixes
* fix tests
* add test
* fmt
* fix rust test
* update python bindings
* update
* pub is okay and needed
* more fix
* cleanup
* remove useless id
* MissingUnkId error
* nits
* fix offset
* add a test in python
* update src bindings
* remove bytefallback from trainer
* styling
* update package
* lint
* fmt
* setup with dev
* update code based on review
* remove unused function
* update python test to compare ids
* fix option bool issues
* final fix
* clippy
* fix npm install
* update
* update test
* more in depth testing
* Lint
* last attempt to fix node
* update node bindings
* fmt
* Update tokenizers/src/models/unigram/model.rs (Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>)
* update based on review
* simpler test
* lint

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
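For context: with byte fallback enabled, the Unigram model backs off to single-byte `<0xNN>` tokens instead of emitting `<unk>` when part of the input is not covered by the vocabulary. The sketch below is illustrative (toy vocab and scores, mirroring the Python test added in this commit) and shows the difference between the two modes:

```python
# Illustrative sketch of the byte_fallback flag on the Unigram model
# (toy vocab/scores; mirrors the test added in this commit).
from tokenizers import Tokenizer
from tokenizers.models import Unigram

vocab = [
    ("<unk>", 0.0),   # unk_id = 0
    ("A", -0.01),
    (" ", -0.4),
    # Single-byte tokens covering the UTF-8 encoding of "🤗": F0 9F A4 97
    ("<0xF0>", -0.06),
    ("<0x9F>", -0.06),
    ("<0xA4>", -0.06),
    ("<0x97>", -0.06),
]

# Without byte fallback, the emoji is not in the vocab and becomes the unk token.
tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=False))
print(tokenizer.encode("A 🤗").tokens)  # expected: ['A', ' ', '🤗'] (id 0 = <unk>)

# With byte fallback, the unknown character is split into its UTF-8 bytes,
# each encoded as a <0xNN> token that does exist in the vocab.
tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=True))
print(tokenizer.encode("A 🤗").tokens)  # expected: ['A', ' ', '<0xF0>', '<0x9F>', '<0xA4>', '<0x97>']
```

The four `<0xNN>` vocab entries are exactly the UTF-8 bytes of "🤗" (F0 9F A4 97), which is why the byte-fallback encoding of the emoji produces those four tokens in the test below.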
@@ -5,7 +5,7 @@ import pytest
 
 from tokenizers import AddedToken, Encoding, Tokenizer
 from tokenizers.implementations import BertWordPieceTokenizer
-from tokenizers.models import BPE, Model, WordPiece
+from tokenizers.models import BPE, Model, WordPiece, Unigram
 from tokenizers.normalizers import Lowercase
 from tokenizers.pre_tokenizers import ByteLevel
 from tokenizers.processors import BertProcessing, RobertaProcessing
@@ -412,3 +412,29 @@ class TestTokenizer:
         tokenizer = Tokenizer.from_pretrained("anthony/tokenizers-test", revision="gpt-2")
         output = tokenizer.encode("Hey there dear friend!", add_special_tokens=False)
         assert output.tokens == ["Hey", "Ġthere", "Ġdear", "Ġfriend", "!"]
+
+    def test_unigram_byte_fallback(self):
+        vocab = [
+            ("<unk>", 0.0),
+            ("A", -0.01),
+            ("sen", -0.02),
+            ("te", -0.03),
+            ("n", -0.04),
+            ("ce", -0.05),
+            ("<0xF0>", -0.06),
+            ("<0x9F>", -0.06),
+            ("<0xA4>", -0.06),
+            ("<0x97>", -0.06),
+            (" ", -0.4),
+        ]
+        tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=False))
+
+        output = tokenizer.encode("A sentence 🤗")
+        assert output.ids == [1, 10, 2, 3, 4, 5, 10, 0]
+        assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "🤗"]
+
+        tokenizer = Tokenizer(Unigram(vocab, 0, byte_fallback=True))
+
+        output = tokenizer.encode("A sentence 🤗")
+        assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
+        assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]