mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
Doc - Update Normalizer part of the Pipeline page
This commit is contained in:
32
bindings/python/tests/documentation/test_pipeline.py
Normal file
32
bindings/python/tests/documentation/test_pipeline.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from ..utils import data_dir, doc_wiki_tokenizer
|
||||
from tokenizers import Tokenizer
|
||||
|
||||
|
||||
class TestPipeline:
    """Executable documentation test for the "Pipeline" docs page (normalizer section).

    NOTE(review): the paired ``# START <name>`` / ``# END <name>`` comments look
    like snippet-extraction markers consumed by the documentation build — the
    lines between each pair are presumably rendered verbatim in the docs, so do
    not rename the markers or edit the enclosed code without updating the page.
    """

    def test_pipeline(self, doc_wiki_tokenizer):
        # Shadow the builtin ``print`` inside this test so any snippet code
        # that prints produces no output during the test run.
        def print(*args, **kwargs):
            pass

        try:
            # START reload_tokenizer
            from tokenizers import Tokenizer

            tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
            # END reload_tokenizer
        except Exception:
            # The snippet's relative path only resolves in the docs setup;
            # in the test environment fall back to the fixture-provided file.
            tokenizer = Tokenizer.from_file(doc_wiki_tokenizer)

        # START setup_normalizer
        from tokenizers import normalizers
        from tokenizers.normalizers import NFD, StripAccents

        normalizer = normalizers.Sequence([NFD(), StripAccents()])
        # END setup_normalizer
        # START test_normalizer
        normalizer.normalize_str("Héllò hôw are ü?")
        # "Hello how are u?"
        # END test_normalizer
        # Verify outside the snippet what the snippet's comment claims:
        # NFD + StripAccents maps the accented input to plain ASCII.
        assert normalizer.normalize_str("Héllò hôw are ü?") == "Hello how are u?"
        # START replace_normalizer
        tokenizer.normalizer = normalizer
        # END replace_normalizer
|
Reference in New Issue
Block a user