mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
Doc - Update PreTokenizer part of the Pipeline page
This commit is contained in:
@@ -30,3 +30,46 @@ class TestPipeline:
|
||||
# START replace_normalizer
|
||||
tokenizer.normalizer = normalizer
|
||||
# END replace_normalizer
|
||||
# START setup_pre_tokenizer
|
||||
from tokenizers.pre_tokenizers import Whitespace
|
||||
|
||||
pre_tokenizer = Whitespace()
|
||||
pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.")
|
||||
# [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)),
|
||||
# ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)),
|
||||
# (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))]
|
||||
# END setup_pre_tokenizer
|
||||
assert pre_tokenizer.pre_tokenize_str("Hello! How are you? I'm fine, thank you.") == [
|
||||
("Hello", (0, 5)),
|
||||
("!", (5, 6)),
|
||||
("How", (7, 10)),
|
||||
("are", (11, 14)),
|
||||
("you", (15, 18)),
|
||||
("?", (18, 19)),
|
||||
("I", (20, 21)),
|
||||
("'", (21, 22)),
|
||||
("m", (22, 23)),
|
||||
("fine", (24, 28)),
|
||||
(",", (28, 29)),
|
||||
("thank", (30, 35)),
|
||||
("you", (36, 39)),
|
||||
(".", (39, 40)),
|
||||
]
|
||||
# START combine_pre_tokenizer
|
||||
from tokenizers import pre_tokenizers
|
||||
from tokenizers.pre_tokenizers import Digits
|
||||
|
||||
pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=True)])
|
||||
pre_tokenizer.pre_tokenize_str("Call 911!")
|
||||
# [("Call", (0, 4)), ("9", (5, 6)), ("1", (6, 7)), ("1", (7, 8)), ("!", (8, 9))]
|
||||
# END combine_pre_tokenizer
|
||||
assert pre_tokenizer.pre_tokenize_str("Call 911!") == [
|
||||
("Call", (0, 4)),
|
||||
("9", (5, 6)),
|
||||
("1", (6, 7)),
|
||||
("1", (7, 8)),
|
||||
("!", (8, 9)),
|
||||
]
|
||||
# START replace_pre_tokenizer
|
||||
tokenizer.pre_tokenizer = pre_tokenizer
|
||||
# END replace_pre_tokenizer
|
||||
|
Reference in New Issue
Block a user