import argparse

from tokenizers import Tokenizer, models, pre_tokenizers

parser = argparse.ArgumentParser()
parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
args = parser.parse_args()
class MyPreTok:
    """
    This class represents a custom PreTokenizer that will be called
    by `tokenizers` whenever it needs to pre-tokenize some input.
    """

    def pre_tokenize(self, sentence):
        if sentence.startswith("Hello"):
            # Returning None instead of a List[str] will trigger an error
            return None

        # Prepend "Haha" to the whitespace-split words
        return ["Haha"] + sentence.split(" ")
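# Quick sanity check of the custom logic on its own (added here for
# illustration; not part of the original example):
assert MyPreTok().pre_tokenize("Hey friend") == ["Haha", "Hey", "friend"]
assert MyPreTok().pre_tokenize("Hello friend") is None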
# Create a PreTokenizer from our custom one
mypretok = MyPreTok()
pretok = pre_tokenizers.PreTokenizer.custom(mypretok)
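# `PreTokenizer.custom` wraps any object exposing a compatible `pre_tokenize`
# method, so the library can call back into our Python code when needed.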
# Create a Tokenizer using a BPE model
bpe = models.BPE.from_files(args.vocab, args.merges)
tokenizer = Tokenizer(bpe)
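# vocab.json (the token-to-id mapping) and merges.txt (the ordered BPE merge
# rules) are the usual artifacts of BPE training, e.g. the files shipped with
# GPT-2 style models.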
# And attach our PreTokenizer
tokenizer.with_pre_tokenizer(pretok)
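# From now on, every `tokenizer.encode` call first runs our custom
# `pre_tokenize` on the raw input, then feeds each resulting word to the BPE model.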
def tokenize(sentence):
    output = [token.value for token in tokenizer.encode(sentence)]
    print(f"`{sentence}` tokenized to {output}")
    return output
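# Note: in this snapshot of the bindings, `encode` returns a sequence of token
# objects; `.value` holds each token's surface string.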
## Good example
# Our PreTokenizer has been used as expected
assert tokenize("Hey friend") == ["H", "aha", "Hey", "friend"]
## Bad example
# In this case, our PreTokenizer returns None instead of a List[str],
# so it doesn't work as expected: we get an empty list back, with an error printed
assert tokenize("Hello friend") == []
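# Usage sketch (the script name and file paths are placeholders):
#   python custom_pre_tokenizer.py --vocab vocab.json --merges merges.txt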