Python - PoC Custom PreTokenizer

This commit is contained in:
Anthony MOI
2019-11-24 00:52:13 -05:00
parent b081e6ca04
commit 6437c40235
3 changed files with 87 additions and 9 deletions

View File

@ -0,0 +1,49 @@
import argparse
from tokenizers import Tokenizer, models, pre_tokenizers
parser = argparse.ArgumentParser()
parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
args = parser.parse_args()
class MyPreTok:
"""
This class represents a custom PreTokenizer that will be called
by `tokenizers` when needed
"""
def pre_tokenize(self, sentence):
if sentence.startswith("Hello"):
# This will generate an error
return None
# Prepend "Haha"
return sum([ [ "Haha" ], sentence.split(" ") ], [])
# Create a PreTokenizer from our custom one
mypretok = MyPreTok()
pretok = pre_tokenizers.PreTokenizer.custom(mypretok)
# Create a Tokenizer using a BPE model
bpe = models.BPE.from_files(args.vocab, args.merges)
tokenizer = Tokenizer(bpe)
# And attach our PreTokenizer
tokenizer.with_pre_tokenizer(pretok)
def tokenize(sentence):
output = [ token.value for token in tokenizer.encode(sentence) ]
print(f"`{sentence}` tokenized to {output}")
return output
## Good example
# Our PreTokenizer has been used as expected
assert(tokenize("Hey friend") == [ "H", "aha", "Hey", "friend" ])
## Bad example
# In this case, our PreTokenizer returns None instead of a List[str]
# So it doesn't work as expected, and we get a empty list back, with an error printed
assert(tokenize("Hello friend") == [])