import argparse

from tokenizers import Tokenizer, models, pre_tokenizers, decoders

parser = argparse.ArgumentParser()
parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab.json file")
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
args = parser.parse_args()
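# Example invocation (script and file names here are illustrative only):
#   python custom_pre_tokenizer_example.py --vocab vocab.json --merges merges.txt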


class GoodCustom:
    """GoodCustom

    This class represents a good custom PreTokenizer that will be called
    by `tokenizers` when needed.
    """

    def pre_tokenize(self, sentence):
        return sentence.split(" ")

    def decode(self, tokens):
        return ", ".join(tokens)


class BadCustom:
    """Bad Pretok

    This class represents a bad custom PreTokenizer that will trigger an exception
    when called by `tokenizers`.
    """

    def pre_tokenize(self, sentence):
        # Returning None instead of a list of pieces is what makes this "bad"
        return None

    def decode(self, tokens):
        # Returning None instead of a string is what makes this "bad"
        return None


def tokenize(sentence):
    output = tokenizer.encode(sentence).tokens
    print(f"`{sentence}` tokenized to {output}")
    return output


# Create a Tokenizer using a BPE model
bpe = models.BPE.from_files(args.vocab, args.merges)
tokenizer = Tokenizer(bpe)

# Test the good custom classes
good_custom = GoodCustom()
good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
good_decoder = decoders.Decoder.custom(good_custom)

tokenizer.with_pre_tokenizer(good_pretok)
tokenizer.with_decoder(good_decoder)
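# From here on, `tokenizers` calls back into the Python object: encoding goes
# through GoodCustom.pre_tokenize and decoding through GoodCustom.decode.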

print("Tokenization will work with good custom:")
encoding = tokenizer.encode("Hey friend!")
print(f"IDS: {encoding.ids}")
print(f"TOKENS: {encoding.tokens}")
print(f"OFFSETS: {encoding.offsets}")
decoded = tokenizer.decode(encoding.ids)
print(f"DECODED: {decoded}")

# Now test with the bad custom classes
bad_custom = BadCustom()
bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
bad_decoder = decoders.Decoder.custom(bad_custom)

tokenizer.with_pre_tokenizer(bad_pretok)
tokenizer.with_decoder(bad_decoder)

try:
    encoding = tokenizer.encode("Hey friend!")
except Exception as e:
    # pre_tokenize returned None where a list of pieces was expected,
    # so `tokenizers` raises instead of encoding
    print(f"Bad tokenizer didn't work: {e}")