Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-24 00:59:19 +00:00)
Python - Update examples with getter/setter
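This commit switches the Python example scripts from the old builder-style with_*() calls to the new property getters/setters on Tokenizer (pre_tokenizer, decoder, normalizer, post_processor). A minimal before/after sketch of the pattern, using names that appear in the hunks below:

# Before (builder style, removed by this commit):
#     tok_r.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space=False))
#     tok_r.with_decoder(decoders.ByteLevel.new())
# After (property setters, used throughout this commit):
tok_r.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
tok_r.decoder = decoders.ByteLevel.new()
# Per the commit title these are getters as well, so the components can be read back:
print(tok_r.pre_tokenizer, tok_r.decoder)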
@@ -44,8 +44,8 @@ good_custom = GoodCustom()
 good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
 good_decoder = decoders.Decoder.custom(good_custom)
 
-tokenizer.with_pre_tokenizer(good_pretok)
-tokenizer.with_decoder(good_decoder)
+tokenizer.pre_tokenizer = good_pretok
+tokenizer.decoder = good_decoder
 
 print("Tokenization will work with good custom:")
 encoding = tokenizer.encode("Hey friend!")
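For context, a sketch of how the "good custom" section reads after this change. It assumes a Tokenizer instance named tokenizer already exists and that GoodCustom implements the hooks the custom() wrappers expect (its definition sits above this hunk and is not shown here):

from tokenizers import pre_tokenizers, decoders  # import path assumed

good_custom = GoodCustom()
good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
good_decoder = decoders.Decoder.custom(good_custom)

# New style: plain attribute assignment replaces with_pre_tokenizer()/with_decoder()
tokenizer.pre_tokenizer = good_pretok
tokenizer.decoder = good_decoder

print("Tokenization will work with good custom:")
encoding = tokenizer.encode("Hey friend!")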
@@ -60,8 +60,8 @@ bad_custom = BadCustom()
 bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
 bad_decoder = decoders.Decoder.custom(bad_custom)
 
-tokenizer.with_pre_tokenizer(bad_pretok)
-tokenizer.with_decoder(bad_decoder)
+tokenizer.pre_tokenizer = bad_pretok
+tokenizer.decoder = bad_decoder
 try:
     encoding = tokenizer.encode("Hey friend!")
 except:
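The matching sketch for the failing case, under the same assumptions; BadCustom is presumed to raise from its custom hooks, which is why encode() is guarded:

bad_custom = BadCustom()
bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
bad_decoder = decoders.Decoder.custom(bad_custom)

tokenizer.pre_tokenizer = bad_pretok
tokenizer.decoder = bad_decoder

try:
    encoding = tokenizer.encode("Hey friend!")
except Exception as exc:
    # The error raised inside the broken custom component surfaces here
    print("Tokenization failed with bad custom:", exc)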
@@ -58,9 +58,9 @@ if args.type == "gpt2":
     # Create a Tokenizer using BPE
     tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
     # Use ByteLevel PreTokenizer
-    tok_r.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space=False))
+    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
     # Use ByteLevel Decoder
-    tok_r.with_decoder(decoders.ByteLevel.new())
+    tok_r.decoder = decoders.ByteLevel.new()
 elif args.type == "bert":
     print("Running Bert tokenizer")
     tok_p = BertTokenizer.from_pretrained('bert-base-uncased')
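A sketch of the GPT-2 branch after the change. Argument parsing and the surrounding script are assumed; args.vocab and args.merges are expected to point at GPT-2 vocab/merges files, and the import lines are a guess based on the names used in this hunk:

from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import BPE  # import path assumed

if args.type == "gpt2":
    # Byte-level BPE, matching GPT-2
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
    tok_r.decoder = decoders.ByteLevel.new()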
@@ -70,19 +70,19 @@ elif args.type == "bert":
         unk_token="[UNK]",
         max_input_chars_per_word=100)
     )
-    tok_r.with_normalizer(BertNormalizer.new(
+    tok_r.normalizer = BertNormalizer.new(
         clean_text=True,
         handle_chinese_chars=True,
         strip_accents=True,
         lowercase=True,
-    ))
-    # tok_r.with_pre_tokenizer(pre_tokenizers.Whitespace.new())
-    tok_r.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
-    tok_r.with_decoder(decoders.WordPiece.new())
-    tok_r.with_post_processor(BertProcessing.new(
+    )
+    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace.new()
+    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
+    tok_r.decoder = decoders.WordPiece.new()
+    tok_r.post_processor = BertProcessing.new(
         ("[SEP]", tok_r.token_to_id("[SEP]")),
         ("[CLS]", tok_r.token_to_id("[CLS]")),
-    ))
+    )
 else:
     raise Exception(f"Unknown type {args.type}")
 
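The BERT branch after the change, as a sketch. It assumes tok_r was just built from a WordPiece vocab (the truncated lines at the top of this hunk) and that the normalizer/processor import paths below are correct for this version:

from tokenizers.normalizers import BertNormalizer   # import paths assumed
from tokenizers.processors import BertProcessing

tok_r.normalizer = BertNormalizer.new(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)
tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
tok_r.decoder = decoders.WordPiece.new()
tok_r.post_processor = BertProcessing.new(
    ("[SEP]", tok_r.token_to_id("[SEP]")),
    ("[CLS]", tok_r.token_to_id("[CLS]")),
)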
@@ -32,14 +32,14 @@ if not files:
 tokenizer = Tokenizer(models.WordPiece.empty())
 
 # Customize all the steps
-tokenizer.with_normalizer(normalizers.BertNormalizer.new(
+tokenizer.normalizer = normalizers.BertNormalizer.new(
     clean_text=True,
     handle_chinese_chars=True,
     strip_accents=True,
     lowercase=True,
-))
-tokenizer.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
-tokenizer.with_decoder(decoders.WordPiece.new())
+)
+tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
+tokenizer.decoder = decoders.WordPiece.new()
 
 # And then train
 trainer = trainers.WordPieceTrainer.new(
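A condensed sketch of the BERT-WordPiece training script with the new setters. The trainer keyword arguments below are assumptions for illustration, not part of this diff; the actual file list and training call live in the full script:

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.WordPiece.empty())

# Customize all the steps via the new properties
tokenizer.normalizer = normalizers.BertNormalizer.new(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
tokenizer.decoder = decoders.WordPiece.new()

# And then train (keyword arguments here are assumed, see the full script)
trainer = trainers.WordPieceTrainer.new(
    vocab_size=30000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)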
@@ -32,8 +32,8 @@ if not files:
 tokenizer = Tokenizer(models.BPE.empty())
 
 # Customize pre-tokenization and decoding
-tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space=False))
-tokenizer.with_decoder(decoders.ByteLevel.new())
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
+tokenizer.decoder = decoders.ByteLevel.new()
 
 # And then train
 trainer = trainers.BpeTrainer.new(
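The byte-level BPE training counterpart, again as a sketch; the trainer keyword arguments are assumptions, and the file list plus the training call are handled by the full script:

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.BPE.empty())

# Customize pre-tokenization and decoding via the new properties
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel.new()

# And then train (keyword arguments here are assumed)
trainer = trainers.BpeTrainer.new(
    vocab_size=50000,
    special_tokens=["<unk>"],
)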