Python - Update examples with getter/setter

Anthony MOI
2020-01-07 15:23:11 -05:00
parent 8bbf832842
commit cc33418044
4 changed files with 19 additions and 19 deletions
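Every file below gets the same treatment: instead of calling the old fluent `with_*` setters on `Tokenizer`, the examples now assign the pipeline components as plain attributes. A minimal before/after sketch, using the ByteLevel components from the examples in this commit (the import line assumes the layout used by the training scripts below):

    from tokenizers import Tokenizer, models, pre_tokenizers, decoders

    tokenizer = Tokenizer(models.BPE.empty())

    # Old style, removed by this commit:
    # tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space=False))
    # tokenizer.with_decoder(decoders.ByteLevel.new())

    # New style: plain attribute assignment (the setter half of getter/setter)
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel.new()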

View File

@@ -44,8 +44,8 @@ good_custom = GoodCustom()
 good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
 good_decoder = decoders.Decoder.custom(good_custom)
-tokenizer.with_pre_tokenizer(good_pretok)
-tokenizer.with_decoder(good_decoder)
+tokenizer.pre_tokenizer = good_pretok
+tokenizer.decoder = good_decoder
 print("Tokenization will work with good custom:")
 encoding = tokenizer.encode("Hey friend!")

@@ -60,8 +60,8 @@ bad_custom = BadCustom()
 bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
 bad_decoder = decoders.Decoder.custom(bad_custom)
-tokenizer.with_pre_tokenizer(bad_pretok)
-tokenizer.with_decoder(bad_decoder)
+tokenizer.pre_tokenizer = bad_pretok
+tokenizer.decoder = bad_decoder
 try:
     encoding = tokenizer.encode("Hey friend!")
 except:
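Since attaching a component is now a plain assignment, the wrap-and-attach steps in this example can also be collapsed into single statements; a small sketch reusing the names defined in the file above (`GoodCustom` and `tokenizer` come from that example):

    good_custom = GoodCustom()  # user-defined class from this example file
    tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(good_custom)
    tokenizer.decoder = decoders.Decoder.custom(good_custom)
    encoding = tokenizer.encode("Hey friend!")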

View File

@@ -58,9 +58,9 @@ if args.type == "gpt2":
     # Create a Tokenizer using BPE
     tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
     # Use ByteLevel PreTokenizer
-    tok_r.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space=False))
+    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
     # Use ByteLevel Decoder
-    tok_r.with_decoder(decoders.ByteLevel.new())
+    tok_r.decoder = decoders.ByteLevel.new()
 elif args.type == "bert":
     print("Running Bert tokenizer")
     tok_p = BertTokenizer.from_pretrained('bert-base-uncased')

@@ -70,19 +70,19 @@ elif args.type == "bert":
         unk_token="[UNK]",
         max_input_chars_per_word=100)
     )
-    tok_r.with_normalizer(BertNormalizer.new(
+    tok_r.normalizer = BertNormalizer.new(
         clean_text=True,
         handle_chinese_chars=True,
         strip_accents=True,
         lowercase=True,
-    ))
-    # tok_r.with_pre_tokenizer(pre_tokenizers.Whitespace.new())
-    tok_r.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
-    tok_r.with_decoder(decoders.WordPiece.new())
-    tok_r.with_post_processor(BertProcessing.new(
+    )
+    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace.new()
+    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
+    tok_r.decoder = decoders.WordPiece.new()
+    tok_r.post_processor = BertProcessing.new(
         ("[SEP]", tok_r.token_to_id("[SEP]")),
         ("[CLS]", tok_r.token_to_id("[CLS]")),
-    ))
+    )
 else:
     raise Exception(f"Unknown type {args.type}")
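Assembled, the new side of the bert branch reads as a block of straight attribute assignments (this sketch relies on the imports and the `tok_r` tokenizer already defined at the top of the example script shown above):

    tok_r.normalizer = BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )
    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
    tok_r.decoder = decoders.WordPiece.new()
    tok_r.post_processor = BertProcessing.new(
        ("[SEP]", tok_r.token_to_id("[SEP]")),
        ("[CLS]", tok_r.token_to_id("[CLS]")),
    )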

View File

@@ -32,14 +32,14 @@ if not files:
 tokenizer = Tokenizer(models.WordPiece.empty())
 # Customize all the steps
-tokenizer.with_normalizer(normalizers.BertNormalizer.new(
+tokenizer.normalizer = normalizers.BertNormalizer.new(
     clean_text=True,
     handle_chinese_chars=True,
     strip_accents=True,
     lowercase=True,
-))
-tokenizer.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
-tokenizer.with_decoder(decoders.WordPiece.new())
+)
+tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
+tokenizer.decoder = decoders.WordPiece.new()
 # And then train
 trainer = trainers.WordPieceTrainer.new(

View File

@@ -32,8 +32,8 @@ if not files:
 tokenizer = Tokenizer(models.BPE.empty())
 # Customize pre-tokenization and decoding
-tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space=False))
-tokenizer.with_decoder(decoders.ByteLevel.new())
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
+tokenizer.decoder = decoders.ByteLevel.new()
 # And then train
 trainer = trainers.BpeTrainer.new(
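The commit title also mentions getters: once set, the same attributes can presumably be read back, e.g. to sanity-check this last configuration before training. A hedged sketch, reusing the `tokenizer` built in the file above (the getter behaviour is assumed from the title, not shown in the diff; the `BpeTrainer.new(...)` arguments stay elided as above):

    # Read the components back through the corresponding getters (assumed)
    print(tokenizer.pre_tokenizer)
    print(tokenizer.decoder)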