Python - Improve imports

Anthony MOI
2020-01-06 12:03:01 -05:00
parent e1caacfce0
commit 7eebd06409
8 changed files with 45 additions and 6 deletions


@@ -6,7 +6,11 @@ import logging
logging.getLogger('transformers').disabled = True
logging.getLogger('transformers.tokenization_utils').disabled = True
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors, normalizers
+from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers.models import BPE, WordPiece
+from tokenizers.processors import BertProcessing
+from tokenizers.normalizers import BertNormalizer
from transformers import GPT2Tokenizer, BertTokenizer
parser = argparse.ArgumentParser()
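
For context, a minimal before/after sketch of what this import change means at call sites (file names here are placeholders, not from the commit):

    # Before: classes were reached through the top-level helper modules
    from tokenizers import Tokenizer, models
    tok = Tokenizer(models.BPE.from_files("vocab.json", "merges.txt"))

    # After: model classes are imported directly from their submodules
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    tok = Tokenizer(BPE.from_files("vocab.json", "merges.txt"))
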
@@ -52,7 +56,7 @@ if args.type == "gpt2":
tok_p = GPT2Tokenizer.from_pretrained('gpt2')
# Create a Tokenizer using BPE
-tok_r = Tokenizer(models.BPE.from_files(args.vocab, args.merges))
+tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
# Use ByteLevel PreTokenizer
tok_r.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space=False))
# Use ByteLevel Decoder
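
Put together, the GPT-2 branch now reads roughly as follows. This is a sketch: the final decoder call is assumed from the comment above, since the hunk cuts off before it.

    from tokenizers import Tokenizer, pre_tokenizers, decoders
    from tokenizers.models import BPE

    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # ByteLevel pre-tokenization, without adding a prefix space
    tok_r.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space=False))
    # Assumed: the matching ByteLevel decoder, per the comment in the diff
    tok_r.with_decoder(decoders.ByteLevel.new())
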
@@ -61,12 +65,12 @@ elif args.type == "bert":
print("Running Bert tokenizer")
tok_p = BertTokenizer.from_pretrained('bert-base-uncased')
-tok_r = Tokenizer(models.WordPiece.from_files(
+tok_r = Tokenizer(WordPiece.from_files(
args.vocab,
unk_token="[UNK]",
max_input_chars_per_word=100)
)
-tok_r.with_normalizer(normalizers.BertNormalizer.new(
+tok_r.with_normalizer(BertNormalizer.new(
clean_text=True,
handle_chinese_chars=True,
strip_accents=True,
@@ -75,7 +79,7 @@ elif args.type == "bert":
# tok_r.with_pre_tokenizer(pre_tokenizers.Whitespace.new())
tok_r.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
tok_r.with_decoder(decoders.WordPiece.new())
-tok_r.with_post_processor(processors.BertProcessing.new(
+tok_r.with_post_processor(BertProcessing.new(
("[SEP]", tok_r.token_to_id("[SEP]")),
("[CLS]", tok_r.token_to_id("[CLS]")),
))
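
With the direct imports in place, the complete Bert branch comes out roughly like this. One caveat: the lowercase flag is an assumption, since the normalizer hunk truncates before the argument list closes.

    from tokenizers import Tokenizer, pre_tokenizers, decoders
    from tokenizers.models import WordPiece
    from tokenizers.processors import BertProcessing
    from tokenizers.normalizers import BertNormalizer

    tok_r = Tokenizer(WordPiece.from_files(
        args.vocab,
        unk_token="[UNK]",
        max_input_chars_per_word=100)
    )
    tok_r.with_normalizer(BertNormalizer.new(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,  # assumed: the hunk ends before this argument
    ))
    tok_r.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
    tok_r.with_decoder(decoders.WordPiece.new())
    # The post-processor wraps sequences with [CLS]/[SEP], whose ids are
    # looked up from the loaded vocabulary
    tok_r.with_post_processor(BertProcessing.new(
        ("[SEP]", tok_r.token_to_id("[SEP]")),
        ("[CLS]", tok_r.token_to_id("[CLS]")),
    ))
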