Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-23 16:49:27 +00:00
Python - Update example.py with new API
@@ -18,7 +18,7 @@ parser.add_argument("--type", default="gpt2", type=str, help="The type of tokeni
 parser.add_argument("--file", default=None, type=str, help="The file to encode")
 parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab file")
 parser.add_argument("--merges", default=None, type=str, help="The merges.txt file")
-parser.add_argument("--debug", action='store_true', type=bool, help="Verbose output")
+parser.add_argument("--debug", action='store_true', help="Verbose output")
 args = parser.parse_args()
 
 if args.type == "gpt2" and args.merges is None:
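
Aside from the API rename work below, the --debug change fixes a real bug: argparse's 'store_true' action supplies its own boolean handling and rejects a type keyword, so the removed line raised a TypeError the moment the argument was defined. A minimal standalone sketch of the corrected flag:

import argparse

parser = argparse.ArgumentParser()
# action='store_true' stores True when the flag is present, False otherwise;
# adding type=bool on top of it raises TypeError at add_argument() time.
parser.add_argument("--debug", action='store_true', help="Verbose output")

args = parser.parse_args(["--debug"])
assert args.debug is True
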
@@ -58,9 +58,9 @@ if args.type == "gpt2":
     # Create a Tokenizer using BPE
     tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
     # Use ByteLevel PreTokenizer
-    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=False)
+    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
     # Use ByteLevel Decoder
-    tok_r.decoder = decoders.ByteLevel.new()
+    tok_r.decoder = decoders.ByteLevel()
 elif args.type == "bert":
     print("Running Bert tokenizer")
     tok_p = BertTokenizer.from_pretrained(args.vocab)
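
The hunk above is the GPT-2 branch of example.py: the new Python bindings expose plain constructors (pre_tokenizers.ByteLevel(...), decoders.ByteLevel()) in place of the old .new() factories. A hedged sketch of that branch in isolation, with placeholder vocab.json/merges.txt paths standing in for args.vocab and args.merges:

from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import BPE

# "vocab.json" and "merges.txt" are placeholder paths to a trained GPT-2 vocab.
tok_r = Tokenizer(BPE.from_files("vocab.json", "merges.txt"))
tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)  # constructor, not .new()
tok_r.decoder = decoders.ByteLevel()

encoded = tok_r.encode("Hello, world!")
print(encoded.tokens)
print(tok_r.decode(encoded.ids))  # ByteLevel decoder restores the original text
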
@@ -70,16 +70,16 @@ elif args.type == "bert":
                                            unk_token="[UNK]",
                                            max_input_chars_per_word=100)
     )
-    tok_r.normalizer = BertNormalizer.new(
+    tok_r.normalizer = BertNormalizer(
         clean_text=True,
         handle_chinese_chars=True,
         strip_accents=True,
         lowercase=True,
     )
     # tok_r.pre_tokenizer = pre_tokenizers.Whitespace.new()
-    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer.new()
-    tok_r.decoder = decoders.WordPiece.new()
-    tok_r.post_processor = BertProcessing.new(
+    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
+    tok_r.decoder = decoders.WordPiece()
+    tok_r.post_processor = BertProcessing(
         ("[SEP]", tok_r.token_to_id("[SEP]")),
         ("[CLS]", tok_r.token_to_id("[CLS]")),
     )
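
Taken together, the BERT branch now assembles the whole pipeline with plain constructors as well. A sketch under the same assumptions (a placeholder vocab.txt path in place of args.vocab; imports as exposed by the Python bindings of this vintage):

from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.processors import BertProcessing

# "vocab.txt" is a placeholder path to a BERT WordPiece vocab file.
tok_r = Tokenizer(WordPiece.from_files("vocab.txt",
                                       unk_token="[UNK]",
                                       max_input_chars_per_word=100))
tok_r.normalizer = BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)
tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
tok_r.decoder = decoders.WordPiece()
# Wrap encodings with [CLS] ... [SEP], resolving the special-token ids from the vocab.
tok_r.post_processor = BertProcessing(
    ("[SEP]", tok_r.token_to_id("[SEP]")),
    ("[CLS]", tok_r.token_to_id("[CLS]")),
)

encoded = tok_r.encode("Hello, world!")
print(encoded.tokens)  # expected shape: ['[CLS]', 'hello', ',', 'world', '!', '[SEP]']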