Files
tokenizers/bindings/python/examples/train_bert_wordpiece.py
Morgan Funtowicz 894f887444 Updated train_bert_wordpiece.py as well.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-14 13:32:02 +01:00

52 lines
1.3 KiB
Python

import argparse
import glob
from tokenizers import BertWordPieceTokenizer
# Command-line interface: which text files to train on, and where / under
# which name the resulting vocab files are written.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--files",
    required=True,
    type=str,
    metavar="path",
    default=None,
    help=(
        "The files to use as training; accept '**/*.txt' type of patterns "
        "if enclosed in quotes"
    ),
)
parser.add_argument(
    "--out",
    type=str,
    default="./",
    help="Path to the output directory, where the files will be saved",
)
parser.add_argument(
    "--name",
    type=str,
    default="bert-wordpiece",
    help="The name of the output vocab files",
)
args = parser.parse_args()

# Expand the --files pattern into concrete paths. recursive=True is what makes
# "**" descend into subdirectories, as the --files help text advertises;
# without it "**" behaves like a plain "*".
files = glob.glob(args.files, recursive=True)
if not files:
    # Nothing matched: abort before any training work is started. A string
    # given to SystemExit is printed to stderr and the exit status is 1. This
    # also avoids the `exit()` helper, which only exists when the `site`
    # module has been loaded.
    raise SystemExit(f"File does not exist: {args.files}")
# Keyword options forwarded unchanged to the BertWordPieceTokenizer constructor
tokenizer_options = {
    "clean_text": True,
    "handle_chinese_chars": True,
    "strip_accents": True,
    "lowercase": True,
}

# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(**tokenizer_options)
# Special tokens passed to the training call, in their original order.
# NOTE(review): the list has no "[PAD]" / "[MASK]" entries and mixes in
# "<s>", "<pad>", "</s>" - confirm this is the intended set for a BERT vocab.
special_tokens = ["[SEP]", "[UNK]", "[CLS]", "<s>", "<pad>", "</s>"]

# Remaining training settings, passed as keyword arguments
training_options = dict(
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=special_tokens,
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# And then train on the matched files. `trainer` keeps whatever train()
# returns; it is not referenced again in the visible script.
trainer = tokenizer.train(files, **training_options)
# Save the files: --out gives the target directory, --name the name used for
# the output vocab files.
output_dir = args.out
vocab_name = args.name
tokenizer.save(output_dir, vocab_name)