import argparse
import glob

from tokenizers import BertWordPieceTokenizer

parser = argparse.ArgumentParser()
parser.add_argument(
    "--files",
    default=None,
    metavar="path",
    type=str,
    required=True,
    help="The files to use as training; accept '**/*.txt' type of patterns if enclosed in quotes",
)
parser.add_argument(
    "--out",
    default="./",
    type=str,
    help="Path to the output directory, where the files will be saved",
)
parser.add_argument(
    "--name",
    default="bert-wordpiece",
    type=str,
    help="The name of the output vocab files",
)
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"File does not exist: {args.files}")
    exit(1)

# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

# And then train it on the matched files. tokenizer.train() returns None,
# so there is no trainer object to keep; the special tokens are the
# standard BERT set.
tokenizer.train(
    files,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the vocab files to the output directory
# (on tokenizers >= 0.8, use tokenizer.save_model(args.out, args.name) instead)
tokenizer.save(args.out, args.name)
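
# Usage sketch (added for illustration; the script name, file paths, and output
# prefix below are placeholders, not part of the original script):
#
#   python train_bert_wordpiece.py --files "data/**/*.txt" --out ./vocab --name my-bert
#
# The trained vocabulary can then be reloaded and used to encode text, assuming
# the library wrote it as "<name>-vocab.txt" in the output directory:
#
#   from tokenizers import BertWordPieceTokenizer
#
#   tokenizer = BertWordPieceTokenizer("./vocab/my-bert-vocab.txt", lowercase=True)
#   encoding = tokenizer.encode("Training a WordPiece tokenizer is straightforward.")
#   print(encoding.tokens)  # e.g. ['[CLS]', 'training', 'a', 'word', '##piece', ...]
#   print(encoding.ids)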