mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 08:45:38 +00:00
Python - Black auto formatting
This commit is contained in:
@ -4,21 +4,24 @@ import glob
|
||||
from tokenizers import BertWordPieceTokenizer
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--files",
|
||||
default=None,
|
||||
metavar="path",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The files to use as training; accept '**/*.txt' type of patterns \
|
||||
if enclosed in quotes")
|
||||
parser.add_argument("--out",
|
||||
default="./",
|
||||
type=str,
|
||||
help="Path to the output directory, where the files will be saved")
|
||||
parser.add_argument("--name",
|
||||
default="bert-wordpiece",
|
||||
type=str,
|
||||
help="The name of the output vocab files")
|
||||
parser.add_argument(
|
||||
"--files",
|
||||
default=None,
|
||||
metavar="path",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The files to use as training; accept '**/*.txt' type of patterns \
|
||||
if enclosed in quotes",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--out",
|
||||
default="./",
|
||||
type=str,
|
||||
help="Path to the output directory, where the files will be saved",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--name", default="bert-wordpiece", type=str, help="The name of the output vocab files"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
files = glob.glob(args.files)
|
||||
@ -29,11 +32,7 @@ if not files:
|
||||
|
||||
# Initialize an empty tokenizer
|
||||
tokenizer = BertWordPieceTokenizer(
|
||||
clean_text=True,
|
||||
handle_chinese_chars=True,
|
||||
strip_accents=True,
|
||||
lowercase=True,
|
||||
|
||||
clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
|
||||
)
|
||||
|
||||
# And then train
|
||||
@ -44,7 +43,7 @@ trainer = tokenizer.train(
|
||||
show_progress=True,
|
||||
special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
|
||||
limit_alphabet=1000,
|
||||
wordpieces_prefix="##"
|
||||
wordpieces_prefix="##",
|
||||
)
|
||||
|
||||
# Save the files
|
||||
|
Reference in New Issue
Block a user