mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 16:49:27 +00:00)
Python - Add bert wordpiece training example
bindings/python/examples/train_bert_wordpiece.py (new file, 57 lines)
@@ -0,0 +1,57 @@
import argparse
import glob

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, normalizers


parser = argparse.ArgumentParser()
parser.add_argument("--files",
                    default=None,
                    metavar="path",
                    type=str,
                    required=True,
                    help="The files to use as training; accept '**/*.txt' type of patterns \
                          if enclosed in quotes")
parser.add_argument("--out",
                    default="./",
                    type=str,
                    help="Path to the output directory, where the files will be saved")
parser.add_argument("--name",
                    default="bert-wordpiece",
                    type=str,
                    help="The name of the output vocab files")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"File does not exist: {args.files}")
    exit(1)


# Initialize an empty tokenizer
tokenizer = Tokenizer(models.WordPiece.empty())

# Customize all the steps
tokenizer.with_normalizer(normalizers.BertNormalizer.new(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
))
tokenizer.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
tokenizer.with_decoder(decoders.WordPiece.new())

# And then train
trainer = trainers.WordPieceTrainer.new(
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
    special_tokens=[ "<s>", "<unk>", "<pad>", "</s>" ],
    limit_alphabet=1000,
    continuing_subword_prefix="##"
)
tokenizer.train(trainer, files)

# Save the files
tokenizer.model.save(args.out, args.name)
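A quick usage sketch for this example. It is not part of the commit; the invocation, the output file name, and the encode()/.tokens calls below are assumptions about the experimental bindings API of this revision:

# Train from a quoted glob pattern so the shell does not expand it:
#
#   python train_bert_wordpiece.py --files "data/**/*.txt" --out ./out --name bert-wordpiece
#
# tokenizer.model.save(args.out, args.name) should then write the trained vocab under
# ./out (the exact file name, e.g. bert-wordpiece-vocab.txt, is an assumption).
# The in-memory tokenizer can be exercised right after train(); encode() and the
# .tokens attribute are also assumptions about this API revision:
encoding = tokenizer.encode("Hello, how are you?")
print(encoding.tokens)   # expected: lowercased WordPiece pieces, e.g. 'hello' plus '##'-prefixed subwords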
The existing BPE training example also gets a fix for the closing </s> special token:

@@ -40,7 +40,7 @@ trainer = trainers.BpeTrainer.new(
     vocab_size=50000,
     min_frequency=2,
     show_progress=True,
-    special_tokens=[ "<s>", "<pad>", "</s" ],
+    special_tokens=[ "<s>", "<pad>", "</s>" ],
     initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
 )
 tokenizer.train(trainer, files)
@ -133,4 +133,11 @@ impl WordPiece {
|
|||||||
}),
|
}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[staticmethod]
|
||||||
|
fn empty() -> Model {
|
||||||
|
Model {
|
||||||
|
model: Container::Owned(Box::new(tk::models::wordpiece::WordPiece::default())),
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
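For reference, this #[staticmethod] is what backs models.WordPiece.empty() on the Python side, the call the new example starts from. A minimal sketch, assuming the bindings are built from this revision:

from tokenizers import Tokenizer, models

# An empty, untrained WordPiece model; it only becomes useful after training.
tokenizer = Tokenizer(models.WordPiece.empty())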