Python - Add bert wordpiece training example

2025-08-22 16:25:30 +00:00 · 2020-01-03 16:51:39 -05:00
parent 6e3efe8954
commit fab4e96b51
3 changed files with 65 additions and 1 deletions
--- a/bindings/python/examples/train_bert_wordpiece.py
+++ b/bindings/python/examples/train_bert_wordpiece.py
@ -0,0 +1,57 @@
+import argparse
+import glob
+
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, normalizers
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--files",
+                    default=None,
+                    metavar="path",
+                    type=str,
+                    required=True,
+                    help="The files to use as training; accept '**/*.txt' type of patterns \
+                          if enclosed in quotes")
+parser.add_argument("--out",
+                    default="./",
+                    type=str,
+                    help="Path to the output directory, where the files will be saved")
+parser.add_argument("--name",
+                    default="bert-wordpiece",
+                    type=str,
+                    help="The name of the output vocab files")
+args = parser.parse_args()
+
+files = glob.glob(args.files)
+if not files:
+    print(f"File does not exist: {args.files}")
+    exit(1)
+
+
+# Initialize an empty tokenizer
+tokenizer = Tokenizer(models.WordPiece.empty())
+
+# Customize all the steps
+tokenizer.with_normalizer(normalizers.BertNormalizer.new(
+    clean_text=True,
+    handle_chinese_chars=True,
+    strip_accents=True,
+    lowercase=True,
+))
+tokenizer.with_pre_tokenizer(pre_tokenizers.BertPreTokenizer.new())
+tokenizer.with_decoder(decoders.WordPiece.new())
+
+# And then train
+trainer = trainers.WordPieceTrainer.new(
+    vocab_size=50000,
+    min_frequency=2,
+    show_progress=True,
+    special_tokens=[ "<s>", "<unk>", "<pad>", "</s>" ],
+    limit_alphabet=1000,
+    continuing_subword_prefix="##"
+)
+tokenizer.train(trainer, files)
+
+# Save the files
+tokenizer.model.save(args.out, args.name)
+
--- a/bindings/python/examples/train_bytelevel_bpe.py
+++ b/bindings/python/examples/train_bytelevel_bpe.py
@ -40,7 +40,7 @@ trainer = trainers.BpeTrainer.new(
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
-    special_tokens=[ "<s>", "<pad>", "</s" ],
+    special_tokens=[ "<s>", "<pad>", "</s>" ],
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
 )
 tokenizer.train(trainer, files)
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@ -133,4 +133,11 @@ impl WordPiece {
            }),
        }
    }
+
+    /// Build a `Model` that owns a default-constructed WordPiece model.
+    #[staticmethod]
+    fn empty() -> Model {
+        let wordpiece = tk::models::wordpiece::WordPiece::default();
+        Model {
+            model: Container::Owned(Box::new(wordpiece)),
+        }
+    }
 }