Improve docs and fix tests around training
@@ -15,5 +15,6 @@ def batch_iterator():
    for i in range(0, len(dataset["train"]), batch_length):
        yield dataset["train"][i : i + batch_length]["text"]


# And finally train
bpe_tokenizer.train_from_iterator(batch_iterator(), length=len(dataset["train"]))
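For context, a minimal self-contained sketch of the pattern this hunk documents, assuming the Hugging Face datasets library is available. The dataset name, special token, and batch_length value are illustrative, not taken from the commit:

from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Illustrative corpus; any dataset with a "text" column works the same way.
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

bpe_tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]"])

batch_length = 1000  # number of examples yielded per batch

def batch_iterator():
    # Slice the dataset in batches instead of materializing every example at once.
    for i in range(0, len(dataset["train"]), batch_length):
        yield dataset["train"][i : i + batch_length]["text"]

# length tells the trainer how many items the generator will yield in total,
# which a generator cannot report itself (it has no len()), so progress
# reporting stays accurate.
bpe_tokenizer.train_from_iterator(
    batch_iterator(), trainer=trainer, length=len(dataset["train"])
)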