mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-08 21:58:18 +00:00
Improve docs and fix tests around training
This commit is contained in:
@@ -1022,6 +1022,45 @@ class Tokenizer:
|
||||
:obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
|
||||
"""
|
||||
pass
|
||||
def train(self, files, trainer=None):
    """
    Train the Tokenizer using the given files.

    Reads the files line by line, while keeping all the whitespace, even new lines.
    If you want to train from data stored in-memory, you can check
    :meth:`~tokenizers.Tokenizer.train_from_iterator`

    Args:
        files (:obj:`List[str]`):
            A list of paths to the files that we should use for training

        trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
            An optional trainer that should be used to train our Model
    """
    # Stub: the real implementation is provided by the native (Rust) extension.
    pass
|
||||
def train_from_iterator(self, iterator, trainer=None, length=None):
    """
    Train the Tokenizer using the provided iterator.

    Anything that behaves like a Python Iterator works, for example:

        * A list of sequences :obj:`List[str]`
        * A generator that yields :obj:`str` or :obj:`List[str]`
        * A Numpy array of strings
        * ...

    Args:
        iterator (:obj:`Iterator`):
            Any iterator over strings or list of strings

        trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
            An optional trainer that should be used to train our Model

        length (:obj:`int`, `optional`):
            The total number of sequences in the iterator. This is used to
            provide meaningful progress tracking
    """
    # Stub: the real implementation is provided by the native (Rust) extension.
    pass
|
||||
@property
|
||||
def truncation(self):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user