Python - Improve documentation for trainers

This commit is contained in:
Anthony MOI
2020-11-20 17:54:53 -05:00
committed by Anthony MOI
parent 64441b54b1
commit 58e1d8de67
3 changed files with 140 additions and 138 deletions


@@ -5,37 +5,6 @@ class Trainer:
     This class is not supposed to be instantiated directly. Instead, any implementation of a
     Trainer will return an instance of this class when instantiated.
-
-    Args:
-        vocab_size: unsigned int:
-            The size of the final vocabulary, including all tokens and alphabet.
-
-        min_frequency: unsigned int:
-            The minimum frequency a pair should have in order to be merged.
-
-        show_progress: boolean:
-            Whether to show progress bars while training.
-
-        special_tokens: List[Union[str, AddedToken]]:
-            A list of special tokens the model should know of.
-
-        limit_alphabet: unsigned int:
-            The maximum different characters to keep in the alphabet.
-
-        initial_alphabet: List[str]:
-            A list of characters to include in the initial alphabet, even
-            if not seen in the training dataset.
-            If the strings contain more than one character, only the first one
-            is kept.
-
-        continuing_subword_prefix: Optional[str]:
-            A prefix to be used for every subword that is not a beginning-of-word.
-
-        end_of_word_suffix: Optional[str]:
-            A suffix to be used for every subword that is a end-of-word.
-
-    Returns:
-        Trainer
     """

    def __init__(
@@ -53,31 +22,56 @@ class Trainer:
 class BpeTrainer(Trainer):
     """
-    Capable of training a BPE model
-    """
-
-class UnigramTrainer(Trainer):
-    """
-    Capable of training a Unigram model
+    Trainer capable of training a BPE model

     Args:
-        vocab_size: unsigned int:
+        vocab_size (:obj:`int`, `optional`):
             The size of the final vocabulary, including all tokens and alphabet.

-        show_progress: boolean:
+        min_frequency (:obj:`int`, `optional`):
+            The minimum frequency a pair should have in order to be merged.
+
+        show_progress (:obj:`bool`, `optional`):
             Whether to show progress bars while training.

-        special_tokens: List[Union[str, AddedToken]]:
+        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
             A list of special tokens the model should know of.

-        initial_alphabet: List[str]:
+        limit_alphabet (:obj:`int`, `optional`):
+            The maximum number of different characters to keep in the alphabet.
+
+        initial_alphabet (:obj:`List[str]`, `optional`):
             A list of characters to include in the initial alphabet, even
             if not seen in the training dataset.
             If the strings contain more than one character, only the first one
             is kept.

-    Returns:
-        Trainer
+        continuing_subword_prefix (:obj:`str`, `optional`):
+            A prefix to be used for every subword that is not a beginning-of-word.
+
+        end_of_word_suffix (:obj:`str`, `optional`):
+            A suffix to be used for every subword that is an end-of-word.
+    """
+
+class UnigramTrainer(Trainer):
+    """
+    Trainer capable of training a Unigram model
+
+    Args:
+        vocab_size (:obj:`int`):
+            The size of the final vocabulary, including all tokens and alphabet.
+
+        show_progress (:obj:`bool`):
+            Whether to show progress bars while training.
+
+        special_tokens (:obj:`List[Union[str, AddedToken]]`):
+            A list of special tokens the model should know of.
+
+        initial_alphabet (:obj:`List[str]`):
+            A list of characters to include in the initial alphabet, even
+            if not seen in the training dataset.
+            If the strings contain more than one character, only the first one
+            is kept.
     """

    def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]):
@@ -85,58 +79,53 @@ class UnigramTrainer(Trainer):
 class WordLevelTrainer(Trainer):
     """
-    Capable of training a WorldLevel model
+    Trainer capable of training a WordLevel model

     Args:
-        vocab_size: unsigned int:
+        vocab_size (:obj:`int`, `optional`):
             The size of the final vocabulary, including all tokens and alphabet.

-        min_frequency: unsigned int:
+        min_frequency (:obj:`int`, `optional`):
             The minimum frequency a pair should have in order to be merged.

-        show_progress: boolean:
+        show_progress (:obj:`bool`, `optional`):
             Whether to show progress bars while training.

-        special_tokens: List[Union[str, AddedToken]]:
+        special_tokens (:obj:`List[Union[str, AddedToken]]`):
             A list of special tokens the model should know of.
-
-    Returns:
-        Trainer
     """

 class WordPieceTrainer(Trainer):
     """
-    Capable of training a WordPiece model
+    Trainer capable of training a WordPiece model

     Args:
-        vocab_size: unsigned int:
+        vocab_size (:obj:`int`, `optional`):
             The size of the final vocabulary, including all tokens and alphabet.

-        min_frequency: unsigned int:
+        min_frequency (:obj:`int`, `optional`):
             The minimum frequency a pair should have in order to be merged.

-        show_progress: boolean:
+        show_progress (:obj:`bool`, `optional`):
             Whether to show progress bars while training.

-        special_tokens: List[Union[str, AddedToken]]:
+        special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
             A list of special tokens the model should know of.

-        limit_alphabet: unsigned int:
+        limit_alphabet (:obj:`int`, `optional`):
             The maximum number of different characters to keep in the alphabet.

-        initial_alphabet: List[str]:
+        initial_alphabet (:obj:`List[str]`, `optional`):
             A list of characters to include in the initial alphabet, even
             if not seen in the training dataset.
             If the strings contain more than one character, only the first one
             is kept.

-        continuing_subword_prefix: Optional[str]:
+        continuing_subword_prefix (:obj:`str`, `optional`):
             A prefix to be used for every subword that is not a beginning-of-word.

-        end_of_word_suffix: Optional[str]:
+        end_of_word_suffix (:obj:`str`, `optional`):
             A suffix to be used for every subword that is an end-of-word.
-
-    Returns:
-        Trainer
     """

    def __init__(
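The rewritten stubs above double as a reference for the keyword arguments each trainer accepts. As a quick illustration, here is a minimal sketch of training a BPE tokenizer with the documented options. It assumes an installed `tokenizers` package; `corpus.txt` is a placeholder path, and the exact `Tokenizer.train` signature has shifted between releases, so treat this as illustrative rather than canonical.

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    # Wrap an (initially empty) BPE model in a Tokenizer
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

    # Every argument documented above is an optional keyword
    trainer = BpeTrainer(
        vocab_size=30000,
        min_frequency=2,
        special_tokens=["[UNK]", "[CLS]", "[SEP]"],
    )

    # "corpus.txt" is a placeholder for your training data
    tokenizer.train(files=["corpus.txt"], trainer=trainer)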


@@ -15,38 +15,7 @@ use crate::tokenizer::PyAddedToken;
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// Trainer will return an instance of this class when instantiated.
-///
-/// Args:
-///     vocab_size: unsigned int:
-///         The size of the final vocabulary, including all tokens and alphabet.
-///
-///     min_frequency: unsigned int:
-///         The minimum frequency a pair should have in order to be merged.
-///
-///     show_progress: boolean:
-///         Whether to show progress bars while training.
-///
-///     special_tokens: List[Union[str, AddedToken]]:
-///         A list of special tokens the model should know of.
-///
-///     limit_alphabet: unsigned int:
-///         The maximum different characters to keep in the alphabet.
-///
-///     initial_alphabet: List[str]:
-///         A list of characters to include in the initial alphabet, even
-///         if not seen in the training dataset.
-///         If the strings contain more than one character, only the first one
-///         is kept.
-///
-///     continuing_subword_prefix: Optional[str]:
-///         A prefix to be used for every subword that is not a beginning-of-word.
-///
-///     end_of_word_suffix: Optional[str]:
-///         A suffix to be used for every subword that is a end-of-word.
-///
-/// Returns:
-///     Trainer
-#[pyclass(name=Trainer)]
+#[pyclass(module = "tokenizers.trainers", name=Trainer)]
 #[derive(Clone)]
 #[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=None, end_of_word_suffix=None)"]
 pub struct PyTrainer {
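Note the `module = "tokenizers.trainers"` added to the `#[pyclass]` attribute here and on each trainer below: a PyO3 class that does not declare a module reports `__module__` as `builtins`, which breaks the qualified names the Sphinx automodule directives rely on. A quick way to observe the effect from Python, assuming the bindings are built and installed:

    from tokenizers import trainers

    # Prints "tokenizers.trainers" once module= is set in the pyclass
    # attribute; without it, PyO3 would report "builtins" instead.
    print(trainers.BpeTrainer.__module__)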
@@ -108,15 +77,39 @@ where
     }
 }

-/// Capable of training a BPE model
-#[pyclass(extends=PyTrainer, name=BpeTrainer)]
+/// Trainer capable of training a BPE model
+///
+/// Args:
+///     vocab_size (:obj:`int`, `optional`):
+///         The size of the final vocabulary, including all tokens and alphabet.
+///
+///     min_frequency (:obj:`int`, `optional`):
+///         The minimum frequency a pair should have in order to be merged.
+///
+///     show_progress (:obj:`bool`, `optional`):
+///         Whether to show progress bars while training.
+///
+///     special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
+///         A list of special tokens the model should know of.
+///
+///     limit_alphabet (:obj:`int`, `optional`):
+///         The maximum number of different characters to keep in the alphabet.
+///
+///     initial_alphabet (:obj:`List[str]`, `optional`):
+///         A list of characters to include in the initial alphabet, even
+///         if not seen in the training dataset.
+///         If the strings contain more than one character, only the first one
+///         is kept.
+///
+///     continuing_subword_prefix (:obj:`str`, `optional`):
+///         A prefix to be used for every subword that is not a beginning-of-word.
+///
+///     end_of_word_suffix (:obj:`str`, `optional`):
+///         A suffix to be used for every subword that is an end-of-word.
+#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=BpeTrainer)]
 pub struct PyBpeTrainer {}

 #[pymethods]
 impl PyBpeTrainer {
-    /// new(/ vocab_size, min_frequency)
-    /// --
-    ///
-    /// Create a new BpeTrainer with the given configuration
     #[new]
     #[args(kwargs = "**")]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
@@ -176,46 +169,40 @@ impl PyBpeTrainer
     }
 }

-/// Capable of training a WordPiece model
+/// Trainer capable of training a WordPiece model
+///
 /// Args:
-///     vocab_size: unsigned int:
+///     vocab_size (:obj:`int`, `optional`):
 ///         The size of the final vocabulary, including all tokens and alphabet.
 ///
-///     min_frequency: unsigned int:
+///     min_frequency (:obj:`int`, `optional`):
 ///         The minimum frequency a pair should have in order to be merged.
 ///
-///     show_progress: boolean:
+///     show_progress (:obj:`bool`, `optional`):
 ///         Whether to show progress bars while training.
 ///
-///     special_tokens: List[Union[str, AddedToken]]:
+///     special_tokens (:obj:`List[Union[str, AddedToken]]`, `optional`):
 ///         A list of special tokens the model should know of.
 ///
-///     limit_alphabet: unsigned int:
+///     limit_alphabet (:obj:`int`, `optional`):
 ///         The maximum number of different characters to keep in the alphabet.
 ///
-///     initial_alphabet: List[str]:
+///     initial_alphabet (:obj:`List[str]`, `optional`):
 ///         A list of characters to include in the initial alphabet, even
 ///         if not seen in the training dataset.
 ///         If the strings contain more than one character, only the first one
 ///         is kept.
 ///
-///     continuing_subword_prefix: Optional[str]:
+///     continuing_subword_prefix (:obj:`str`, `optional`):
 ///         A prefix to be used for every subword that is not a beginning-of-word.
 ///
-///     end_of_word_suffix: Optional[str]:
+///     end_of_word_suffix (:obj:`str`, `optional`):
 ///         A suffix to be used for every subword that is an end-of-word.
-///
-/// Returns:
-///     Trainer
-#[pyclass(extends=PyTrainer, name=WordPieceTrainer)]
+#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordPieceTrainer)]
 #[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet=[], continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
 pub struct PyWordPieceTrainer {}

 #[pymethods]
 impl PyWordPieceTrainer {
-    /// new(/ vocab_size, min_frequency)
-    /// --
-    ///
-    /// Create a new BpeTrainer with the given configuration
     #[new]
     #[args(kwargs = "**")]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
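The `continuing_subword_prefix=\"##\"` default in the text signature above is WordPiece's distinguishing trait. A short sketch of configuring it, under the same assumptions as the BPE example earlier (placeholder corpus path, version-dependent `train` signature):

    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.trainers import WordPieceTrainer

    tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # "##" marks non-initial subwords, e.g. "playing" -> "play", "##ing"
    trainer = WordPieceTrainer(
        vocab_size=30000,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
        continuing_subword_prefix="##",
    )
    tokenizer.train(files=["corpus.txt"], trainer=trainer)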
@@ -276,28 +263,24 @@ impl PyWordPieceTrainer
     }
 }

-/// Capable of training a WorldLevel model
+/// Trainer capable of training a WordLevel model
 ///
 /// Args:
-///     vocab_size: unsigned int:
+///     vocab_size (:obj:`int`, `optional`):
 ///         The size of the final vocabulary, including all tokens and alphabet.
 ///
-///     min_frequency: unsigned int:
+///     min_frequency (:obj:`int`, `optional`):
 ///         The minimum frequency a pair should have in order to be merged.
 ///
-///     show_progress: boolean:
+///     show_progress (:obj:`bool`, `optional`):
 ///         Whether to show progress bars while training.
 ///
-///     special_tokens: List[Union[str, AddedToken]]:
+///     special_tokens (:obj:`List[Union[str, AddedToken]]`):
 ///         A list of special tokens the model should know of.
-///
-/// Returns:
-///     Trainer
-#[pyclass(extends=PyTrainer, name=WordLevelTrainer)]
+#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordLevelTrainer)]
 pub struct PyWordLevelTrainer {}

 #[pymethods]
 impl PyWordLevelTrainer {
-    /// Create a new WordLevelTrainer with the given configuration
     #[new]
     #[args(kwargs = "**")]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
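WordLevelTrainer takes only the four arguments documented above; since WordLevel maps whole words to ids, `min_frequency` is the main lever for pruning rare words. A minimal sketch, assuming a version where a WordLevel model can be constructed empty:

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordLevelTrainer

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()

    # Words seen fewer than 2 times fall back to the [UNK] token
    trainer = WordLevelTrainer(min_frequency=2, special_tokens=["[UNK]"])
    tokenizer.train(files=["corpus.txt"], trainer=trainer)  # placeholder path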
@@ -354,32 +337,28 @@ impl PyWordLevelTrainer
     }
 }

-/// Capable of training a Unigram model
+/// Trainer capable of training a Unigram model
 ///
 /// Args:
-///     vocab_size: unsigned int:
+///     vocab_size (:obj:`int`):
 ///         The size of the final vocabulary, including all tokens and alphabet.
 ///
-///     show_progress: boolean:
+///     show_progress (:obj:`bool`):
 ///         Whether to show progress bars while training.
 ///
-///     special_tokens: List[Union[str, AddedToken]]:
+///     special_tokens (:obj:`List[Union[str, AddedToken]]`):
 ///         A list of special tokens the model should know of.
 ///
-///     initial_alphabet: List[str]:
+///     initial_alphabet (:obj:`List[str]`):
 ///         A list of characters to include in the initial alphabet, even
 ///         if not seen in the training dataset.
 ///         If the strings contain more than one character, only the first one
 ///         is kept.
-///
-/// Returns:
-///     Trainer
-#[pyclass(extends=PyTrainer, name=UnigramTrainer)]
+#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=UnigramTrainer)]
 #[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[])"]
 pub struct PyUnigramTrainer {}

 #[pymethods]
 impl PyUnigramTrainer {
-    /// Create a new UnigramTrainer with the given configuration
     #[new]
     #[args(kwargs = "**")]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
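Unlike the other trainers, UnigramTrainer defaults to `vocab_size=8000` and exposes no frequency or alphabet limits, as its docstring above reflects. A corresponding sketch, under the same placeholder assumptions as the earlier examples:

    from tokenizers import Tokenizer
    from tokenizers.models import Unigram
    from tokenizers.trainers import UnigramTrainer

    # An empty Unigram model; training replaces it with the learned pieces
    tokenizer = Tokenizer(Unigram())

    trainer = UnigramTrainer(vocab_size=8000, special_tokens=["<unk>"])
    tokenizer.train(files=["corpus.txt"], trainer=trainer)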


@@ -44,3 +44,37 @@ Added Tokens
 .. autoclass:: tokenizers.AddedToken
     :members:
+
+Models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.models
+    :members:
+
+Normalizers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.normalizers
+    :members:
+
+Pre-tokenizers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.pre_tokenizers
+    :members:
+
+Post-processors
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.processors
+    :members:
+
+Trainers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. automodule:: tokenizers.trainers
+    :members:
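These automodule directives are what pull the reworked docstrings into the rendered API reference; the :obj:`...` markup only takes effect through Sphinx. To eyeball a docstring without building the docs, the raw text is visible from the interpreter:

    import tokenizers.trainers

    # Shows the raw docstring, Sphinx roles and all; the rendered
    # version comes from the automodule directives above.
    help(tokenizers.trainers.BpeTrainer)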