mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 20:58:22 +00:00
Allow initial_alphabet on UnigramTrainer
This commit is contained in:
@@ -46,7 +46,7 @@ class BpeTrainer(Trainer):
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contains more than one character, only the first one
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
@@ -98,7 +98,7 @@ class WordPieceTrainer(Trainer):
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contains more than one character, only the first one
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
continuing_subword_prefix: Optional[str]:
|
||||
@@ -136,6 +136,12 @@ class UnigramTrainer(Trainer):
|
||||
special_tokens: List[Union[str, AddedToken]]:
|
||||
A list of special tokens the model should know of.
|
||||
|
||||
initial_alphabet: List[str]:
|
||||
A list of characters to include in the initial alphabet, even
|
||||
if not seen in the training dataset.
|
||||
If the strings contain more than one character, only the first one
|
||||
is kept.
|
||||
|
||||
Returns:
|
||||
Trainer
|
||||
"""
|
||||
|
||||
@@ -193,6 +193,17 @@ impl PyUnigramTrainer {
|
||||
"unk_token" => builder.unk_token(val.extract()?),
|
||||
"max_piece_length" => builder.max_piece_length(val.extract()?),
|
||||
"seed_size" => builder.seed_size(val.extract()?),
|
||||
"initial_alphabet" => {
|
||||
let alphabet: Vec<String> = val.extract()?;
|
||||
builder.initial_alphabet(
|
||||
alphabet
|
||||
.into_iter()
|
||||
.map(|s| s.chars().next())
|
||||
.filter(|c| c.is_some())
|
||||
.map(|c| c.unwrap())
|
||||
.collect(),
|
||||
)
|
||||
}
|
||||
"special_tokens" => builder.special_tokens(
|
||||
val.cast_as::<PyList>()?
|
||||
.into_iter()
|
||||
|
||||
Reference in New Issue
Block a user