mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Allow initial_alphabet on UnigramTrainer
This commit is contained in:
@ -193,6 +193,17 @@ impl PyUnigramTrainer {
"unk_token" => builder.unk_token(val.extract()?),
"max_piece_length" => builder.max_piece_length(val.extract()?),
"seed_size" => builder.seed_size(val.extract()?),
"initial_alphabet" => {
    let alphabet: Vec<String> = val.extract()?;
    builder.initial_alphabet(
        alphabet
            .into_iter()
            .map(|s| s.chars().next())
            .filter(|c| c.is_some())
            .map(|c| c.unwrap())
            .collect(),
    )
}
"special_tokens" => builder.special_tokens(
    val.cast_as::<PyList>()?
        .into_iter()
Reference in New Issue
Block a user