Add WordLevel trainer

Anthony MOI
2020-10-07 16:46:55 -04:00
committed by Anthony MOI
parent a745321aca
commit 059d43b265
5 changed files with 185 additions and 0 deletions


@@ -4,4 +4,5 @@ from .. import trainers
Trainer = trainers.Trainer
BpeTrainer = trainers.BpeTrainer
UnigramTrainer = trainers.UnigramTrainer
WordLevelTrainer = trainers.WordLevelTrainer
WordPieceTrainer = trainers.WordPieceTrainer


@@ -44,6 +44,7 @@ fn trainers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<trainers::PyTrainer>()?;
    m.add_class::<trainers::PyBpeTrainer>()?;
    m.add_class::<trainers::PyWordPieceTrainer>()?;
    m.add_class::<trainers::PyWordLevelTrainer>()?;
    m.add_class::<trainers::PyUnigramTrainer>()?;
    Ok(())
}


@@ -242,6 +242,69 @@ impl PyWordPieceTrainer {
}
}

/// Capable of training a WordLevel model
///
/// Args:
///     vocab_size: unsigned int:
///         The size of the final vocabulary, including all tokens.
///
///     min_frequency: unsigned int:
///         The minimum frequency a word should have in order to be kept
///         in the vocabulary.
///
///     show_progress: boolean:
///         Whether to show progress bars while training.
///
///     special_tokens: List[Union[str, AddedToken]]:
///         A list of special tokens the model should know of.
///
/// Returns:
///     Trainer
#[pyclass(extends=PyTrainer, name=WordLevelTrainer)]
pub struct PyWordLevelTrainer {}

#[pymethods]
impl PyWordLevelTrainer {
    /// Create a new WordLevelTrainer with the given configuration
    #[new]
    #[args(kwargs = "**")]
    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
        let mut trainer = tk::models::wordlevel::WordLevelTrainer::default();

        if let Some(kwargs) = kwargs {
            for (key, val) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "vocab_size" => trainer.vocab_size = val.extract()?,
                    "min_frequency" => trainer.min_frequency = val.extract()?,
                    "show_progress" => trainer.show_progress = val.extract()?,
                    "special_tokens" => {
                        trainer.special_tokens = val
                            .cast_as::<PyList>()?
                            .into_iter()
                            .map(|token| {
                                if let Ok(content) = token.extract::<String>() {
                                    Ok(PyAddedToken::from(content, Some(true)).get_token())
                                } else if let Ok(mut token) =
                                    token.extract::<PyRefMut<PyAddedToken>>()
                                {
                                    token.is_special_token = true;
                                    Ok(token.get_token())
                                } else {
                                    Err(exceptions::PyTypeError::new_err(
                                        "special_tokens must be a List[Union[str, AddedToken]]",
                                    ))
                                }
                            })
                            .collect::<PyResult<Vec<_>>>()?
                    }
                    _ => println!("Ignored unknown kwargs option {}", key),
                }
            }
        }

        Ok((PyWordLevelTrainer {}, PyTrainer::new(trainer.into())))
    }
}

/// Capable of training a Unigram model
///
/// Args:


@@ -8,6 +8,10 @@ use std::io::{BufReader, Read, Write};
use std::path::{Path, PathBuf};
mod serialization;
mod trainer;
// Re-export
pub use trainer::*;
type Vocab = HashMap<String, u32>;


@@ -0,0 +1,116 @@
use super::WordLevel;
use crate::{AddedToken, Result, Trainer};
use std::collections::HashMap;

pub struct WordLevelTrainer {
    /// The minimum frequency a word must have to be part of the vocabulary
    pub min_frequency: u32,
    /// The target vocabulary size
    pub vocab_size: usize,
    /// Whether to show progress while training
    pub show_progress: bool,
    /// A list of special tokens that the model should know of
    pub special_tokens: Vec<AddedToken>,
}
impl Default for WordLevelTrainer {
    fn default() -> Self {
        Self {
            min_frequency: 0,
            vocab_size: 30_000,
            show_progress: true,
            special_tokens: vec![],
        }
    }
}
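
All of the trainer's fields are public and a `Default` implementation is provided, so a custom configuration can use struct-update syntax. A minimal sketch (the field values here are illustrative, not part of this diff):

let trainer = WordLevelTrainer {
    vocab_size: 10_000, // keep at most 10k tokens
    min_frequency: 2,   // drop words seen only once
    ..WordLevelTrainer::default()
};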
impl WordLevelTrainer {
    fn train(&self, word_counts: HashMap<String, u32>) -> Result<(WordLevel, Vec<AddedToken>)> {
        let mut ordered_counts = word_counts.into_iter().collect::<Vec<_>>();
        ordered_counts.sort_by_key(|(_, n)| std::cmp::Reverse(*n));

        let word_level = WordLevel::builder()
            .vocab(
                self.special_tokens
                    .iter()
                    .map(|token| token.content.clone())
                    .chain(
                        ordered_counts
                            .into_iter()
                            .filter(|(_, n)| *n >= self.min_frequency)
                            .map(|(w, _)| w),
                    )
                    .take(self.vocab_size)
                    .enumerate()
                    .map(|(i, w)| (w, i as u32))
                    .collect(),
            )
            .build();

        Ok((word_level, self.special_tokens.clone()))
    }
}
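
The vocabulary selection in `train` is a single iterator chain: special tokens come first, the remaining words follow in descending count order after the `min_frequency` filter, the whole sequence is capped at `vocab_size`, and each token's position becomes its id. A standalone sketch of that pipeline, using plain strings in place of `AddedToken` and illustrative values:

use std::collections::HashMap;

let special = vec!["[UNK]".to_string()];
let ordered = vec![("the".to_string(), 25u32), ("red".to_string(), 1)];
let (min_frequency, vocab_size) = (2u32, 5);

let vocab: HashMap<String, u32> = special
    .into_iter()
    // Special tokens are always kept and always get the lowest ids
    .chain(
        ordered
            .into_iter()
            .filter(|(_, n)| *n >= min_frequency)
            .map(|(w, _)| w),
    )
    .take(vocab_size)
    .enumerate()
    .map(|(i, w)| (w, i as u32))
    .collect();

// "[UNK]" -> 0 and "the" -> 1; "red" is dropped by the frequency filter.
assert_eq!(vocab.len(), 2);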
impl Trainer for WordLevelTrainer {
    type Model = WordLevel;

    /// Train a WordLevel model
    fn train(&self, word_counts: HashMap<String, u32>) -> Result<(WordLevel, Vec<AddedToken>)> {
        self.train(word_counts)
    }

    /// Whether we should show progress
    fn should_show_progress(&self) -> bool {
        self.show_progress
    }
}
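
End to end, training from pre-computed word counts looks like the unit test below. A minimal sketch (counts are illustrative; the inherent `train` is crate-private, so this works inside the crate, while external callers go through the `Trainer` trait):

use std::collections::HashMap;

let word_counts: HashMap<String, u32> = [("hello".into(), 3), ("world".into(), 2)]
    .iter()
    .cloned()
    .collect();

let mut trainer = WordLevelTrainer::default();
trainer.min_frequency = 2;

// Both words pass the frequency filter: "hello" gets id 0, "world" id 1.
let (model, _special_tokens) = trainer.train(word_counts).unwrap();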
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_train() {
        let word_counts: HashMap<String, u32> = [
            ("the".into(), 25),
            ("roses".into(), 22),
            ("are".into(), 24),
            ("red".into(), 12),
            ("violets".into(), 10),
            ("blue".into(), 16),
        ]
        .iter()
        .cloned()
        .collect();

        let mut trainer = WordLevelTrainer::default();
        trainer.vocab_size = 5;

        let (model, _) = trainer.train(word_counts.clone()).unwrap();
        let expected_vocab: HashMap<String, u32> = [
            ("the".into(), 0),
            ("are".into(), 1),
            ("roses".into(), 2),
            ("blue".into(), 3),
            ("red".into(), 4),
        ]
        .iter()
        .cloned()
        .collect();
        assert_eq!(model.vocab, expected_vocab);

        // If we specify a min_frequency, words below it are dropped even if
        // there is room left in the vocabulary.
        trainer.min_frequency = 15;
        let (model, _) = trainer.train(word_counts).unwrap();
        let expected_vocab: HashMap<String, u32> = [
            ("the".into(), 0),
            ("are".into(), 1),
            ("roses".into(), 2),
            ("blue".into(), 3),
        ]
        .iter()
        .cloned()
        .collect();
        assert_eq!(model.vocab, expected_vocab);
    }
}