Update training to include new lines

This commit is contained in:
Anthony MOI
2020-01-03 20:23:58 -05:00
parent a1891387ed
commit 805dc58949

View File

@ -411,17 +411,25 @@ impl Tokenizer {
let mut words = HashMap::new(); let mut words = HashMap::new();
let file: std::fs::File = File::open(file)?; let file: std::fs::File = File::open(file)?;
let file = BufReader::new(file); let mut file = BufReader::new(file);
for line in file.lines() { let mut buf = String::new();
let line = line?; loop {
let normalized = self.normalize(&line)?; buf.clear();
// We read each line with `read_line` instead of the `Lines` iterator on purpose:
// we want to keep the `\n` and potential `\r` at the end of each line.
match file.read_line(&mut buf)? {
0 => break,
_ => {
let normalized = self.normalize(&buf)?;
let pre_tokenized = self.pre_tokenize(normalized.get())?; let pre_tokenized = self.pre_tokenize(normalized.get())?;
trainer.process_tokens( trainer.process_tokens(
&mut words, &mut words,
pre_tokenized.into_iter().map(|(t, _)| t).collect(), pre_tokenized.into_iter().map(|(t, _)| t).collect(),
); );
} }
}
}
Ok(words) Ok(words)
}) })