mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Update training to preserve newlines when reading input lines
This commit is contained in:
@ -411,17 +411,25 @@ impl Tokenizer {
|
|||||||
let mut words = HashMap::new();
|
let mut words = HashMap::new();
|
||||||
|
|
||||||
let file: std::fs::File = File::open(file)?;
|
let file: std::fs::File = File::open(file)?;
|
||||||
let file = BufReader::new(file);
|
let mut file = BufReader::new(file);
|
||||||
|
|
||||||
for line in file.lines() {
|
let mut buf = String::new();
|
||||||
let line = line?;
|
loop {
|
||||||
let normalized = self.normalize(&line)?;
|
buf.clear();
|
||||||
|
// We read new lines using this API instead of the Lines Iterator
|
||||||
|
// on purpose. We want to keep the `\n` and potential `\r` between each lines
|
||||||
|
match file.read_line(&mut buf)? {
|
||||||
|
0 => break,
|
||||||
|
_ => {
|
||||||
|
let normalized = self.normalize(&buf)?;
|
||||||
let pre_tokenized = self.pre_tokenize(normalized.get())?;
|
let pre_tokenized = self.pre_tokenize(normalized.get())?;
|
||||||
trainer.process_tokens(
|
trainer.process_tokens(
|
||||||
&mut words,
|
&mut words,
|
||||||
pre_tokenized.into_iter().map(|(t, _)| t).collect(),
|
pre_tokenized.into_iter().map(|(t, _)| t).collect(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(words)
|
Ok(words)
|
||||||
})
|
})
|
||||||
|
Reference in New Issue
Block a user