mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Update training to preserve newlines when reading input lines
This commit is contained in:
@ -411,17 +411,25 @@ impl Tokenizer {
|
|||||||
let mut words = HashMap::new();
|
let mut words = HashMap::new();
|
||||||
|
|
||||||
let file: std::fs::File = File::open(file)?;
|
let file: std::fs::File = File::open(file)?;
|
||||||
let file = BufReader::new(file);
|
let mut file = BufReader::new(file);
|
||||||
|
|
||||||
for line in file.lines() {
|
let mut buf = String::new();
|
||||||
let line = line?;
|
loop {
|
||||||
let normalized = self.normalize(&line)?;
|
buf.clear();
|
||||||
|
// We read new lines using this API instead of the Lines Iterator
|
||||||
|
// on purpose. We want to keep the `\n` and potential `\r` between each lines
|
||||||
|
match file.read_line(&mut buf)? {
|
||||||
|
0 => break,
|
||||||
|
_ => {
|
||||||
|
let normalized = self.normalize(&buf)?;
|
||||||
let pre_tokenized = self.pre_tokenize(normalized.get())?;
|
let pre_tokenized = self.pre_tokenize(normalized.get())?;
|
||||||
trainer.process_tokens(
|
trainer.process_tokens(
|
||||||
&mut words,
|
&mut words,
|
||||||
pre_tokenized.into_iter().map(|(t, _)| t).collect(),
|
pre_tokenized.into_iter().map(|(t, _)| t).collect(),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(words)
|
Ok(words)
|
||||||
})
|
})
|
||||||
|
Reference in New Issue
Block a user