From 805dc58949f443f17710708cfeb2342faa68c0d8 Mon Sep 17 00:00:00 2001
From: Anthony MOI <m.anthony.moi@gmail.com>
Date: Fri, 3 Jan 2020 20:23:58 -0500
Subject: [PATCH] Update training to include new lines

---
 tokenizers/src/tokenizer/mod.rs | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index ea156767..1e7bb159 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -411,16 +411,24 @@ impl Tokenizer {
         let mut words = HashMap::new();
 
         let file: std::fs::File = File::open(file)?;
-        let file = BufReader::new(file);
+        let mut file = BufReader::new(file);
 
-        for line in file.lines() {
-            let line = line?;
-            let normalized = self.normalize(&line)?;
-            let pre_tokenized = self.pre_tokenize(normalized.get())?;
-            trainer.process_tokens(
-                &mut words,
-                pre_tokenized.into_iter().map(|(t, _)| t).collect(),
-            );
+        let mut buf = String::new();
+        loop {
+            buf.clear();
+            // We read new lines using this API instead of the Lines Iterator
+            // on purpose. We want to keep the `\n` and potential `\r` between lines
+            match file.read_line(&mut buf)? {
+                0 => break,
+                _ => {
+                    let normalized = self.normalize(&buf)?;
+                    let pre_tokenized = self.pre_tokenize(normalized.get())?;
+                    trainer.process_tokens(
+                        &mut words,
+                        pre_tokenized.into_iter().map(|(t, _)| t).collect(),
+                    );
+                }
+            }
         }
 
         Ok(words)