diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 7865ebda..fab9aaaf 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -247,8 +247,7 @@ impl Tokenizer {
         // If this is one of our added tokens, lets return an encoding directly
         if let Some(id) = id {
             return Ok(Encoding::new(
-                sentence.clone(),
-                sentence.clone(),
+                NormalizedString::from(&sentence),
                 vec![id],
                 vec![type_id],
                 vec![sentence.to_owned()],
@@ -260,13 +259,10 @@ impl Tokenizer {
         }
 
         // 1. Normalization
-        // TODO: Make sure we have the offsets update necessary to go from the original text to
-        // the normalized one
-        let original = sentence.clone();
-        let normalized = self.normalize(sentence)?;
+        let normalized = self.normalize(&sentence)?;
 
         // 2. Pre tokenization
-        let pre_tokenized = self.pre_tokenize(&normalized)?;
+        let pre_tokenized = self.pre_tokenize(&normalized.get())?;
 
         // 3. Model
         let output = self.model.tokenize(pre_tokenized)?;
@@ -287,7 +283,6 @@ impl Tokenizer {
         );
 
         Ok(Encoding::new(
-            original,
             normalized,
             ids,
             vec![type_id; length],
@@ -395,9 +390,12 @@ impl Tokenizer {
 
         for line in file.lines() {
             let line = line?;
-            let normalized = self.normalize(line)?;
-            let pre_tokenized = self.pre_tokenize(&normalized)?;
-            trainer.process_tokens(&mut words, pre_tokenized);
+            let normalized = self.normalize(&line)?;
+            let pre_tokenized = self.pre_tokenize(normalized.get())?;
+            trainer.process_tokens(
+                &mut words,
+                pre_tokenized.into_iter().map(|(t, _)| t).collect(),
+            );
         }
 
         Ok(words)
@@ -420,20 +418,22 @@ impl Tokenizer {
     }
 
     /// PreTokenization logic, handling the case where there is no PreTokenizer set
-    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<String>> {
+    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<(String, Offsets)>> {
         match &self.pre_tokenizer {
-            None => Ok(vec![sentence.to_owned()]),
+            None => Ok(vec![(sentence.to_owned(), (0, sentence.len()))]),
             Some(pre_tokenizer) => pre_tokenizer.pre_tokenize(sentence),
         }
     }
 
     /// Normalization logic, go through all normalizers
-    fn normalize(&self, sentence: String) -> Result<String> {
+    fn normalize(&self, sequence: &str) -> Result<NormalizedString> {
+        let mut normalized = NormalizedString::from(sequence);
+
         if let Some(normalizer) = &self.normalizer {
-            normalizer.normalize(sentence)
-        } else {
-            Ok(sentence)
+            normalizer.normalize(&mut normalized);
         }
+
+        Ok(normalized)
     }
 
     /// Post processing logic, handling the case where there is no PostProcessor set