diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index 1585da76..9cdd6213 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -462,7 +462,11 @@ impl BPE {
     fn tokenize_with_cache(&self, sequence: &str) -> Result<Vec<Token>> {
         if self.ignore_merges {
             if let Some(id) = self.vocab.get(sequence) {
-                return Ok(vec![Token::new(*id, sequence.to_string().clone(), (0, 0))]);
+                return Ok(vec![Token::new(
+                    *id,
+                    sequence.to_string().clone(),
+                    (0, sequence.len()),
+                )]);
             }
         }
         if let Some(ref hit) = self.cache.as_ref().and_then(|c| c.get(sequence)) {
@@ -941,10 +945,13 @@ mod tests {
             .build()
             .unwrap();
         let tokens = bpe.tokenize(".:.:").unwrap();
-        assert_eq!(tokens, vec![Token::new(0u32, ".:.:".into(), (0, 0))]);
+        assert_eq!(tokens, vec![Token::new(0u32, ".:.:".into(), (0, 4))]);

         let tokens = bpe.tokenize("Ġbelirtilen").unwrap();
-        assert_eq!(tokens, vec![Token::new(1u32, "Ġbelirtilen".into(), (0, 0))]);
+        assert_eq!(
+            tokens,
+            vec![Token::new(1u32, "Ġbelirtilen".into(), (0, 12))]
+        );
         bpe.ignore_merges = false;

         let tokens = bpe.tokenize(".:.:").unwrap();
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index d92e04a1..49bc539a 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -1181,11 +1181,10 @@ where
         };

         trainer.feed(
-            sequences.map(|s| {
+            sequences.inspect(|s| {
                 if let Some(progress) = &progress {
                     progress.inc(s.len() as u64)
                 }
-                s
             }),
             |seq| {
                 let normalized = self.do_normalize(seq.as_ref())?;
@@ -1233,11 +1232,10 @@ where
         };

         trainer.feed(
-            sequences.map(|s| {
+            sequences.inspect(|_s| {
                 if let Some(progress) = &progress {
                     progress.inc(1)
                 }
-                s
             }),
             |seq| {
                 let normalized = self.do_normalize(seq.as_ref())?;
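
For reference, a quick standalone check (not part of the patch) of why the expected end offsets in the updated tests are 4 and 12: the Token offsets here are byte ranges, and 'Ġ' (U+0120) encodes to two bytes in UTF-8, so "Ġbelirtilen" spans 12 bytes even though it is 11 characters.

fn main() {
    // ".:.:" is pure ASCII: 4 characters, 4 bytes.
    assert_eq!(".:.:".len(), 4);
    // 'Ġ' (U+0120) is 2 bytes in UTF-8, plus 10 ASCII bytes for "belirtilen".
    assert_eq!("Ġbelirtilen".len(), 12);
    assert_eq!("Ġbelirtilen".chars().count(), 11);
}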
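
The tokenizer/mod.rs hunks replace map closures that only performed a side effect and then returned their input with Iterator::inspect, which hands each item to the closure by reference and yields it unchanged, so the trailing `s` is no longer needed. A minimal sketch of the pattern, using a hypothetical byte counter as a stand-in for the trainer's progress bar:

fn main() {
    let mut fed_bytes = 0u64;
    let total: u64 = ["ab", "cde"]
        .iter()
        // Side effect only: inspect passes each item through untouched.
        .inspect(|s| fed_bytes += s.len() as u64)
        .map(|s| s.len() as u64)
        .sum();
    assert_eq!(fed_bytes, 5);
    assert_eq!(total, 5);
}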