diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 823f46a6..d15093fb 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -517,6 +517,9 @@ impl NormalizedString {
         if let Some((b, prev)) = self.normalized.char_indices().last() {
             let transformations = std::iter::once((prev, 0)).chain(s.chars().map(|c| (c, 1)));
             self.transform_range(Range::Normalized(b..), transformations, 0);
+        } else { // `normalized` is empty: no last char, hence no `prev` to re-emit
+            let transformations = s.chars().map(|c| (c, 1)); // same `(c, 1)` pairs as above
+            self.transform_range(Range::Normalized(..), transformations, 0); // unbounded range
         }
         self
     }
@@ -2284,4 +2287,24 @@ mod tests {
         s.lowercase();
         assert_eq!(s.get(), "a...");
     }
+
+    #[test]
+    fn test_append_after_clear() { // regression test: append() onto a cleared string
+        let mut n = NormalizedString::from("Hello");
+        assert_eq!(n.get(), "Hello");
+
+        n.clear(); // normalized text becomes "" (checked on the next line)
+        assert_eq!(n.get(), "");
+
+        n.append(" World"); // append while the normalized text is empty
+        assert_eq!(n.get(), " World");
+
+        assert_eq!(n.len_original(), 5); // still the length of "Hello"
+        assert_eq!(n.len(), 6); // length of " World"
+
+        assert_eq!(n.get_range_original(Range::Original(0..5)), Some("Hello"));
+        assert_eq!(n.get_range_original(Range::Normalized(0..6)), Some("")); // no original text
+
+        assert_eq!(n.get_range(Range::Normalized(0..6)), Some(" World"));
+    }
 }