From 66b6211705f65947d62ea313b938302d97b2928a Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Thu, 2 Jan 2020 14:16:14 -0500
Subject: [PATCH] NormalizedString - Fix added chars at beginning

---
Reviewer notes (free-form text in this area is not part of the commit):

* Hunk 1 hoists `offset += 1` out of the `else` arm so that it runs for every
  `Ordering::Greater` case (described by the surrounding comment as "a newly
  inserted character"), including the `idx < 1` case that previously returned
  `Some((0, 0))` without touching `offset`.
* Hunk 2 adds the `added_around_edges` test: it transforms "Hello" with one
  inserted ' ' before and one after, then checks that `normalized` is
  " Hello " and that `get_range_original` over the full normalized length
  yields `Some("Hello")`.
* NOTE(review): line breaks and leading indentation of the hunk lines were
  reconstructed from a whitespace-collapsed copy of this patch. If context
  lines fail to match, apply with `--ignore-whitespace` and confirm against
  the upstream commit.

 tokenizers/src/tokenizer/normalizer.rs | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 39398201..6a46c813 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -91,10 +91,10 @@ impl NormalizedString {
                     // This is a newly inserted character, so we use the alignment from the
                     // previous one
                     Ordering::Greater => {
+                        offset += 1;
                         if idx < 1 {
                             Some((0, 0))
                         } else {
-                            offset += 1;
                             self.alignments.get(idx - 1).copied()
                         }
                     }
@@ -330,4 +330,25 @@ mod tests {
         assert_eq!(world_n, "world");
         assert_eq!(world_o, "World");
     }
+
+    #[test]
+    fn added_around_edges() {
+        let mut n = NormalizedString::from("Hello");
+        n.transform(
+            vec![
+                (' ', 1),
+                ('H', 0),
+                ('e', 0),
+                ('l', 0),
+                ('l', 0),
+                ('o', 0),
+                (' ', 1),
+            ]
+            .into_iter(),
+            0,
+        );
+
+        assert_eq!(&n.normalized, " Hello ");
+        assert_eq!(n.get_range_original(0..n.normalized.len()), Some("Hello"));
+    }
 }