Fix stripping strings containing Unicode characters (#707)

* Strip seems to have been broken for a while on Unicode strings. - Includes a failing test + the fix. - This function could maybe be optimized: we're now scanning the string 3 times, plus once fully for chars. * Update CHANGELOG.md Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
2025-12-12 05:18:39 +00:00 · 2021-05-24 22:49:59 +02:00
parent 4b7f8c2d7c
commit c046da7679
2 changed files with 15 additions and 1 deletions
--- a/bindings/python/CHANGELOG.md
+++ b/bindings/python/CHANGELOG.md
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed
 - [#686]: Fix SPM conversion process for whitespace deduplication
+- [#707]: Fix stripping strings containing Unicode characters

 ### Added
 - [#693]: Add a CTC Decoder for Wave2Vec models
@@ -317,6 +318,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug that was causing crashes in Python 3.5


+[#707]: https://github.com/huggingface/tokenizers/pull/707
 [#693]: https://github.com/huggingface/tokenizers/pull/693
 [#686]: https://github.com/huggingface/tokenizers/pull/686
 [#674]: https://github.com/huggingface/tokenizers/pull/674
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -736,12 +736,13 @@ impl NormalizedString {
        };

        if leading_spaces > 0 || trailing_spaces > 0 {
+            let count = self.get().chars().count();
            let transformation = self
                .normalized
                .chars()
                .enumerate()
                .filter_map(|(i, c)| {
-                    if i < leading_spaces || i >= self.len() - trailing_spaces {
+                    if i < leading_spaces || i >= count - trailing_spaces {
                        None
                    } else if i == self.len() - trailing_spaces - 1 {
                        Some((c, -(trailing_spaces as isize)))
@@ -1274,6 +1275,17 @@ mod tests {
        );
    }

+    #[test]
+    fn strip_unicode() {
+        let mut n = NormalizedString::from("  你好asa \n");
+        n.strip();
+        assert_eq!(&n.normalized, "你好asa");
+        assert_eq!(
+            n.get_range_original(Range::Normalized(0..n.normalized.len())),
+            Some("你好asa")
+        );
+    }
+
    #[test]
    fn prepend() {
        let mut n = NormalizedString::from("there");