Adding a new test for PreTokenizer.custom.

This example is more illustrative of what's doable with a custom PreTokenizer.
2025-12-03 11:18:29 +00:00 · 2020-10-15 10:17:06 +02:00
parent 2fc0edda01
commit 2ccd16bf5c
1 changed files with 48 additions and 0 deletions
--- a/bindings/python/tests/bindings/test_pre_tokenizers.py
+++ b/bindings/python/tests/bindings/test_pre_tokenizers.py
@@ -148,3 +148,51 @@ class TestCustomPreTokenizer:
            ("Hey there!", (0, 10)),
            ("Hey there!", (0, 10)),
        ]
+
+    def test_camel_case(self):
+        # Checks a custom pre-tokenizer that splits camel-cased text.
+        class CamelCasePretok:
+            # Illustrative pre-tokenizer: a new piece starts on a character-class change.
+            def get_state(self, c):
+                # Map a character to "lower", "upper", "digit" or "rest".
+                if c.islower():
+                    return "lower"
+                if c.isupper():
+                    return "upper"
+                return "digit" if c.isdigit() else "rest"
+
+            def split(self, n, normalized):
+                # `n` is unused here; the result is a list of slices of `normalized`.
+                start = 0
+                previous = "any"  # sentinel until the first character is seen
+                chunks = []
+                for pos, char in enumerate(normalized.normalized):
+                    current = self.get_state(char)
+                    if previous == "any":
+                        previous = current
+                    # A run continues within one non-"rest" class, or upper -> lower.
+                    same_class = previous != "rest" and previous == current
+                    capitalized = previous == "upper" and current == "lower"
+                    if not (same_class or capitalized):
+                        chunks.append(normalized[start:pos])
+                        start = pos
+                    previous = current
+                chunks.append(normalized[start:])
+                return chunks
+
+            def pre_tokenize(self, pretok):
+                # Delegate the cutting to split().
+                pretok.split(self.split)
+
+        camel = PreTokenizer.custom(CamelCasePretok())
+
+        assert camel.pre_tokenize_str("HeyThere!?-ThisIsLife") == [
+            ("Hey", (0, 3)),
+            ("There", (3, 8)),
+            ("!", (8, 9)),
+            ("?", (9, 10)),
+            ("-", (10, 11)),
+            ("This", (11, 15)),
+            ("Is", (15, 17)),
+            ("Life", (17, 21)),
+        ]