mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-08 05:38:23 +00:00
Split Pre-Tokenizer (#542)
* start playing around * make a first version * refactor * apply make format * add python bindings * add some python binding tests * correct pre-tokenizers * update auto-generated bindings * lint python bindings * add code node * add split to docs * refactor python binding a bit * cargo fmt * clippy and fmt in node * quick updates and fixes * Oops * Update node typings * Update changelog Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
committed by
GitHub
parent
58e1d8de67
commit
dd399d2ad0
@@ -9,6 +9,7 @@ Digits = pre_tokenizers.Digits
|
||||
# Module-level re-exports: flatten the `pre_tokenizers` namespace so each
# pre-tokenizer class is importable directly from this module.
# NOTE(review): per the commit message these bindings are auto-generated —
# presumably mirroring the native (Rust) extension; confirm before hand-editing.
Metaspace = pre_tokenizers.Metaspace
Punctuation = pre_tokenizers.Punctuation
Sequence = pre_tokenizers.Sequence
Split = pre_tokenizers.Split
UnicodeScripts = pre_tokenizers.UnicodeScripts
Whitespace = pre_tokenizers.Whitespace
WhitespaceSplit = pre_tokenizers.WhitespaceSplit
@@ -392,6 +392,40 @@ class Sequence(PreTokenizer):
|
||||
"""
|
||||
pass
|
||||
|
||||
class Split(PreTokenizer):
    """
    Split PreTokenizer

    This versatile pre-tokenizer splits using the provided pattern and
    according to the provided behavior. The pattern can be inverted by
    making use of the invert flag.

    Args:
        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
            A pattern used to split the string. Usually a string or a Regex

        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
            The behavior to use when splitting.
            Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
            "contiguous"

        invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to invert the pattern.
    """

    # NOTE(review): auto-generated binding stub (the commit message says
    # "update auto-generated bindings") — the bodies are placeholders and the
    # real implementation presumably lives in the native extension. Do not
    # hand-edit the signatures; regenerate instead.
    def __init__(self, pattern, behavior, invert=False):
        pass

    def pre_tokenize(self, pretok):
        """
        Pre tokenize the given PreTokenizedString in-place
        """
        pass

    def pre_tokenize_str(self, sequence):
        """
        Pre tokenize the given sequence
        """
        pass
|
||||
class UnicodeScripts(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer splits on characters that belong to different language family
|
||||
|
||||
Reference in New Issue
Block a user