Fix doc about split (#1591)

* update doc * add example * Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> * stub --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
2025-12-03 03:08:21 +00:00 · 2024-08-07 12:35:01 +02:00
parent 6a5fce9fa0
commit eea8e1ae6f
2 changed files with 10 additions and 4 deletions
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/init.pyi
@@ -421,8 +421,11 @@ class Split(PreTokenizer):

    Args:
        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-            A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
-
+            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
+            If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
+            otherwise we consider it as a string pattern. For example `pattern="|"`
+            means you want to split on `|` (imagine a csv file for example), while
+            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
            The behavior to use when splitting.
            Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -334,8 +334,11 @@ impl PyWhitespaceSplit {
 ///
 /// Args:
 ///     pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-///         A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
-///
+///         A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
+///         If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
+///         otherwise we consider it as a string pattern. For example `pattern="|"`
+///         means you want to split on `|` (imagine a csv file for example), while
+///         `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
 ///     behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
 ///         The behavior to use when splitting.
 ///         Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",