From eea8e1ae6fc4a281513a536d8d4e156ef3b8fb2f Mon Sep 17 00:00:00 2001
From: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Date: Wed, 7 Aug 2024 12:35:01 +0200
Subject: [PATCH] Fix doc about split (#1591)

* update doc

* add example

* Update bindings/python/src/pre_tokenizers.rs

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

* stub

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
---
 .../python/py_src/tokenizers/pre_tokenizers/__init__.pyi   | 7 +++++--
 bindings/python/src/pre_tokenizers.rs                      | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index d81d3802..ea1b4954 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -421,8 +421,11 @@ class Split(PreTokenizer):
 
     Args:
         pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-            A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
-
+            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
+            If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
+            otherwise we consider it as a string pattern. For example `pattern="|"`
+            means you want to split on `|` (imagine a CSV file for example), while
+            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
         behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
             The behavior to use when splitting.
             Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 4b97319d..1873222e 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -334,8 +334,11 @@ impl PyWhitespaceSplit {
 ///
 /// Args:
 ///     pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-///         A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex`
-///
+///         A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
+///         If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`,
+///         otherwise we consider it as a string pattern. For example `pattern="|"`
+///         means you want to split on `|` (imagine a CSV file for example), while
+///         `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
 ///     behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
 ///         The behavior to use when splitting.
 ///         Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",