diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index d81d3802..ea1b4954 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -421,8 +421,11 @@ class Split(PreTokenizer): Args: pattern (:obj:`str` or :class:`~tokenizers.Regex`): - A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex` - + A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. + If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`, + otherwise we consider it as a string pattern. For example `pattern="|"` + means you want to split on `|` (imagine a csv file for example), while + `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. behavior (:class:`~tokenizers.SplitDelimiterBehavior`): The behavior to use when splitting. Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 4b97319d..1873222e 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -334,8 +334,11 @@ impl PyWhitespaceSplit { /// /// Args: /// pattern (:obj:`str` or :class:`~tokenizers.Regex`): -/// A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex` -/// +/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. +/// If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`, +/// otherwise we consider it as a string pattern. For example `pattern="|"` +/// means you want to split on `|` (imagine a csv file for example), while +/// `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. 
/// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): /// The behavior to use when splitting. /// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",