From eea8e1ae6fc4a281513a536d8d4e156ef3b8fb2f Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 7 Aug 2024 12:35:01 +0200 Subject: [PATCH] Fix doc about split (#1591) * update doc * add example * Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry * stub --------- Co-authored-by: Nicolas Patry --- .../python/py_src/tokenizers/pre_tokenizers/__init__.pyi | 7 +++++-- bindings/python/src/pre_tokenizers.rs | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index d81d3802..ea1b4954 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -421,8 +421,11 @@ class Split(PreTokenizer): Args: pattern (:obj:`str` or :class:`~tokenizers.Regex`): - A pattern used to split the string. Usually a string or a a regex built with `tokenizers.Regex` - + A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. + If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`, + otherwise we consider it as a string pattern. For example `pattern="|"` + means you want to split on `|` (imagine a csv file for example), while + `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. behavior (:class:`~tokenizers.SplitDelimiterBehavior`): The behavior to use when splitting. Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 4b97319d..1873222e 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -334,8 +334,11 @@ impl PyWhitespaceSplit { /// /// Args: /// pattern (:obj:`str` or :class:`~tokenizers.Regex`): -/// A pattern used to split the string. 
Usually a string or a a regex built with `tokenizers.Regex` -/// +/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`. +/// If you want to use a regex pattern, it has to be wrapped in a `tokenizers.Regex`, +/// otherwise we consider it as a string pattern. For example `pattern="|"` +/// means you want to split on `|` (imagine a csv file for example), while +/// `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'. /// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): /// The behavior to use when splitting. /// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",