mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
fix documentation regarding regex (#1264)
* fix documentation regarding regex Split() in pre_tokenizers.rs and normalizations take a regex that is required to be built with a tokenizer specific regex module. Clarify this in the documentation. * Update __init__.pyi fixed __init__.pyi * Update bindings/python/py_src/tokenizers/__init__.pyi Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update bindings/python/py_src/tokenizers/__init__.pyi Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Revert "Update bindings/python/py_src/tokenizers/__init__.pyi" This reverts commit 6e8bdfcddf67bcdd8e3b1a78685fd5ef8f6a153c. * Revert "Update bindings/python/py_src/tokenizers/__init__.pyi" This reverts commit 897b0c0de471ad7cb6269b8456347c4e5cff2aaf. * Revert "Update __init__.pyi" This reverts commit fbe82310b7728ee7cdb6f8b38fbc2388f9d95771. * add codeblocks the right way * add codeblocks with stub.py ran setup.py install to build, and then ran stub.py --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
@ -494,7 +494,7 @@ class NormalizedString:
|
||||
|
||||
Args:
|
||||
pattern: Pattern:
|
||||
A pattern used to split the string. Usually a string or a Regex
|
||||
A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
|
||||
|
||||
behavior: SplitDelimiterBehavior:
|
||||
The behavior to use when splitting.
|
||||
|
@ -411,7 +411,7 @@ class Split(PreTokenizer):
|
||||
|
||||
Args:
|
||||
pattern (:obj:`str` or :class:`~tokenizers.Regex`):
|
||||
A pattern used to split the string. Usually a string or a Regex
|
||||
A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
|
||||
|
||||
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||
The behavior to use when splitting.
|
||||
|
@ -325,7 +325,7 @@ impl PyWhitespaceSplit {
|
||||
///
|
||||
/// Args:
|
||||
/// pattern (:obj:`str` or :class:`~tokenizers.Regex`):
|
||||
/// A pattern used to split the string. Usually a string or a Regex
|
||||
/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
|
||||
///
|
||||
/// behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||
/// The behavior to use when splitting.
|
||||
|
@ -318,7 +318,7 @@ impl PyNormalizedString {
|
||||
///
|
||||
/// Args:
|
||||
/// pattern: Pattern:
|
||||
/// A pattern used to split the string. Usually a string or a Regex
|
||||
/// A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
|
||||
///
|
||||
/// behavior: SplitDelimiterBehavior:
|
||||
/// The behavior to use when splitting.
|
||||
|
Reference in New Issue
Block a user