From cb8d4de5994d825ea4260613888410ce17f605bf Mon Sep 17 00:00:00 2001
From: Chris Ha
Date: Wed, 7 Jun 2023 16:41:28 +0900
Subject: [PATCH] fix documentation regarding regex (#1264)

* fix documentation regarding regex

Split() in pre_tokenizers.rs and normalizations take a regex that is
required to be built with a tokenizer-specific regex module. Clarify
this in the documentation.

* Update __init__.pyi

fixed __init__.pyi

* Update bindings/python/py_src/tokenizers/__init__.pyi

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Update bindings/python/py_src/tokenizers/__init__.pyi

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Revert "Update bindings/python/py_src/tokenizers/__init__.pyi"

This reverts commit 6e8bdfcddf67bcdd8e3b1a78685fd5ef8f6a153c.

* Revert "Update bindings/python/py_src/tokenizers/__init__.pyi"

This reverts commit 897b0c0de471ad7cb6269b8456347c4e5cff2aaf.

* Revert "Update __init__.pyi"

This reverts commit fbe82310b7728ee7cdb6f8b38fbc2388f9d95771.

* add codeblocks the right way

* add codeblocks with stub.py

ran setup.py install to build, and then ran stub.py

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
---
 bindings/python/py_src/tokenizers/__init__.pyi                | 2 +-
 bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi | 2 +-
 bindings/python/src/pre_tokenizers.rs                         | 2 +-
 bindings/python/src/utils/normalization.rs                    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi
index cbf5624d..bc026c4e 100644
--- a/bindings/python/py_src/tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/__init__.pyi
@@ -494,7 +494,7 @@ class NormalizedString:
 
         Args:
             pattern: Pattern:
-                A pattern used to split the string. Usually a string or a Regex
+                A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
 
             behavior: SplitDelimiterBehavior:
                 The behavior to use when splitting.

diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index 020559ad..e3cb84dd 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -411,7 +411,7 @@ class Split(PreTokenizer):
 
     Args:
         pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-            A pattern used to split the string. Usually a string or a Regex
+            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
 
        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
            The behavior to use when splitting.

diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 18af23d5..244efdc5 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -325,7 +325,7 @@ impl PyWhitespaceSplit {
 ///
 /// Args:
 ///     pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-///         A pattern used to split the string. Usually a string or a Regex
+///         A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
 ///
 ///     behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
 ///         The behavior to use when splitting.
diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs
index 4bcd50fb..af696c63 100644
--- a/bindings/python/src/utils/normalization.rs
+++ b/bindings/python/src/utils/normalization.rs
@@ -318,7 +318,7 @@ impl PyNormalizedString {
 ///
 /// Args:
 ///     pattern: Pattern:
-///         A pattern used to split the string. Usually a string or a Regex
+///         A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`
 ///
 ///     behavior: SplitDelimiterBehavior:
 ///         The behavior to use when splitting.
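
For context, a minimal usage sketch of the distinction these docstrings now draw: a plain string pattern is matched literally, while a regex pattern must be built with `tokenizers.Regex` (a pattern compiled with Python's `re` module is not accepted). Only the public API touched by this patch is used; the printed offsets are illustrative.

    from tokenizers import Regex
    from tokenizers.pre_tokenizers import Split

    # A plain str pattern is treated as a literal: this splits on the
    # two-character substring "\d", not on digits.
    literal_split = Split(pattern=r"\d", behavior="removed")

    # To split on an actual regular expression, wrap the pattern with
    # tokenizers.Regex.
    digit_split = Split(pattern=Regex(r"\d+"), behavior="removed")

    print(digit_split.pre_tokenize_str("ab12cd34"))
    # -> [('ab', (0, 2)), ('cd', (4, 6))]

`NormalizedString.split` (the normalization.rs hunk above) follows the same rule for its `pattern` argument.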