mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00

Automatically stubbing the pyi files while keeping inspecting ability (#509)

* First pass on automatic stubbing of our Python files.
* And now modifying all Rust docs to be visible in the .pyi files.
* Better assert-fail message.
* Fixing the GitHub workflow.
* Removing types not exported anymore.
* Fixing the `Tokenizer` signature.
* Disabling auto __init__.py.
* Re-enabling some types.
* Don't overwrite non-automated __init__.py.
* Automated most __init__.py.
* Re-stubbing after rebase.
* Fixing the env for tests.
* Install black in the env.
* Use the PY35 target in stub.py.

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
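For illustration, this is roughly the shape of stub entry such a generator would emit for the Normalizer class touched in the diff below. This is a sketch reconstructed from the docstrings and signatures visible in this diff; the exact output of stub.py may differ.

# Illustrative .pyi stub sketch, derived from the Rust docs in this diff
class Normalizer:
    """Base class for all normalizers

    This class is not supposed to be instantiated directly. Instead, any
    implementation of a Normalizer will return an instance of this class
    when instantiated.
    """
    def normalize(self, normalized):
        """Normalize the given NormalizedString in-place"""
        pass
    def normalize_str(self, sequence):
        """Normalize the given str"""
        pass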
@@ -15,6 +15,10 @@ use tk::normalizers::{
use tk::{NormalizedString, Normalizer};
use tokenizers as tk;

/// Base class for all normalizers
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Normalizer will return an instance of this class when instantiated.
#[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)]
#[derive(Clone, Serialize, Deserialize)]
pub struct PyNormalizer {
@@ -105,10 +109,14 @@ impl PyNormalizer {
    }
}

    /// Normalize the given NormalizedString in-place
    #[text_signature = "(self, normalized)"]
    fn normalize(&self, normalized: &mut PyNormalizedString) -> PyResult<()> {
        ToPyResult(self.normalizer.normalize(&mut normalized.normalized)).into()
    }

    /// Normalize the given str
    #[text_signature = "(self, sequence)"]
    fn normalize_str(&self, sequence: &str) -> PyResult<String> {
        let mut normalized = NormalizedString::from(sequence);
        ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
@@ -116,7 +124,30 @@ impl PyNormalizer {
    }
}
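As a reading aid for the two methods above, here is a minimal sketch of how they surface on the Python side, using the Lowercase normalizer defined later in this file (the base class is not instantiated directly):

from tokenizers.normalizers import Lowercase

norm = Lowercase()
# normalize_str wraps the str in a NormalizedString, runs normalize on it,
# and returns the resulting str
print(norm.normalize_str("Hello WORLD"))  # -> "hello world"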

/// BertNormalizer
///
/// Takes care of normalizing raw text before giving it to a Bert model.
/// This includes cleaning the text, handling accents, Chinese chars, and lowercasing.
///
/// Args:
///     clean_text: (`optional`) boolean:
///         Whether to clean the text by removing any control characters
///         and replacing all whitespace characters with the classic one.
///
///     handle_chinese_chars: (`optional`) boolean:
///         Whether to handle Chinese chars by putting spaces around them.
///
///     strip_accents: (`optional`) boolean:
///         Whether to strip all accents. If this option is not specified (i.e. == None),
///         then it will be determined by the value of `lowercase` (as in the original Bert).
///
///     lowercase: (`optional`) boolean:
///         Whether to lowercase.
///
/// Returns:
///     Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
#[text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"]
pub struct PyBertNormalizer {}
#[pymethods]
impl PyBertNormalizer {
@@ -146,7 +177,9 @@ impl PyBertNormalizer {
    }
}
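A short usage sketch of the arguments documented above. The output reflects what the documented defaults imply: with strip_accents=None and lowercase=True, accents get stripped.

from tokenizers.normalizers import BertNormalizer

norm = BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=None,  # None: falls back to the value of `lowercase`
    lowercase=True,
)
print(norm.normalize_str("Héllò hôw are ü?"))  # -> "hello how are u?"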

/// NFD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFD)]
#[text_signature = "(self)"]
pub struct PyNFD {}
#[pymethods]
impl PyNFD {
@@ -156,7 +189,9 @@ impl PyNFD {
    }
}

/// NFKD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKD)]
#[text_signature = "(self)"]
pub struct PyNFKD {}
#[pymethods]
impl PyNFKD {
@@ -166,7 +201,9 @@ impl PyNFKD {
    }
}

/// NFC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFC)]
#[text_signature = "(self)"]
pub struct PyNFC {}
#[pymethods]
impl PyNFC {
@@ -176,7 +213,9 @@ impl PyNFC {
    }
}

/// NFKC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKC)]
#[text_signature = "(self)"]
pub struct PyNFKC {}
#[pymethods]
impl PyNFKC {
@@ -186,6 +225,12 @@ impl PyNFKC {
    }
}
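The four normalizers above map directly onto the standard Unicode normalization forms; a quick round-trip sketch:

from tokenizers.normalizers import NFC, NFD

precomposed = "\u00e9"                         # "é" as a single code point
decomposed = NFD().normalize_str(precomposed)  # "e" + combining acute accent
assert len(decomposed) == 2
assert NFC().normalize_str(decomposed) == precomposed  # NFC recomposes it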

/// Allows concatenating multiple other Normalizers as a Sequence.
/// All the normalizers run in sequence, in the given order.
///
/// Args:
///     normalizers: List[Normalizer]:
///         A list of Normalizers to be run as a sequence
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)]
pub struct PySequence {}
#[pymethods]
@@ -211,7 +256,9 @@ impl PySequence {
    }
}
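A typical use of Sequence, chaining normalizers so that NFD exposes combining marks for StripAccents before lowercasing:

from tokenizers.normalizers import NFD, StripAccents, Lowercase, Sequence

norm = Sequence([NFD(), StripAccents(), Lowercase()])
print(norm.normalize_str("Héllò"))  # -> "hello"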

/// Lowercase Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Lowercase)]
#[text_signature = "(self)"]
pub struct PyLowercase {}
#[pymethods]
impl PyLowercase {
@@ -221,7 +268,9 @@ impl PyLowercase {
    }
}

/// Strip normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Strip)]
#[text_signature = "(self, left=True, right=True)"]
pub struct PyStrip {}
#[pymethods]
impl PyStrip {
@@ -245,6 +294,7 @@ impl PyStrip {
    }
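Per the text_signature above, the two flags choose which side of the string to strip; a quick sketch:

from tokenizers.normalizers import Strip

print(repr(Strip(left=False, right=True).normalize_str("  hi  ")))  # '  hi'
print(repr(Strip().normalize_str("  hi  ")))                        # 'hi' (both sides by default)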

#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
#[text_signature = "(self)"]
pub struct PyStripAccents {}
#[pymethods]
impl PyStripAccents {
@@ -389,7 +439,9 @@ impl Normalizer for PyNormalizerWrapper {
    }
}

/// Nmt normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)]
#[text_signature = "(self)"]
pub struct PyNmt {}
#[pymethods]
impl PyNmt {
@@ -399,7 +451,10 @@ impl PyNmt {
    }
}

/// Precompiled normalizer
/// Don't use manually; it is used for compatibility with SentencePiece.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)]
#[text_signature = "(self, precompiled_charsmap)"]
pub struct PyPrecompiled {}
#[pymethods]
impl PyPrecompiled {
@@ -420,7 +475,9 @@ impl PyPrecompiled {
    }
}

/// Replace normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
#[text_signature = "(self, pattern, content)"]
pub struct PyReplace {}
#[pymethods]
impl PyReplace {
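The diff cuts off here, but the text_signature above is enough for a usage sketch: Replace substitutes every occurrence of `pattern` (a plain string matches literally) with `content`:

from tokenizers.normalizers import Replace

norm = Replace("``", '"')
print(norm.normalize_str("``quoted``"))  # -> '"quoted"'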