Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Python - Improve normalizers docs
@@ -109,13 +109,35 @@ impl PyNormalizer {
         }
     }

-    /// Normalize the given NormalizedString in-place
+    /// Normalize a :class:`~tokenizers.NormalizedString` in-place
+    ///
+    /// This method allows you to modify a :class:`~tokenizers.NormalizedString` to
+    /// keep track of the alignment information. If you just want to see the result
+    /// of the normalization on a raw string, you can use
+    /// :meth:`~tokenizers.normalizers.Normalizer.normalize_str`
+    ///
+    /// Args:
+    ///     normalized (:class:`~tokenizers.NormalizedString`):
+    ///         The normalized string on which to apply this
+    ///         :class:`~tokenizers.normalizers.Normalizer`
     #[text_signature = "(self, normalized)"]
     fn normalize(&self, normalized: &mut PyNormalizedString) -> PyResult<()> {
         ToPyResult(self.normalizer.normalize(&mut normalized.normalized)).into()
     }

-    /// Normalize the given str
+    /// Normalize the given string
+    ///
+    /// This method provides a way to visualize the effect of a
+    /// :class:`~tokenizers.normalizers.Normalizer` but it does not keep track of the alignment
+    /// information. If you need to get/convert offsets, you can use
+    /// :meth:`~tokenizers.normalizers.Normalizer.normalize`
+    ///
+    /// Args:
+    ///     sequence (:obj:`str`):
+    ///         A string to normalize
+    ///
+    /// Returns:
+    ///     :obj:`str`: A string after normalization
     #[text_signature = "(self, sequence)"]
     fn normalize_str(&self, sequence: &str) -> PyResult<String> {
         let mut normalized = NormalizedString::from(sequence);
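The distinction documented above can be sketched from the Python side (a hypothetical snippet, assuming a recent `tokenizers` release that exposes `NormalizedString` and its `.normalized` accessor; outputs in comments are approximate):

    from tokenizers import NormalizedString
    from tokenizers.normalizers import BertNormalizer

    normalizer = BertNormalizer()

    # normalize_str: convenient for inspecting the result, alignment info is discarded
    print(normalizer.normalize_str("Héllò hôw are ü?"))  # roughly "hello how are u?"

    # normalize: mutates a NormalizedString in-place, keeping alignment information
    n = NormalizedString("Héllò hôw are ü?")
    normalizer.normalize(n)
    print(n.normalized)  # same text, but offsets back to the original remain available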
@@ -130,47 +152,37 @@ impl PyNormalizer {
 /// This includes cleaning the text, handling accents, Chinese chars and lowercasing
 ///
 /// Args:
-///     clean_text: (`optional`) boolean:
+///     clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
 ///         Whether to clean the text, by removing any control characters
 ///         and replacing all whitespaces with the classic one.
 ///
-///     handle_chinese_chars: (`optional`) boolean:
+///     handle_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
 ///         Whether to handle Chinese chars by putting spaces around them.
 ///
-///     strip_accents: (`optional`) boolean:
+///     strip_accents (:obj:`bool`, `optional`):
 ///         Whether to strip all accents. If this option is not specified (i.e. == None),
 ///         then it will be determined by the value for `lowercase` (as in the original BERT).
 ///
-///     lowercase: (`optional`) boolean:
+///     lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
 ///         Whether to lowercase.
-///
-/// Returns:
-///     Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
+#[text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"]
 pub struct PyBertNormalizer {}
 #[pymethods]
 impl PyBertNormalizer {
     #[new]
-    #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyNormalizer)> {
-        let mut clean_text = true;
-        let mut handle_chinese_chars = true;
-        let mut strip_accents = None;
-        let mut lowercase = true;
-
-        if let Some(kwargs) = kwargs {
-            for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
-                    "clean_text" => clean_text = value.extract()?,
-                    "handle_chinese_chars" => handle_chinese_chars = value.extract()?,
-                    "strip_accents" => strip_accents = value.extract()?,
-                    "lowercase" => lowercase = value.extract()?,
-                    _ => println!("Ignored unknown kwargs option {}", key),
-                }
-            }
-        }
+    #[args(
+        clean_text = "true",
+        handle_chinese_chars = "true",
+        strip_accents = "None",
+        lowercase = "true"
+    )]
+    fn new(
+        clean_text: bool,
+        handle_chinese_chars: bool,
+        strip_accents: Option<bool>,
+        lowercase: bool,
+    ) -> PyResult<(Self, PyNormalizer)> {
         let normalizer =
             BertNormalizer::new(clean_text, handle_chinese_chars, strip_accents, lowercase);
         Ok((PyBertNormalizer {}, normalizer.into()))
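A usage sketch of the defaults documented above (same assumptions as the previous snippet; the comment shows expected behaviour, not verified output):

    from tokenizers.normalizers import BertNormalizer

    # With lowercase=False, strip_accents=None defers to the lowercase value,
    # so accents are kept too (mirroring the original BERT behaviour)
    normalizer = BertNormalizer(lowercase=False)
    print(normalizer.normalize_str("Héllò, WORLD!"))  # roughly "Héllò, WORLD!" unchanged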
@@ -229,7 +241,7 @@ impl PyNFKC {
 /// All the normalizers run in sequence in the given order
 ///
 /// Args:
-///     normalizers: List[Normalizer]:
+///     normalizers (:obj:`List[Normalizer]`):
 ///         A list of Normalizer to be run as a sequence
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)]
 pub struct PySequence {}
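A common composition, sketched under the same assumptions (NFD, StripAccents and Lowercase are sibling normalizers in `tokenizers.normalizers`):

    from tokenizers.normalizers import Sequence, NFD, StripAccents, Lowercase

    # Runs each normalizer in order over the same underlying NormalizedString
    normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    print(normalizer.normalize_str("Héllò"))  # roughly "hello"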
@@ -275,24 +287,13 @@ pub struct PyStrip {}
 #[pymethods]
 impl PyStrip {
     #[new]
-    #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyNormalizer)> {
-        let mut left = true;
-        let mut right = true;
-
-        if let Some(kwargs) = kwargs {
-            if let Some(l) = kwargs.get_item("left") {
-                left = l.extract()?;
-            }
-            if let Some(r) = kwargs.get_item("right") {
-                right = r.extract()?;
-            }
-        }
-
+    #[args(left = "true", right = "true")]
+    fn new(left: bool, right: bool) -> PyResult<(Self, PyNormalizer)> {
         Ok((PyStrip {}, Strip::new(left, right).into()))
     }
 }

 /// StripAccents normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
+#[text_signature = "(self)"]
 pub struct PyStripAccents {}
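The keyword arguments surfaced by the new `#[args]` signature can be sketched as (hypothetical snippet, same assumptions):

    from tokenizers.normalizers import Strip

    # Both sides are stripped by default; each side can be toggled independently
    print(repr(Strip(left=False).normalize_str("  hi  ")))  # roughly "'  hi'"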
@@ -304,6 +305,57 @@ impl PyStripAccents {
     }
 }

+/// Nmt normalizer
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)]
+#[text_signature = "(self)"]
+pub struct PyNmt {}
+#[pymethods]
+impl PyNmt {
+    #[new]
+    fn new() -> PyResult<(Self, PyNormalizer)> {
+        Ok((PyNmt {}, Nmt.into()))
+    }
+}
+
+/// Precompiled normalizer
+/// Don't use it manually; it is used for compatibility with SentencePiece.
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)]
+#[text_signature = "(self, precompiled_charsmap)"]
+pub struct PyPrecompiled {}
+#[pymethods]
+impl PyPrecompiled {
+    #[new]
+    fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
+        let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
+        Ok((
+            PyPrecompiled {},
+            Precompiled::from(precompiled_charsmap)
+                .map_err(|e| {
+                    exceptions::PyException::new_err(format!(
+                        "Error while attempting to build Precompiled normalizer: {}",
+                        e
+                    ))
+                })?
+                .into(),
+        ))
+    }
+}
+
+/// Replace normalizer
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
+#[text_signature = "(self, pattern, content)"]
+pub struct PyReplace {}
+#[pymethods]
+impl PyReplace {
+    #[new]
+    fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyNormalizer)> {
+        Ok((
+            PyReplace {},
+            ToPyResult(Replace::new(pattern, content)).into_py()?.into(),
+        ))
+    }
+}
+
 #[derive(Clone)]
 pub(crate) struct CustomNormalizer {
     inner: PyObject,
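Of the three classes added above, only Replace takes arguments; a usage sketch (assuming `tokenizers.Regex` is the Python-side wrapper accepted by the `PyPattern` argument; Precompiled, per its own docstring, is not meant to be built by hand):

    from tokenizers import Regex
    from tokenizers.normalizers import Replace

    # Literal pattern
    print(Replace("''", '"').normalize_str("''quoted''"))  # roughly '"quoted"'

    # Regex pattern: collapse whitespace runs into a single space
    print(Replace(Regex(r"\s+"), " ").normalize_str("one   two"))  # roughly "one two"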
@@ -439,57 +491,6 @@ impl Normalizer for PyNormalizerWrapper {
     }
 }

-/// Nmt normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)]
-#[text_signature = "(self)"]
-pub struct PyNmt {}
-#[pymethods]
-impl PyNmt {
-    #[new]
-    fn new() -> PyResult<(Self, PyNormalizer)> {
-        Ok((PyNmt {}, Nmt.into()))
-    }
-}
-
-/// Precompiled normalizer
-/// Don't use manually it is used for compatiblity for SentencePiece.
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)]
-#[text_signature = "(self, precompiled_charsmap)"]
-pub struct PyPrecompiled {}
-#[pymethods]
-impl PyPrecompiled {
-    #[new]
-    fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
-        let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
-        Ok((
-            PyPrecompiled {},
-            Precompiled::from(precompiled_charsmap)
-                .map_err(|e| {
-                    exceptions::PyException::new_err(format!(
-                        "Error while attempting to build Precompiled normalizer: {}",
-                        e
-                    ))
-                })?
-                .into(),
-        ))
-    }
-}
-
-/// Replace normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
-#[text_signature = "(self, pattern, content)"]
-pub struct PyReplace {}
-#[pymethods]
-impl PyReplace {
-    #[new]
-    fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyNormalizer)> {
-        Ok((
-            PyReplace {},
-            ToPyResult(Replace::new(pattern, content)).into_py()?.into(),
-        ))
-    }
-}
-
 #[cfg(test)]
 mod test {
     use pyo3::prelude::*;