Python - Add NormalizedString + doc/typings

This commit is contained in:
Anthony MOI
2020-01-06 17:55:22 -05:00
parent 6de04bbaea
commit 0079a7a6b7
4 changed files with 93 additions and 7 deletions

View File

@ -1,6 +1,7 @@
extern crate tokenizers as tk;
use crate::error::PyError;
use crate::normalized_string::NormalizedString;
use pyo3::prelude::*;
use pyo3::types::*;
use tk::tokenizer::PaddingDirection;
@ -20,13 +21,8 @@ impl Encoding {
#[pymethods]
impl Encoding {
#[getter]
fn get_original(&self) -> String {
self.encoding.get_normalized().get_original().to_owned()
}
#[getter]
fn get_normalized(&self) -> String {
self.encoding.get_normalized().get().to_owned()
fn get_normalized(&self) -> NormalizedString {
NormalizedString::new(self.encoding.get_normalized().clone())
}
#[args(kwargs = "**")]

View File

@ -2,6 +2,7 @@ mod decoders;
mod encoding;
mod error;
mod models;
mod normalized_string;
mod normalizers;
mod pre_tokenizers;
mod processors;
@ -70,6 +71,8 @@ fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
#[pymodule]
fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<tokenizer::Tokenizer>()?;
m.add_class::<encoding::Encoding>()?;
m.add_class::<normalized_string::NormalizedString>()?;
m.add_wrapped(wrap_pymodule!(models))?;
m.add_wrapped(wrap_pymodule!(pre_tokenizers))?;
m.add_wrapped(wrap_pymodule!(decoders))?;

View File

@ -0,0 +1,35 @@
extern crate tokenizers as tk;
use pyo3::prelude::*;
/// Python-visible wrapper around the tokenizers crate's `NormalizedString`.
///
/// `#[pyclass]` exposes it as the `NormalizedString` class in the Python
/// extension module (registered in `lib.rs`). `#[repr(transparent)]` keeps the
/// wrapper layout-identical to the single wrapped field.
#[pyclass]
#[repr(transparent)]
pub struct NormalizedString {
    // The wrapped Rust-side normalized string; not directly visible to Python.
    s: tk::tokenizer::NormalizedString,
}
/// Rust-only construction helpers (not exposed to Python).
impl NormalizedString {
    /// Wraps a tokenizers-crate `NormalizedString` for exposure to Python.
    pub fn new(s: tk::tokenizer::NormalizedString) -> NormalizedString {
        NormalizedString { s }
    }
}

// Idiomatic conversion; gives callers `Into<NormalizedString>` for free while
// keeping the existing `new` constructor for current call sites.
impl From<tk::tokenizer::NormalizedString> for NormalizedString {
    fn from(s: tk::tokenizer::NormalizedString) -> Self {
        NormalizedString { s }
    }
}
/// Methods exposed to Python. `#[getter]` on `get_original`/`get_normalized`
/// surfaces them as the `original`/`normalized` properties.
#[pymethods]
impl NormalizedString {
    /// The string as it was before normalization.
    #[getter]
    fn get_original(&self) -> String {
        String::from(self.s.get_original())
    }

    /// The string after normalization.
    #[getter]
    fn get_normalized(&self) -> String {
        String::from(self.s.get())
    }

    /// Returns the `start..end` slice of the normalized string, or `None`
    /// when the bounds are not valid for it.
    fn get_range(&self, start: usize, end: usize) -> Option<String> {
        let range = start..end;
        self.s.get_range(range).map(String::from)
    }

    /// Returns the part of the original string that corresponds to the
    /// `start..end` range of the normalized string, or `None` when the
    /// bounds are not valid.
    fn get_range_original(&self, start: usize, end: usize) -> Option<String> {
        let range = start..end;
        self.s.get_range_original(range).map(String::from)
    }
}

View File

Offsets = Tuple[int, int]


class NormalizedString:
    """A string together with its normalized form, as produced during normalization.

    Keeps the alignment between the normalized string and the original one, so
    that ranges of the normalized string can be mapped back to the original.
    """

    @property
    def original(self) -> str:
        """The original string, before normalization"""
        pass

    @property
    def normalized(self) -> str:
        """The normalized string"""
        pass

    def get_range(self, start: int, end: int) -> Optional[str]:
        """Return a range of the normalized string, if the bounds are correct

        Args:
            start: int:
                The starting offset in the string

            end: int:
                The ending offset in the string

        Returns:
            The substring if the bounds are correct, None otherwise
        """
        pass

    def get_range_original(self, start: int, end: int) -> Optional[str]:
        """Return a range of the original string, if the bounds are correct

        The given bounds are supposed to be after-normalization offsets.
        Provided with the `Encoding.offsets` associated with an `Encoding.ids` unit,
        this method will return the part of the original string corresponding to the id.

        Args:
            start: int:
                The starting offset in the normalized string

            end: int:
                The ending offset in the normalized string

        Returns:
            The substring if the bounds are correct, None otherwise
        """
        pass
class Encoding:
""" An Encoding as returned by the Tokenizer """
@property
def normalized(self) -> NormalizedString:
""" The NormalizedString """
pass
@property
def ids(self) -> List[int]:
""" The tokenized ids """