Update Python bindings with new interface

This commit is contained in:
Anthony MOI
2020-07-29 16:45:55 -04:00
committed by Anthony MOI
parent 261a0c6dd8
commit 7833965dc4
8 changed files with 101 additions and 243 deletions

View File

@@ -281,6 +281,15 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "0.4.5"
@@ -584,7 +593,7 @@ version = "0.1.0"
source = "git+https://github.com/n1t0/rayon-cond#c56e4f1ded0fcb92eac70e0533703bba3ca2983f"
dependencies = [
"either",
"itertools",
"itertools 0.8.2",
"rayon",
]
@@ -749,6 +758,7 @@ version = "0.10.1"
dependencies = [
"clap",
"indicatif",
"itertools 0.9.0",
"lazy_static",
"onig",
"rand",

View File

@@ -1,5 +1,4 @@
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional, Dict

View File

@@ -2,9 +2,6 @@ from typing import List, Tuple
from .. import models, Offsets
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
Model = models.Model
BPE = models.BPE
WordPiece = models.WordPiece

View File

@@ -1,9 +1,6 @@
from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model:
""" Base class for all models
@@ -19,57 +16,6 @@ class Model:
Any file with the same name that already exist in this folder will be overwritten.
"""
pass
def encode(
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
An Encoding
"""
pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequence. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model):
"""BytePairEncoding model class

View File

@@ -75,11 +75,7 @@ impl Encoding {
#[args(growing_offsets = true)]
fn merge(encodings: Vec<PyRef<Encoding>>, growing_offsets: bool) -> Encoding {
tk::tokenizer::Encoding::merge(
encodings
.into_iter()
.map(|e| e.encoding.clone())
.collect::<Vec<_>>()
.as_slice(),
encodings.into_iter().map(|e| e.encoding.clone()),
growing_offsets,
)
.into()

View File

@@ -27,3 +27,8 @@ impl<T> std::convert::Into<PyResult<T>> for ToPyResult<T> {
.map_err(|e| exceptions::Exception::py_err(format!("{}", e)))
}
}
impl<T> ToPyResult<T> {
pub fn into_py(self) -> PyResult<T> {
self.into()
}
}

View File

@@ -1,78 +1,11 @@
extern crate tokenizers as tk;
use super::encoding::Encoding;
use super::error::ToPyResult;
use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use std::path::Path;
use tk::parallelism::*;
#[pyclass]
struct EncodeInput {
sequence: Vec<(String, (usize, usize))>,
}
impl EncodeInput {
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
self.sequence
}
}
impl<'source> FromPyObject<'source> for EncodeInput {
fn extract(ob: &'source PyAny) -> PyResult<Self> {
let sequence: &PyList = ob.downcast()?;
enum Mode {
NoOffsets,
Offsets,
};
let mode = sequence
.iter()
.next()
.map(|item| {
if item.extract::<String>().is_ok() {
Ok(Mode::NoOffsets)
} else if item.extract::<(String, (usize, usize))>().is_ok() {
Ok(Mode::Offsets)
} else {
Err(exceptions::ValueError::py_err(
"Input must be a list[str] or list[(str, (int, int))]",
))
}
})
.unwrap()?;
let mut total_len = 0;
let sequence = sequence
.iter()
.enumerate()
.map(|(i, item)| match mode {
Mode::NoOffsets => item
.extract::<String>()
.map_err(|_| {
exceptions::ValueError::py_err(format!(
"Value at index {} should be a `str`",
i
))
})
.map(|s| {
let len = s.chars().count();
total_len += len;
(s, (total_len - len, total_len))
}),
Mode::Offsets => item.extract::<(String, (usize, usize))>().map_err(|_| {
exceptions::ValueError::py_err(format!(
"Value at index {} should be a `(str, (int, int))`",
i
))
}),
})
.collect::<Result<Vec<_>, PyErr>>()?;
Ok(EncodeInput { sequence })
}
}
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
@@ -133,42 +66,6 @@ impl Model {
.map(|path| path.to_string_lossy().into_owned())
.collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
let sequence = sequence.into_input();
if sequence.is_empty() {
return Ok(tk::tokenizer::Encoding::default().into());
}
ToPyResult(self.model.execute(|model| {
model
.tokenize(sequence)
.map(|tokens| tk::tokenizer::Encoding::from_tokens(tokens, type_id).into())
}))
.into()
}
#[args(type_id = 0)]
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
ToPyResult(self.model.execute(|model| {
sequences
.into_maybe_par_iter()
.map(|sequence| {
let sequence = sequence.into_input();
if sequence.is_empty() {
Ok(tk::tokenizer::Encoding::default().into())
} else {
model.tokenize(sequence).map(|tokens| {
tk::tokenizer::Encoding::from_tokens(tokens, type_id).into()
})
}
})
.collect::<Result<_, _>>()
}))
.into()
}
}
/// BPE Model

View File

@@ -1,12 +1,11 @@
extern crate tokenizers as tk;
use super::error::{PyError, ToPyResult};
use super::error::ToPyResult;
use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::tokenizer::{Offsets, Result};
use tk::tokenizer::Offsets;
#[pyclass(dict, module = "tokenizers.pre_tokenizers")]
pub struct PreTokenizer {
@@ -14,13 +13,13 @@ pub struct PreTokenizer {
}
#[pymethods]
impl PreTokenizer {
#[staticmethod]
fn custom(pretok: PyObject) -> PyResult<Self> {
let py_pretok = PyPreTokenizer::new(pretok)?;
Ok(PreTokenizer {
pretok: Container::Owned(Box::new(py_pretok)),
})
}
// #[staticmethod]
// fn custom(pretok: PyObject) -> PyResult<Self> {
// let py_pretok = PyPreTokenizer::new(pretok)?;
// Ok(PreTokenizer {
// pretok: Container::Owned(Box::new(py_pretok)),
// })
// }
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
let data = self
@@ -52,13 +51,20 @@ impl PreTokenizer {
}
fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
// TODO: Expose the NormalizedString
let mut normalized = tk::tokenizer::NormalizedString::from(s);
// TODO: Expose the PreTokenizedString
let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s);
ToPyResult(
self.pretok
.execute(|pretok| pretok.pre_tokenize(&mut normalized)),
.execute(|pretok| pretok.pre_tokenize(&mut pretokenized)),
)
.into()
.into_py()?;
Ok(pretokenized
.get_normalized(true)
.into_iter()
.map(|(s, o)| (s.to_owned(), o))
.collect())
}
}
@@ -108,7 +114,9 @@ impl Whitespace {
Ok((
Whitespace {},
PreTokenizer {
pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace)),
pretok: Container::Owned(Box::new(
tk::pre_tokenizers::whitespace::Whitespace::default(),
)),
},
))
}
@@ -209,64 +217,64 @@ impl Metaspace {
}
}
struct PyPreTokenizer {
class: PyObject,
}
impl PyPreTokenizer {
pub fn new(class: PyObject) -> PyResult<Self> {
Ok(PyPreTokenizer { class })
}
}
#[typetag::serde]
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
fn pre_tokenize(
&self,
sentence: &mut tk::tokenizer::NormalizedString,
) -> Result<Vec<(String, Offsets)>> {
let gil = Python::acquire_gil();
let py = gil.python();
let args = PyTuple::new(py, &[sentence.get()]);
match self.class.call_method(py, "pre_tokenize", args, None) {
Ok(res) => Ok(res
.cast_as::<PyList>(py)
.map_err(|_| {
PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
})?
.extract::<Vec<(String, Offsets)>>()
.map_err(|_| {
PyError::from(
"`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
)
})?),
Err(e) => {
e.print(py);
Err(Box::new(PyError::from(
"Error while calling `pre_tokenize`",
)))
}
}
}
}
impl Serialize for PyPreTokenizer {
fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: Serializer,
{
Err(serde::ser::Error::custom(
"Custom PyPreTokenizer cannot be serialized",
))
}
}
impl<'de> Deserialize<'de> for PyPreTokenizer {
fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
where
D: Deserializer<'de>,
{
unimplemented!("PyPreTokenizer cannot be deserialized")
}
}
// struct PyPreTokenizer {
// class: PyObject,
// }
//
// impl PyPreTokenizer {
// pub fn new(class: PyObject) -> PyResult<Self> {
// Ok(PyPreTokenizer { class })
// }
// }
//
// #[typetag::serde]
// impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
// fn pre_tokenize(
// &self,
// sentence: &mut tk::tokenizer::NormalizedString,
// ) -> Result<Vec<(String, Offsets)>> {
// let gil = Python::acquire_gil();
// let py = gil.python();
//
// let args = PyTuple::new(py, &[sentence.get()]);
// match self.class.call_method(py, "pre_tokenize", args, None) {
// Ok(res) => Ok(res
// .cast_as::<PyList>(py)
// .map_err(|_| {
// PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
// })?
// .extract::<Vec<(String, Offsets)>>()
// .map_err(|_| {
// PyError::from(
// "`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
// )
// })?),
// Err(e) => {
// e.print(py);
// Err(Box::new(PyError::from(
// "Error while calling `pre_tokenize`",
// )))
// }
// }
// }
// }
//
// impl Serialize for PyPreTokenizer {
// fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
// where
// S: Serializer,
// {
// Err(serde::ser::Error::custom(
// "Custom PyPreTokenizer cannot be serialized",
// ))
// }
// }
//
// impl<'de> Deserialize<'de> for PyPreTokenizer {
// fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
// where
// D: Deserializer<'de>,
// {
// unimplemented!("PyPreTokenizer cannot be deserialized")
// }
// }