Update Python bindings with new interface

2025-08-31 04:29:21 +00:00 · 2020-07-29 16:45:55 -04:00
parent 261a0c6dd8
commit 7833965dc4
8 changed files with 101 additions and 243 deletions
--- a/bindings/node/native/Cargo.lock
+++ b/bindings/node/native/Cargo.lock
@ -281,6 +281,15 @@ dependencies = [
 "either",
 ]
 [[package]]
 name = "itertools"
 version = "0.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
 dependencies = [
 "either",
 ]
 [[package]]
 name = "itoa"
 version = "0.4.5"
@ -584,7 +593,7 @@ version = "0.1.0"
 source = "git+https://github.com/n1t0/rayon-cond#c56e4f1ded0fcb92eac70e0533703bba3ca2983f"
 dependencies = [
 "either",
- "itertools",
+ "itertools 0.8.2",
 "rayon",
 ]
@ -749,6 +758,7 @@ version = "0.10.1"
 dependencies = [
 "clap",
 "indicatif",
 "itertools 0.9.0",
 "lazy_static",
 "onig",
 "rand",
--- a/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
+++ b/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
@ -1,5 +1,4 @@
 from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
 from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
 from typing import List, Union, Tuple, Optional, Dict
--- a/bindings/python/py_src/tokenizers/models/init.py
+++ b/bindings/python/py_src/tokenizers/models/init.py
@ -2,9 +2,6 @@ from typing import List, Tuple
 from .. import models, Offsets
 TokenizedSequence = List[str]
 TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
 Model = models.Model
 BPE = models.BPE
 WordPiece = models.WordPiece
--- a/bindings/python/py_src/tokenizers/models/init.pyi
+++ b/bindings/python/py_src/tokenizers/models/init.pyi
@ -1,9 +1,6 @@
 from .. import Encoding, Offsets
 from typing import List, Optional, Union, Tuple
 TokenizedSequence = List[str]
 TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
 class Model:
    """ Base class for all models
@ -19,57 +16,6 @@ class Model:
        Any file with the same name that already exist in this folder will be overwritten.
        """
        pass
    def encode(
        self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
    ) -> Encoding:
        """ Encode the given sequence.
        A sequence can either be:
            - `TokenizedSequence`: (`List[str]`)
            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
            a Tuple[int, int].
        If the Offsets are not provided, they will be automatically generated, under the assumption
        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
        Args:
            sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
                Either a TokenizedSequence or a TokenizedSequenceWithOffsets
            type_id: int:
                The type id of the given sequence
        Returns:
            An Encoding
        """
        pass
    def encode_batch(
        self,
        sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
        type_id: int = 0,
    ) -> List[Encoding]:
        """ Encode the given batch of sequences.
        A sequence can either be:
            - `TokenizedSequence`: (`List[str]`)
            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
            a Tuple[int, int].
        If the Offsets are not provided, they will be automatically generated, under the assumption
        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
        Args:
            sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
                A list of sequences. Each sequence is either a TokenizedSequence or a
                TokenizedSequenceWithOffsets
            type_id: int:
                The type id of the given sequences
        Returns:
            A list of Encoding
        """
        pass
 class BPE(Model):
    """BytePairEncoding model class
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@ -75,11 +75,7 @@ impl Encoding {
    #[args(growing_offsets = true)]
    fn merge(encodings: Vec<PyRef<Encoding>>, growing_offsets: bool) -> Encoding {
        tk::tokenizer::Encoding::merge(
-            encodings
+            encodings.into_iter().map(|e| e.encoding.clone()),
                .into_iter()
                .map(|e| e.encoding.clone())
                .collect::<Vec<_>>()
                .as_slice(),
            growing_offsets,
        )
        .into()
--- a/bindings/python/src/error.rs
+++ b/bindings/python/src/error.rs
@ -27,3 +27,8 @@ impl<T> std::convert::Into<PyResult<T>> for ToPyResult<T> {
            .map_err(|e| exceptions::Exception::py_err(format!("{}", e)))
    }
 }
 impl<T> ToPyResult<T> {
    /// Converts this wrapper into a `PyResult<T>`.
    ///
    /// This is a named shorthand for the `Into<PyResult<T>>` conversion
    /// implemented for `ToPyResult<T>`; the target type is spelled out here so
    /// callers do not have to annotate it themselves.
    pub fn into_py(self) -> PyResult<T> {
        Into::<PyResult<T>>::into(self)
    }
 }
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@ -1,78 +1,11 @@
 extern crate tokenizers as tk;
 use super::encoding::Encoding;
 use super::error::ToPyResult;
 use super::utils::Container;
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;
 use std::path::Path;
 use tk::parallelism::*;
 // Input accepted by `Model.encode` / `Model.encode_batch`: a list of tokens,
 // each paired with its `(start, end)` offsets.
 // (Plain `//` comments are used on purpose so the generated Python class
 // docstring is left untouched.)
 #[pyclass]
 struct EncodeInput {
    // One `(token, (start, end))` entry per token.
    sequence: Vec<(String, (usize, usize))>,
 }
 impl EncodeInput {
    /// Consumes the wrapper and hands back the owned `(token, offsets)` pairs.
    pub fn into_input(self) -> Vec<(String, (usize, usize))> {
        let EncodeInput { sequence } = self;
        sequence
    }
 }
 impl<'source> FromPyObject<'source> for EncodeInput {
    fn extract(ob: &'source PyAny) -> PyResult<Self> {
        let sequence: &PyList = ob.downcast()?;
        enum Mode {
            NoOffsets,
            Offsets,
        };
        let mode = sequence
            .iter()
            .next()
            .map(|item| {
                if item.extract::<String>().is_ok() {
                    Ok(Mode::NoOffsets)
                } else if item.extract::<(String, (usize, usize))>().is_ok() {
                    Ok(Mode::Offsets)
                } else {
                    Err(exceptions::ValueError::py_err(
                        "Input must be a list[str] or list[(str, (int, int))]",
                    ))
                }
            })
            .unwrap()?;
        let mut total_len = 0;
        let sequence = sequence
            .iter()
            .enumerate()
            .map(|(i, item)| match mode {
                Mode::NoOffsets => item
                    .extract::<String>()
                    .map_err(|_| {
                        exceptions::ValueError::py_err(format!(
                            "Value at index {} should be a `str`",
                            i
                        ))
                    })
                    .map(|s| {
                        let len = s.chars().count();
                        total_len += len;
                        (s, (total_len - len, total_len))
                    }),
                Mode::Offsets => item.extract::<(String, (usize, usize))>().map_err(|_| {
                    exceptions::ValueError::py_err(format!(
                        "Value at index {} should be a `(str, (int, int))`",
                        i
                    ))
                }),
            })
            .collect::<Result<Vec<_>, PyErr>>()?;
        Ok(EncodeInput { sequence })
    }
 }
 /// A Model represents some tokenization algorithm like BPE or Word
 /// This class cannot be constructed directly. Please use one of the concrete models.
@ -133,42 +66,6 @@ impl Model {
            .map(|path| path.to_string_lossy().into_owned())
            .collect())
    }
    // Encodes one pre-tokenized sequence with the wrapped model.
    // An empty sequence short-circuits to a default (empty) encoding without
    // calling the model. Tokenization errors are converted through `ToPyResult`.
    #[args(type_id = 0)]
    fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
        let pairs = sequence.into_input();
        if pairs.is_empty() {
            return Ok(tk::tokenizer::Encoding::default().into());
        }
        let outcome = self.model.execute(|model| {
            model
                .tokenize(pairs)
                .map(|tokens| tk::tokenizer::Encoding::from_tokens(tokens, type_id).into())
        });
        ToPyResult(outcome).into()
    }
    // Encodes a batch of pre-tokenized sequences with the wrapped model.
    // Each sequence is handled independently through `into_maybe_par_iter`;
    // empty sequences map to a default (empty) encoding without calling the
    // model. Collecting into a `Result` keeps an error if any item fails.
    #[args(type_id = 0)]
    fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
        let outcome = self.model.execute(|model| {
            sequences
                .into_maybe_par_iter()
                .map(|input| {
                    let pairs = input.into_input();
                    if !pairs.is_empty() {
                        model.tokenize(pairs).map(|tokens| {
                            tk::tokenizer::Encoding::from_tokens(tokens, type_id).into()
                        })
                    } else {
                        Ok(tk::tokenizer::Encoding::default().into())
                    }
                })
                .collect::<Result<_, _>>()
        });
        ToPyResult(outcome).into()
    }
 }
 /// BPE Model
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@ -1,12 +1,11 @@
 extern crate tokenizers as tk;
-use super::error::{PyError, ToPyResult};
+use super::error::ToPyResult;
 use super::utils::Container;
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;
-use serde::{Deserialize, Deserializer, Serialize, Serializer};
+use tk::tokenizer::Offsets;
 use tk::tokenizer::{Offsets, Result};
 #[pyclass(dict, module = "tokenizers.pre_tokenizers")]
 pub struct PreTokenizer {
@ -14,13 +13,13 @@ pub struct PreTokenizer {
 }
 #[pymethods]
 impl PreTokenizer {
-    #[staticmethod]
+    // #[staticmethod]
-    fn custom(pretok: PyObject) -> PyResult<Self> {
+    // fn custom(pretok: PyObject) -> PyResult<Self> {
-        let py_pretok = PyPreTokenizer::new(pretok)?;
+    //     let py_pretok = PyPreTokenizer::new(pretok)?;
-        Ok(PreTokenizer {
+    //     Ok(PreTokenizer {
-            pretok: Container::Owned(Box::new(py_pretok)),
+    //         pretok: Container::Owned(Box::new(py_pretok)),
-        })
+    //     })
-    }
+    // }
    fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
        let data = self
@ -52,13 +51,20 @@ impl PreTokenizer {
    }
    fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
-        // TODO: Expose the NormalizedString
+        // TODO: Expose the PreTokenizedString
-        let mut normalized = tk::tokenizer::NormalizedString::from(s);
+        let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s);
        ToPyResult(
            self.pretok
-                .execute(|pretok| pretok.pre_tokenize(&mut normalized)),
+                .execute(|pretok| pretok.pre_tokenize(&mut pretokenized)),
        )
-        .into()
+        .into_py()?;
        Ok(pretokenized
            .get_normalized(true)
            .into_iter()
            .map(|(s, o)| (s.to_owned(), o))
            .collect())
    }
 }
@ -108,7 +114,9 @@ impl Whitespace {
        Ok((
            Whitespace {},
            PreTokenizer {
-                pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace)),
+                pretok: Container::Owned(Box::new(
                    tk::pre_tokenizers::whitespace::Whitespace::default(),
                )),
            },
        ))
    }
@ -209,64 +217,64 @@ impl Metaspace {
    }
 }
-struct PyPreTokenizer {
+// struct PyPreTokenizer {
-    class: PyObject,
+//     class: PyObject,
-}
+// }
-
+//
-impl PyPreTokenizer {
+// impl PyPreTokenizer {
-    pub fn new(class: PyObject) -> PyResult<Self> {
+//     pub fn new(class: PyObject) -> PyResult<Self> {
-        Ok(PyPreTokenizer { class })
+//         Ok(PyPreTokenizer { class })
-    }
+//     }
-}
+// }
-
+//
-#[typetag::serde]
+// #[typetag::serde]
-impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
+// impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
-    fn pre_tokenize(
+//     fn pre_tokenize(
-        &self,
+//         &self,
-        sentence: &mut tk::tokenizer::NormalizedString,
+//         sentence: &mut tk::tokenizer::NormalizedString,
-    ) -> Result<Vec<(String, Offsets)>> {
+//     ) -> Result<Vec<(String, Offsets)>> {
-        let gil = Python::acquire_gil();
+//         let gil = Python::acquire_gil();
-        let py = gil.python();
+//         let py = gil.python();
-
+//
-        let args = PyTuple::new(py, &[sentence.get()]);
+//         let args = PyTuple::new(py, &[sentence.get()]);
-        match self.class.call_method(py, "pre_tokenize", args, None) {
+//         match self.class.call_method(py, "pre_tokenize", args, None) {
-            Ok(res) => Ok(res
+//             Ok(res) => Ok(res
-                .cast_as::<PyList>(py)
+//                 .cast_as::<PyList>(py)
-                .map_err(|_| {
+//                 .map_err(|_| {
-                    PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
+//                     PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
-                })?
+//                 })?
-                .extract::<Vec<(String, Offsets)>>()
+//                 .extract::<Vec<(String, Offsets)>>()
-                .map_err(|_| {
+//                 .map_err(|_| {
-                    PyError::from(
+//                     PyError::from(
-                        "`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
+//                         "`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
-                    )
+//                     )
-                })?),
+//                 })?),
-            Err(e) => {
+//             Err(e) => {
-                e.print(py);
+//                 e.print(py);
-                Err(Box::new(PyError::from(
+//                 Err(Box::new(PyError::from(
-                    "Error while calling `pre_tokenize`",
+//                     "Error while calling `pre_tokenize`",
-                )))
+//                 )))
-            }
+//             }
-        }
+//         }
-    }
+//     }
-}
+// }
-
+//
-impl Serialize for PyPreTokenizer {
+// impl Serialize for PyPreTokenizer {
-    fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
+//     fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
-    where
+//     where
-        S: Serializer,
+//         S: Serializer,
-    {
+//     {
-        Err(serde::ser::Error::custom(
+//         Err(serde::ser::Error::custom(
-            "Custom PyPreTokenizer cannot be serialized",
+//             "Custom PyPreTokenizer cannot be serialized",
-        ))
+//         ))
-    }
+//     }
-}
+// }
-
+//
-impl<'de> Deserialize<'de> for PyPreTokenizer {
+// impl<'de> Deserialize<'de> for PyPreTokenizer {
-    fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
+//     fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
-    where
+//     where
-        D: Deserializer<'de>,
+//         D: Deserializer<'de>,
-    {
+//     {
-        unimplemented!("PyPreTokenizer cannot be deserialized")
+//         unimplemented!("PyPreTokenizer cannot be deserialized")
-    }
+//     }
-}
+// }