Python - Add Model.encode_batch and improve typings

This commit is contained in:
Anthony MOI
2020-03-24 15:54:51 -04:00
parent 2f310f3c25
commit eec74ca3e6
5 changed files with 162 additions and 45 deletions

View File

@ -575,6 +575,7 @@ name = "tokenizers-python"
version = "0.6.0"
dependencies = [
"pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"tokenizers 0.8.0",
]

View File

@ -8,6 +8,9 @@ edition = "2018"
name = "tokenizers"
crate-type = ["cdylib"]
[dependencies]
rayon = "1.2.0"
[dependencies.pyo3]
version = "0.8.4"
features = ["extension-module"]

View File

@ -6,42 +6,22 @@ use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use rayon::prelude::*;
use std::path::Path;
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass]
pub struct Model {
pub model: Container<dyn tk::tokenizer::Model + Sync>,
/// Pre-tokenized input as extracted from a Python list: each entry is a token
/// string paired with its (start, end) offsets in the original string.
struct EncodeInput {
    sequence: Vec<(String, (usize, usize))>,
}
impl EncodeInput {
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
self.sequence
}
}
#[pymethods]
impl Model {
#[new]
fn new(_obj: &PyRawObject) -> PyResult<()> {
Err(exceptions::Exception::py_err(
"Cannot create a Model directly. Use a concrete subclass",
))
}
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
let saved: PyResult<Vec<_>> = ToPyResult(
self.model
.execute(|model| model.save(Path::new(folder), name)),
)
.into();
Ok(saved?
.into_iter()
.map(|path| path.to_string_lossy().into_owned())
.collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: &PyList, type_id: u32) -> PyResult<Encoding> {
if sequence.is_empty() {
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
}
impl<'source> FromPyObject<'source> for EncodeInput {
fn extract(ob: &'source PyAny) -> PyResult<Self> {
let sequence: &PyList = ob.downcast_ref()?;
enum Mode {
NoOffsets,
@ -90,6 +70,47 @@ impl Model {
})
.collect::<Result<Vec<_>, PyErr>>()?;
Ok(EncodeInput { sequence })
}
}
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass]
pub struct Model {
    // NOTE(review): `Container` is a project-local wrapper around the trait
    // object; presumably it allows the model to be handed off to a Tokenizer
    // later — confirm against utils::Container.
    pub model: Container<dyn tk::tokenizer::Model + Sync>,
}
#[pymethods]
impl Model {
#[new]
fn new(_obj: &PyRawObject) -> PyResult<()> {
    // Model is abstract on the Python side: direct construction always raises;
    // users must instantiate a concrete subclass instead.
    Err(exceptions::Exception::py_err(
        "Cannot create a Model directly. Use a concrete subclass",
    ))
}
/// Save the model's files into `folder`, optionally prefixing them with
/// `name`. Returns the list of written file paths as strings.
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
    // ToPyResult converts the tokenizer-side error into a Python exception.
    let saved: PyResult<Vec<_>> = ToPyResult(
        self.model
            .execute(|model| model.save(Path::new(folder), name)),
    )
    .into();
    // PathBufs are converted lossily to UTF-8 strings for the Python caller.
    Ok(saved?
        .into_iter()
        .map(|path| path.to_string_lossy().into_owned())
        .collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
let sequence = sequence.into_input();
if sequence.is_empty() {
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
}
ToPyResult(self.model.execute(|model| {
model
.tokenize(sequence)
@ -97,6 +118,26 @@ impl Model {
}))
.into()
}
/// Encode a batch of pre-tokenized sequences, producing one `Encoding` per
/// input sequence. Tokenization of the sequences runs in parallel via rayon.
#[args(type_id = 0)]
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
    ToPyResult(self.model.execute(|model| {
        sequences
            // rayon parallel iterator: each sequence is tokenized independently.
            .into_par_iter()
            .map(|sequence| {
                let sequence = sequence.into_input();
                if sequence.is_empty() {
                    // Same behavior as `encode`: an empty input yields an empty
                    // default Encoding rather than an error.
                    Ok(Encoding::new(tk::tokenizer::Encoding::default()))
                } else {
                    model.tokenize(sequence).map(|tokens| {
                        Encoding::new(tk::tokenizer::Encoding::from_tokens(tokens, type_id))
                    })
                }
            })
            // Collecting into Result short-circuits on the first tokenization error.
            .collect::<Result<_, _>>()
    }))
    .into()
}
}
/// BPE Model

View File

@ -1,4 +1,5 @@
from .. import Tokenizer, Encoding
from tokenizers import Tokenizer, Encoding
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional
@ -139,15 +140,22 @@ class BaseTokenizer:
return self._tokenizer.normalize(sequence)
def encode_tokenized(
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given tokenized sequence. Let us skip the Normalizer and PreTokenizer
by providing already tokenized substrings.
""" Encode the given sequence. Let us skip the Normalizer and PreTokenizer by providing
already tokenized substrings.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[List[str], List[Tuple[str, Offsets]]]:
Either a list of strings, or a list of tuples (string, offsets) where offset
is a tuple (int, int)
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
@ -157,6 +165,35 @@ class BaseTokenizer:
"""
return self._tokenizer.model.encode(sequence)
def encode_tokenized_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequences. Let us skip the Normalizer and PreTokenizer by
providing already tokenized substrings.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
return self._tokenizer.model.encode_batch(sequences)
def encode(
self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
) -> Encoding:

View File

@ -1,7 +1,8 @@
from .. import Encoding
from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple
Offsets = Tuple[int, int]
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model:
""" Base class for all models
@ -19,14 +20,21 @@ class Model:
"""
pass
def encode(
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given list of string or tuples (string, offsets)
""" Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[List[str], List[Tuple[str, Tuple[int, int]]]]:
Either a list of strings, or a list of tuples (string, offsets) where offset
is a tuple (int, int)
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
@ -35,6 +43,33 @@ class Model:
An Encoding
"""
pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequences.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model):
""" BytePairEncoding model class """