Python - Add Model.encode_batch and improve typings

This commit is contained in:
Anthony MOI
2020-03-24 15:54:51 -04:00
parent 2f310f3c25
commit eec74ca3e6
5 changed files with 162 additions and 45 deletions

View File

@ -575,6 +575,7 @@ name = "tokenizers-python"
version = "0.6.0" version = "0.6.0"
dependencies = [ dependencies = [
"pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)", "pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"tokenizers 0.8.0", "tokenizers 0.8.0",
] ]

View File

@ -8,6 +8,9 @@ edition = "2018"
name = "tokenizers" name = "tokenizers"
crate-type = ["cdylib"] crate-type = ["cdylib"]
[dependencies]
rayon = "1.2.0"
[dependencies.pyo3] [dependencies.pyo3]
version = "0.8.4" version = "0.8.4"
features = ["extension-module"] features = ["extension-module"]

View File

@ -6,42 +6,22 @@ use super::utils::Container;
use pyo3::exceptions; use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::*; use pyo3::types::*;
use rayon::prelude::*;
use std::path::Path; use std::path::Path;
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass] #[pyclass]
pub struct Model { struct EncodeInput {
pub model: Container<dyn tk::tokenizer::Model + Sync>, sequence: Vec<(String, (usize, usize))>,
}
impl EncodeInput {
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
self.sequence
}
} }
#[pymethods] impl<'source> FromPyObject<'source> for EncodeInput {
impl Model { fn extract(ob: &'source PyAny) -> PyResult<Self> {
#[new] let sequence: &PyList = ob.downcast_ref()?;
fn new(_obj: &PyRawObject) -> PyResult<()> {
Err(exceptions::Exception::py_err(
"Cannot create a Model directly. Use a concrete subclass",
))
}
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
let saved: PyResult<Vec<_>> = ToPyResult(
self.model
.execute(|model| model.save(Path::new(folder), name)),
)
.into();
Ok(saved?
.into_iter()
.map(|path| path.to_string_lossy().into_owned())
.collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: &PyList, type_id: u32) -> PyResult<Encoding> {
if sequence.is_empty() {
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
}
enum Mode { enum Mode {
NoOffsets, NoOffsets,
@ -90,6 +70,47 @@ impl Model {
}) })
.collect::<Result<Vec<_>, PyErr>>()?; .collect::<Result<Vec<_>, PyErr>>()?;
Ok(EncodeInput { sequence })
}
}
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass]
pub struct Model {
pub model: Container<dyn tk::tokenizer::Model + Sync>,
}
#[pymethods]
impl Model {
#[new]
fn new(_obj: &PyRawObject) -> PyResult<()> {
Err(exceptions::Exception::py_err(
"Cannot create a Model directly. Use a concrete subclass",
))
}
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
let saved: PyResult<Vec<_>> = ToPyResult(
self.model
.execute(|model| model.save(Path::new(folder), name)),
)
.into();
Ok(saved?
.into_iter()
.map(|path| path.to_string_lossy().into_owned())
.collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
let sequence = sequence.into_input();
if sequence.is_empty() {
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
}
ToPyResult(self.model.execute(|model| { ToPyResult(self.model.execute(|model| {
model model
.tokenize(sequence) .tokenize(sequence)
@ -97,6 +118,26 @@ impl Model {
})) }))
.into() .into()
} }
#[args(type_id = 0)]
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
ToPyResult(self.model.execute(|model| {
sequences
.into_par_iter()
.map(|sequence| {
let sequence = sequence.into_input();
if sequence.is_empty() {
Ok(Encoding::new(tk::tokenizer::Encoding::default()))
} else {
model.tokenize(sequence).map(|tokens| {
Encoding::new(tk::tokenizer::Encoding::from_tokens(tokens, type_id))
})
}
})
.collect::<Result<_, _>>()
}))
.into()
}
} }
/// BPE Model /// BPE Model

View File

@ -1,4 +1,5 @@
from .. import Tokenizer, Encoding from tokenizers import Tokenizer, Encoding
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional from typing import List, Union, Tuple, Optional
@ -139,15 +140,22 @@ class BaseTokenizer:
return self._tokenizer.normalize(sequence) return self._tokenizer.normalize(sequence)
def encode_tokenized( def encode_tokenized(
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0 self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding: ) -> Encoding:
""" Encode the given tokenized sequence. Let us skip the Normalizer and PreTokenizer """ Encode the given sequence. Let us skip the Normalizer and PreTokenizer by providing
by providing already tokenized substrings. already tokenized substrings.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args: Args:
sequence: Union[List[str], List[Tuple[str, Offsets]]]: sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a list of strings, or a list of tuples (string, offsets) where offset Either a TokenizedSequence or a TokenizedSequenceWithOffsets
is a tuple (int, int)
type_id: int: type_id: int:
The type id of the given sequence The type id of the given sequence
@ -157,6 +165,35 @@ class BaseTokenizer:
""" """
return self._tokenizer.model.encode(sequence) return self._tokenizer.model.encode(sequence)
def encode_tokenized_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence. Let us skip the Normalizer and PreTokenizer by
providing already tokenized substrings.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
return self._tokenizer.model.encode_batch(sequences)
def encode( def encode(
self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
) -> Encoding: ) -> Encoding:

View File

@ -1,7 +1,8 @@
from .. import Encoding from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple from typing import List, Optional, Union, Tuple
Offsets = Tuple[int, int] TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model: class Model:
""" Base class for all models """ Base class for all models
@ -19,14 +20,21 @@ class Model:
""" """
pass pass
def encode( def encode(
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0 self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding: ) -> Encoding:
""" Encode the given list of string or tuples (string, offsets) """ Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args: Args:
sequence: Union[List[str], List[Tuple[str, Tuple[int, int]]]]: sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a list of strings, or a list of tuples (string, offsets) where offset Either a TokenizedSequence or a TokenizedSequenceWithOffsets
is a tuple (int, int)
type_id: int: type_id: int:
The type id of the given sequence The type id of the given sequence
@ -35,6 +43,33 @@ class Model:
An Encoding An Encoding
""" """
pass pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model): class BPE(Model):
""" BytePairEncoding model class """ """ BytePairEncoding model class """