Python - Add Model.encode_batch and improve typings

This commit is contained in:
Anthony MOI
2020-03-24 15:54:51 -04:00
parent 2f310f3c25
commit eec74ca3e6
5 changed files with 162 additions and 45 deletions

View File

@ -575,6 +575,7 @@ name = "tokenizers-python"
version = "0.6.0" version = "0.6.0"
dependencies = [ dependencies = [
"pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)", "pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"tokenizers 0.8.0", "tokenizers 0.8.0",
] ]

View File

@ -8,6 +8,9 @@ edition = "2018"
name = "tokenizers" name = "tokenizers"
crate-type = ["cdylib"] crate-type = ["cdylib"]
[dependencies]
rayon = "1.2.0"
[dependencies.pyo3] [dependencies.pyo3]
version = "0.8.4" version = "0.8.4"
features = ["extension-module"] features = ["extension-module"]

View File

@ -6,42 +6,22 @@ use super::utils::Container;
use pyo3::exceptions; use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::types::*; use pyo3::types::*;
use rayon::prelude::*;
use std::path::Path; use std::path::Path;
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass] #[pyclass]
pub struct Model { struct EncodeInput {
pub model: Container<dyn tk::tokenizer::Model + Sync>, sequence: Vec<(String, (usize, usize))>,
}
impl EncodeInput {
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
self.sequence
}
} }
#[pymethods] impl<'source> FromPyObject<'source> for EncodeInput {
impl Model { fn extract(ob: &'source PyAny) -> PyResult<Self> {
#[new] let sequence: &PyList = ob.downcast_ref()?;
fn new(_obj: &PyRawObject) -> PyResult<()> {
Err(exceptions::Exception::py_err(
"Cannot create a Model directly. Use a concrete subclass",
))
}
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
let saved: PyResult<Vec<_>> = ToPyResult(
self.model
.execute(|model| model.save(Path::new(folder), name)),
)
.into();
Ok(saved?
.into_iter()
.map(|path| path.to_string_lossy().into_owned())
.collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: &PyList, type_id: u32) -> PyResult<Encoding> {
if sequence.is_empty() {
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
}
enum Mode { enum Mode {
NoOffsets, NoOffsets,
@ -90,6 +70,47 @@ impl Model {
}) })
.collect::<Result<Vec<_>, PyErr>>()?; .collect::<Result<Vec<_>, PyErr>>()?;
Ok(EncodeInput { sequence })
}
}
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass]
pub struct Model {
pub model: Container<dyn tk::tokenizer::Model + Sync>,
}
#[pymethods]
impl Model {
#[new]
fn new(_obj: &PyRawObject) -> PyResult<()> {
Err(exceptions::Exception::py_err(
"Cannot create a Model directly. Use a concrete subclass",
))
}
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
let saved: PyResult<Vec<_>> = ToPyResult(
self.model
.execute(|model| model.save(Path::new(folder), name)),
)
.into();
Ok(saved?
.into_iter()
.map(|path| path.to_string_lossy().into_owned())
.collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
let sequence = sequence.into_input();
if sequence.is_empty() {
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
}
ToPyResult(self.model.execute(|model| { ToPyResult(self.model.execute(|model| {
model model
.tokenize(sequence) .tokenize(sequence)
@ -97,6 +118,26 @@ impl Model {
})) }))
.into() .into()
} }
#[args(type_id = 0)]
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
ToPyResult(self.model.execute(|model| {
sequences
.into_par_iter()
.map(|sequence| {
let sequence = sequence.into_input();
if sequence.is_empty() {
Ok(Encoding::new(tk::tokenizer::Encoding::default()))
} else {
model.tokenize(sequence).map(|tokens| {
Encoding::new(tk::tokenizer::Encoding::from_tokens(tokens, type_id))
})
}
})
.collect::<Result<_, _>>()
}))
.into()
}
} }
/// BPE Model /// BPE Model

View File

@ -1,4 +1,5 @@
from .. import Tokenizer, Encoding from tokenizers import Tokenizer, Encoding
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional from typing import List, Union, Tuple, Optional
@ -139,15 +140,22 @@ class BaseTokenizer:
return self._tokenizer.normalize(sequence) return self._tokenizer.normalize(sequence)
def encode_tokenized( def encode_tokenized(
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0 self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding: ) -> Encoding:
""" Encode the given tokenized sequence. Let us skip the Normalizer and PreTokenizer """ Encode the given sequence. Let us skip the Normalizer and PreTokenizer by providing
by providing already tokenized substrings. already tokenized substrings.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args: Args:
sequence: Union[List[str], List[Tuple[str, Offsets]]]: sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a list of strings, or a list of tuples (string, offsets) where offset Either a TokenizedSequence or a TokenizedSequenceWithOffsets
is a tuple (int, int)
type_id: int: type_id: int:
The type id of the given sequence The type id of the given sequence
@ -157,6 +165,35 @@ class BaseTokenizer:
""" """
return self._tokenizer.model.encode(sequence) return self._tokenizer.model.encode(sequence)
def encode_tokenized_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence. Let us skip the Normalizer and PreTokenizer by
providing already tokenized substrings.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
return self._tokenizer.model.encode_batch(sequences)
def encode( def encode(
self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
) -> Encoding: ) -> Encoding:

View File

@ -1,7 +1,8 @@
from .. import Encoding from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple from typing import List, Optional, Union, Tuple
Offsets = Tuple[int, int] TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model: class Model:
""" Base class for all models """ Base class for all models
@ -19,14 +20,21 @@ class Model:
""" """
pass pass
def encode( def encode(
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0 self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding: ) -> Encoding:
""" Encode the given list of string or tuples (string, offsets) """ Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args: Args:
sequence: Union[List[str], List[Tuple[str, Tuple[int, int]]]]: sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a list of strings, or a list of tuples (string, offsets) where offset Either a TokenizedSequence or a TokenizedSequenceWithOffsets
is a tuple (int, int)
type_id: int: type_id: int:
The type id of the given sequence The type id of the given sequence
@ -35,6 +43,33 @@ class Model:
An Encoding An Encoding
""" """
pass pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model): class BPE(Model):
""" BytePairEncoding model class """ """ BytePairEncoding model class """