Python - Add Model.encode_batch and improve typings

This commit is contained in:
Anthony MOI
2020-03-24 15:54:51 -04:00
parent 2f310f3c25
commit eec74ca3e6
5 changed files with 162 additions and 45 deletions

View File

@ -575,6 +575,7 @@ name = "tokenizers-python"
version = "0.6.0"
dependencies = [
"pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"tokenizers 0.8.0",
]

View File

@ -8,6 +8,9 @@ edition = "2018"
name = "tokenizers"
crate-type = ["cdylib"]
[dependencies]
rayon = "1.2.0"
[dependencies.pyo3]
version = "0.8.4"
features = ["extension-module"]

View File

@ -6,42 +6,22 @@ use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use rayon::prelude::*;
use std::path::Path;
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass]
pub struct Model {
pub model: Container<dyn tk::tokenizer::Model + Sync>,
/// Pre-tokenized input as extracted from a Python list: each entry is a token
/// string paired with its (start, end) offsets in the original string.
struct EncodeInput {
    sequence: Vec<(String, (usize, usize))>,
}
impl EncodeInput {
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
self.sequence
}
}
#[pymethods]
impl Model {
#[new]
fn new(_obj: &PyRawObject) -> PyResult<()> {
Err(exceptions::Exception::py_err(
"Cannot create a Model directly. Use a concrete subclass",
))
}
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
let saved: PyResult<Vec<_>> = ToPyResult(
self.model
.execute(|model| model.save(Path::new(folder), name)),
)
.into();
Ok(saved?
.into_iter()
.map(|path| path.to_string_lossy().into_owned())
.collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: &PyList, type_id: u32) -> PyResult<Encoding> {
if sequence.is_empty() {
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
}
impl<'source> FromPyObject<'source> for EncodeInput {
fn extract(ob: &'source PyAny) -> PyResult<Self> {
let sequence: &PyList = ob.downcast_ref()?;
enum Mode {
NoOffsets,
@ -90,6 +70,47 @@ impl Model {
})
.collect::<Result<Vec<_>, PyErr>>()?;
Ok(EncodeInput { sequence })
}
}
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass]
pub struct Model {
    // NOTE(review): `Container` is a project-local wrapper around the trait
    // object; presumably it allows the model to be handed off to a Tokenizer
    // later — confirm against utils::Container.
    pub model: Container<dyn tk::tokenizer::Model + Sync>,
}
#[pymethods]
impl Model {
#[new]
fn new(_obj: &PyRawObject) -> PyResult<()> {
    // Model is abstract on the Python side: direct construction always raises;
    // users must instantiate a concrete subclass instead.
    Err(exceptions::Exception::py_err(
        "Cannot create a Model directly. Use a concrete subclass",
    ))
}
/// Save the model's files into `folder`, optionally prefixing them with
/// `name`. Returns the list of written file paths as strings.
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
    // ToPyResult converts the tokenizer-side error into a Python exception.
    let saved: PyResult<Vec<_>> = ToPyResult(
        self.model
            .execute(|model| model.save(Path::new(folder), name)),
    )
    .into();
    // PathBufs are converted lossily to UTF-8 strings for the Python caller.
    Ok(saved?
        .into_iter()
        .map(|path| path.to_string_lossy().into_owned())
        .collect())
}
#[args(type_id = 0)]
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
let sequence = sequence.into_input();
if sequence.is_empty() {
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
}
ToPyResult(self.model.execute(|model| {
model
.tokenize(sequence)
@ -97,6 +118,26 @@ impl Model {
}))
.into()
}
/// Encode a batch of pre-tokenized sequences, producing one `Encoding` per
/// input sequence. Tokenization of the sequences runs in parallel via rayon.
#[args(type_id = 0)]
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
    ToPyResult(self.model.execute(|model| {
        sequences
            // rayon parallel iterator: each sequence is tokenized independently.
            .into_par_iter()
            .map(|sequence| {
                let sequence = sequence.into_input();
                if sequence.is_empty() {
                    // Same behavior as `encode`: an empty input yields an empty
                    // default Encoding rather than an error.
                    Ok(Encoding::new(tk::tokenizer::Encoding::default()))
                } else {
                    model.tokenize(sequence).map(|tokens| {
                        Encoding::new(tk::tokenizer::Encoding::from_tokens(tokens, type_id))
                    })
                }
            })
            // Collecting into Result short-circuits on the first tokenization error.
            .collect::<Result<_, _>>()
    }))
    .into()
}
}
/// BPE Model

View File

@ -1,4 +1,5 @@
from .. import Tokenizer, Encoding
from tokenizers import Tokenizer, Encoding
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
from typing import List, Union, Tuple, Optional
@ -139,15 +140,22 @@ class BaseTokenizer:
return self._tokenizer.normalize(sequence)
def encode_tokenized(
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given tokenized sequence. Let us skip the Normalizer and PreTokenizer
by providing already tokenized substrings.
""" Encode the given sequence. Let us skip the Normalizer and PreTokenizer by providing
already tokenized substrings.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[List[str], List[Tuple[str, Offsets]]]:
Either a list of strings, or a list of tuples (string, offsets) where offset
is a tuple (int, int)
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
@ -157,6 +165,35 @@ class BaseTokenizer:
"""
return self._tokenizer.model.encode(sequence)
def encode_tokenized_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequences. Let us skip the Normalizer and PreTokenizer by
providing already tokenized substrings.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
return self._tokenizer.model.encode_batch(sequences)
def encode(
self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
) -> Encoding:

View File

@ -1,7 +1,8 @@
from .. import Encoding
from .. import Encoding, Offsets
from typing import List, Optional, Union, Tuple
Offsets = Tuple[int, int]
TokenizedSequence = List[str]
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
class Model:
""" Base class for all models
@ -19,14 +20,21 @@ class Model:
"""
pass
def encode(
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
) -> Encoding:
""" Encode the given list of string or tuples (string, offsets)
""" Encode the given sequence.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequence: Union[List[str], List[Tuple[str, Tuple[int, int]]]]:
Either a list of strings, or a list of tuples (string, offsets) where offset
is a tuple (int, int)
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
@ -35,6 +43,33 @@ class Model:
An Encoding
"""
pass
def encode_batch(
self,
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
type_id: int = 0,
) -> List[Encoding]:
""" Encode the given batch of sequences.
A sequence can either be:
- `TokenizedSequence`: (`List[str]`)
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
a Tuple[int, int].
If the Offsets are not provided, they will be automatically generated, making the hypothesis
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
Args:
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
A list of sequences. Each sequence is either a TokenizedSequence or a
TokenizedSequenceWithOffsets
type_id: int:
The type id of the given sequence
Returns:
A list of Encoding
"""
pass
class BPE(Model):
""" BytePairEncoding model class """