mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Python - Add Model.encode_batch and improve typings
This commit is contained in:
1
bindings/python/Cargo.lock
generated
1
bindings/python/Cargo.lock
generated
@ -575,6 +575,7 @@ name = "tokenizers-python"
|
||||
version = "0.6.0"
|
||||
dependencies = [
|
||||
"pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"tokenizers 0.8.0",
|
||||
]
|
||||
|
||||
|
@ -8,6 +8,9 @@ edition = "2018"
|
||||
name = "tokenizers"
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
rayon = "1.2.0"
|
||||
|
||||
[dependencies.pyo3]
|
||||
version = "0.8.4"
|
||||
features = ["extension-module"]
|
||||
|
@ -6,42 +6,22 @@ use super::utils::Container;
|
||||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::*;
|
||||
use rayon::prelude::*;
|
||||
use std::path::Path;
|
||||
|
||||
/// A Model represents some tokenization algorithm like BPE or Word
|
||||
/// This class cannot be constructed directly. Please use one of the concrete models.
|
||||
#[pyclass]
|
||||
pub struct Model {
|
||||
pub model: Container<dyn tk::tokenizer::Model + Sync>,
|
||||
struct EncodeInput {
|
||||
sequence: Vec<(String, (usize, usize))>,
|
||||
}
|
||||
impl EncodeInput {
|
||||
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
|
||||
self.sequence
|
||||
}
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Model {
|
||||
#[new]
|
||||
fn new(_obj: &PyRawObject) -> PyResult<()> {
|
||||
Err(exceptions::Exception::py_err(
|
||||
"Cannot create a Model directly. Use a concrete subclass",
|
||||
))
|
||||
}
|
||||
|
||||
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
|
||||
let saved: PyResult<Vec<_>> = ToPyResult(
|
||||
self.model
|
||||
.execute(|model| model.save(Path::new(folder), name)),
|
||||
)
|
||||
.into();
|
||||
|
||||
Ok(saved?
|
||||
.into_iter()
|
||||
.map(|path| path.to_string_lossy().into_owned())
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[args(type_id = 0)]
|
||||
fn encode(&self, sequence: &PyList, type_id: u32) -> PyResult<Encoding> {
|
||||
if sequence.is_empty() {
|
||||
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
|
||||
}
|
||||
impl<'source> FromPyObject<'source> for EncodeInput {
|
||||
fn extract(ob: &'source PyAny) -> PyResult<Self> {
|
||||
let sequence: &PyList = ob.downcast_ref()?;
|
||||
|
||||
enum Mode {
|
||||
NoOffsets,
|
||||
@ -90,6 +70,47 @@ impl Model {
|
||||
})
|
||||
.collect::<Result<Vec<_>, PyErr>>()?;
|
||||
|
||||
Ok(EncodeInput { sequence })
|
||||
}
|
||||
}
|
||||
|
||||
/// A Model represents some tokenization algorithm like BPE or Word
|
||||
/// This class cannot be constructed directly. Please use one of the concrete models.
|
||||
#[pyclass]
|
||||
pub struct Model {
|
||||
pub model: Container<dyn tk::tokenizer::Model + Sync>,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl Model {
|
||||
#[new]
|
||||
fn new(_obj: &PyRawObject) -> PyResult<()> {
|
||||
Err(exceptions::Exception::py_err(
|
||||
"Cannot create a Model directly. Use a concrete subclass",
|
||||
))
|
||||
}
|
||||
|
||||
fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
|
||||
let saved: PyResult<Vec<_>> = ToPyResult(
|
||||
self.model
|
||||
.execute(|model| model.save(Path::new(folder), name)),
|
||||
)
|
||||
.into();
|
||||
|
||||
Ok(saved?
|
||||
.into_iter()
|
||||
.map(|path| path.to_string_lossy().into_owned())
|
||||
.collect())
|
||||
}
|
||||
|
||||
#[args(type_id = 0)]
|
||||
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
|
||||
let sequence = sequence.into_input();
|
||||
|
||||
if sequence.is_empty() {
|
||||
return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
|
||||
}
|
||||
|
||||
ToPyResult(self.model.execute(|model| {
|
||||
model
|
||||
.tokenize(sequence)
|
||||
@ -97,6 +118,26 @@ impl Model {
|
||||
}))
|
||||
.into()
|
||||
}
|
||||
|
||||
#[args(type_id = 0)]
|
||||
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
|
||||
ToPyResult(self.model.execute(|model| {
|
||||
sequences
|
||||
.into_par_iter()
|
||||
.map(|sequence| {
|
||||
let sequence = sequence.into_input();
|
||||
if sequence.is_empty() {
|
||||
Ok(Encoding::new(tk::tokenizer::Encoding::default()))
|
||||
} else {
|
||||
model.tokenize(sequence).map(|tokens| {
|
||||
Encoding::new(tk::tokenizer::Encoding::from_tokens(tokens, type_id))
|
||||
})
|
||||
}
|
||||
})
|
||||
.collect::<Result<_, _>>()
|
||||
}))
|
||||
.into()
|
||||
}
|
||||
}
|
||||
|
||||
/// BPE Model
|
||||
|
@ -1,4 +1,5 @@
|
||||
from .. import Tokenizer, Encoding
|
||||
from tokenizers import Tokenizer, Encoding
|
||||
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
|
||||
|
||||
from typing import List, Union, Tuple, Optional
|
||||
|
||||
@ -139,15 +140,22 @@ class BaseTokenizer:
|
||||
return self._tokenizer.normalize(sequence)
|
||||
|
||||
def encode_tokenized(
|
||||
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0
|
||||
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
|
||||
) -> Encoding:
|
||||
""" Encode the given tokenized sequence. Let us skip the Normalizer and PreTokenizer
|
||||
by providing already tokenized substrings.
|
||||
""" Encode the given sequence. Let us skip the Normalizer and PreTokenizer by providing
|
||||
already tokenized substrings.
|
||||
|
||||
A sequence can either be:
|
||||
- `TokenizedSequence`: (`List[str]`)
|
||||
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
|
||||
a Tuple[int, int].
|
||||
|
||||
If the Offsets are not provided, they will be automatically generated, making the hypothesis
|
||||
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
|
||||
|
||||
Args:
|
||||
sequence: Union[List[str], List[Tuple[str, Offsets]]]:
|
||||
Either a list of strings, or a list of tuples (string, offsets) where offset
|
||||
is a tuple (int, int)
|
||||
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
|
||||
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
|
||||
|
||||
type_id: int:
|
||||
The type id of the given sequence
|
||||
@ -157,6 +165,35 @@ class BaseTokenizer:
|
||||
"""
|
||||
return self._tokenizer.model.encode(sequence)
|
||||
|
||||
def encode_tokenized_batch(
|
||||
self,
|
||||
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
|
||||
type_id: int = 0,
|
||||
) -> List[Encoding]:
|
||||
""" Encode the given batch of sequences. Let us skip the Normalizer and PreTokenizer by
|
||||
providing already tokenized substrings.
|
||||
|
||||
A sequence can either be:
|
||||
- `TokenizedSequence`: (`List[str]`)
|
||||
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
|
||||
a Tuple[int, int].
|
||||
|
||||
If the Offsets are not provided, they will be automatically generated, making the hypothesis
|
||||
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
|
||||
|
||||
Args:
|
||||
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
|
||||
A list of sequences. Each sequence is either a TokenizedSequence or a
|
||||
TokenizedSequenceWithOffsets
|
||||
|
||||
type_id: int:
|
||||
The type id of the given sequence
|
||||
|
||||
Returns:
|
||||
A list of Encoding
|
||||
"""
|
||||
return self._tokenizer.model.encode_batch(sequences)
|
||||
|
||||
def encode(
|
||||
self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
|
||||
) -> Encoding:
|
||||
|
@ -1,7 +1,8 @@
|
||||
from .. import Encoding
|
||||
from .. import Encoding, Offsets
|
||||
from typing import List, Optional, Union, Tuple
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
TokenizedSequence = List[str]
|
||||
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
|
||||
|
||||
class Model:
|
||||
""" Base class for all models
|
||||
@ -19,14 +20,21 @@ class Model:
|
||||
"""
|
||||
pass
|
||||
def encode(
|
||||
self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0
|
||||
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
|
||||
) -> Encoding:
|
||||
""" Encode the given list of string or tuples (string, offsets)
|
||||
""" Encode the given sequence.
|
||||
|
||||
A sequence can either be:
|
||||
- `TokenizedSequence`: (`List[str]`)
|
||||
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
|
||||
a Tuple[int, int].
|
||||
|
||||
If the Offsets are not provided, they will be automatically generated, making the hypothesis
|
||||
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
|
||||
|
||||
Args:
|
||||
sequence: Union[List[str], List[Tuple[str, Tuple[int, int]]]]:
|
||||
Either a list of strings, or a list of tuples (string, offsets) where offset
|
||||
is a tuple (int, int)
|
||||
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
|
||||
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
|
||||
|
||||
type_id: int:
|
||||
The type id of the given sequence
|
||||
@ -35,6 +43,33 @@ class Model:
|
||||
An Encoding
|
||||
"""
|
||||
pass
|
||||
def encode_batch(
|
||||
self,
|
||||
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
|
||||
type_id: int = 0,
|
||||
) -> List[Encoding]:
|
||||
""" Encode the given batch of sequences.
|
||||
|
||||
A sequence can either be:
|
||||
- `TokenizedSequence`: (`List[str]`)
|
||||
- `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
|
||||
a Tuple[int, int].
|
||||
|
||||
If the Offsets are not provided, they will be automatically generated, making the hypothesis
|
||||
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
|
||||
|
||||
Args:
|
||||
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
|
||||
A list of sequences. Each sequence is either a TokenizedSequence or a
|
||||
TokenizedSequenceWithOffsets
|
||||
|
||||
type_id: int:
|
||||
The type id of the given sequence
|
||||
|
||||
Returns:
|
||||
A list of Encoding
|
||||
"""
|
||||
pass
|
||||
|
||||
class BPE(Model):
|
||||
""" BytePairEncoding model class """
|
||||
|
Reference in New Issue
Block a user