Python - Add Model.encode_batch and improve typings

bindings/python/Cargo.lock (generated)

@@ -575,6 +575,7 @@ name = "tokenizers-python"
 version = "0.6.0"
 dependencies = [
  "pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "tokenizers 0.8.0",
 ]
 
bindings/python/Cargo.toml

@@ -8,6 +8,9 @@ edition = "2018"
 name = "tokenizers"
 crate-type = ["cdylib"]
 
+[dependencies]
+rayon = "1.2.0"
+
 [dependencies.pyo3]
 version = "0.8.4"
 features = ["extension-module"]
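
The rayon dependency added above backs the parallel batch encoding introduced below. As a rough sketch of what this buys at the Python level once the extension is rebuilt (BPE.from_files and the vocab/merges files are assumptions for illustration, not part of this diff):

    import time
    from tokenizers.models import BPE

    model = BPE.from_files("vocab.json", "merges.txt")  # assumed constructor and files
    batch = [["some", "pre", "tokenized", "words"]] * 10_000

    t0 = time.perf_counter()
    serial = [model.encode(seq) for seq in batch]   # one Python->Rust call per sequence
    t1 = time.perf_counter()
    parallel = model.encode_batch(batch)            # one call, rayon threads inside
    t2 = time.perf_counter()
    print(f"encode loop: {t1 - t0:.3f}s, encode_batch: {t2 - t1:.3f}s")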
bindings/python/src/models.rs

@@ -6,42 +6,22 @@ use super::utils::Container;
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;
+use rayon::prelude::*;
 use std::path::Path;
 
-/// A Model represents some tokenization algorithm like BPE or Word
-/// This class cannot be constructed directly. Please use one of the concrete models.
 #[pyclass]
-pub struct Model {
-    pub model: Container<dyn tk::tokenizer::Model + Sync>,
+struct EncodeInput {
+    sequence: Vec<(String, (usize, usize))>,
+}
+impl EncodeInput {
+    pub fn into_input(self) -> Vec<(String, (usize, usize))> {
+        self.sequence
+    }
 }
 
-#[pymethods]
-impl Model {
-    #[new]
-    fn new(_obj: &PyRawObject) -> PyResult<()> {
-        Err(exceptions::Exception::py_err(
-            "Cannot create a Model directly. Use a concrete subclass",
-        ))
-    }
-
-    fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
-        let saved: PyResult<Vec<_>> = ToPyResult(
-            self.model
-                .execute(|model| model.save(Path::new(folder), name)),
-        )
-        .into();
-
-        Ok(saved?
-            .into_iter()
-            .map(|path| path.to_string_lossy().into_owned())
-            .collect())
-    }
-
-    #[args(type_id = 0)]
-    fn encode(&self, sequence: &PyList, type_id: u32) -> PyResult<Encoding> {
-        if sequence.is_empty() {
-            return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
-        }
+impl<'source> FromPyObject<'source> for EncodeInput {
+    fn extract(ob: &'source PyAny) -> PyResult<Self> {
+        let sequence: &PyList = ob.downcast_ref()?;
 
         enum Mode {
             NoOffsets,

@@ -90,6 +70,47 @@ impl Model
             })
             .collect::<Result<Vec<_>, PyErr>>()?;
 
+        Ok(EncodeInput { sequence })
+    }
+}
+
+/// A Model represents some tokenization algorithm like BPE or Word
+/// This class cannot be constructed directly. Please use one of the concrete models.
+#[pyclass]
+pub struct Model {
+    pub model: Container<dyn tk::tokenizer::Model + Sync>,
+}
+
+#[pymethods]
+impl Model {
+    #[new]
+    fn new(_obj: &PyRawObject) -> PyResult<()> {
+        Err(exceptions::Exception::py_err(
+            "Cannot create a Model directly. Use a concrete subclass",
+        ))
+    }
+
+    fn save(&self, folder: &str, name: Option<&str>) -> PyResult<Vec<String>> {
+        let saved: PyResult<Vec<_>> = ToPyResult(
+            self.model
+                .execute(|model| model.save(Path::new(folder), name)),
+        )
+        .into();
+
+        Ok(saved?
+            .into_iter()
+            .map(|path| path.to_string_lossy().into_owned())
+            .collect())
+    }
+
+    #[args(type_id = 0)]
+    fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
+        let sequence = sequence.into_input();
+
+        if sequence.is_empty() {
+            return Ok(Encoding::new(tk::tokenizer::Encoding::default()));
+        }
+
         ToPyResult(self.model.execute(|model| {
             model
                 .tokenize(sequence)

@@ -97,6 +118,26 @@ impl Model
         }))
         .into()
     }
+
+    #[args(type_id = 0)]
+    fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
+        ToPyResult(self.model.execute(|model| {
+            sequences
+                .into_par_iter()
+                .map(|sequence| {
+                    let sequence = sequence.into_input();
+                    if sequence.is_empty() {
+                        Ok(Encoding::new(tk::tokenizer::Encoding::default()))
+                    } else {
+                        model.tokenize(sequence).map(|tokens| {
+                            Encoding::new(tk::tokenizer::Encoding::from_tokens(tokens, type_id))
+                        })
+                    }
+                })
+                .collect::<Result<_, _>>()
+        }))
+        .into()
+    }
 }
 
 /// BPE Model
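
For reference, a minimal sketch of how this binding is meant to be driven from Python once built. Each batch element is one pre-tokenized sequence, in either of the two shapes the EncodeInput extraction above accepts; BPE.from_files and the file names are assumptions for illustration:

    from tokenizers.models import BPE

    model = BPE.from_files("vocab.json", "merges.txt")  # assumed constructor and files

    batch = [
        ["Hello", "world"],                        # List[str]: offsets are generated
        [("Good", (0, 4)), ("morning", (5, 12))],  # List[Tuple[str, (start, end)]]
    ]

    # Runs in parallel on the Rust side; empty sequences yield empty Encodings.
    encodings = model.encode_batch(batch, type_id=0)
    print([enc.tokens for enc in encodings])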
bindings/python/tokenizers/implementations/base_tokenizer.py

@@ -1,4 +1,5 @@
-from .. import Tokenizer, Encoding
+from tokenizers import Tokenizer, Encoding
+from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
 
 from typing import List, Union, Tuple, Optional
 

@@ -139,15 +140,22 @@ class BaseTokenizer:
         return self._tokenizer.normalize(sequence)
 
     def encode_tokenized(
-        self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0
+        self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
     ) -> Encoding:
-        """ Encode the given tokenized sequence. Let us skip the Normalizer and PreTokenizer
-        by providing already tokenized substrings.
+        """ Encode the given sequence. Let us skip the Normalizer and PreTokenizer by providing
+        already tokenized substrings.
 
+        A sequence can either be:
+            - `TokenizedSequence`: (`List[str]`)
+            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
+            a Tuple[int, int].
+
+        If the Offsets are not provided, they will be automatically generated, making the hypothesis
+        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
+
         Args:
-            sequence: Union[List[str], List[Tuple[str, Offsets]]]:
-                Either a list of strings, or a list of tuples (string, offsets) where offset
-                is a tuple (int, int)
+            sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
+                Either a TokenizedSequence or a TokenizedSequenceWithOffsets
 
             type_id: int:
                 The type id of the given sequence

@@ -157,6 +165,35 @@ class BaseTokenizer:
         """
         return self._tokenizer.model.encode(sequence)
 
+    def encode_tokenized_batch(
+        self,
+        sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
+        type_id: int = 0,
+    ) -> List[Encoding]:
+        """ Encode the given batch of sequences. Let us skip the Normalizer and PreTokenizer by
+        providing already tokenized substrings.
+
+        A sequence can either be:
+            - `TokenizedSequence`: (`List[str]`)
+            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
+            a Tuple[int, int].
+
+        If the Offsets are not provided, they will be automatically generated, making the hypothesis
+        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
+
+        Args:
+            sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
+                A list of sequences. Each sequence is either a TokenizedSequence or a
+                TokenizedSequenceWithOffsets
+
+            type_id: int:
+                The type id of the given sequences
+
+        Returns:
+            A list of Encoding
+        """
+        return self._tokenizer.model.encode_batch(sequences)
+
     def encode(
         self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
     ) -> Encoding:
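
A short usage sketch of the new high-level method. BPETokenizer stands in for any concrete BaseTokenizer subclass, and the vocab/merges files are assumed to exist; none of this is part of the diff itself:

    from tokenizers.implementations import BPETokenizer  # any concrete subclass works

    tokenizer = BPETokenizer("vocab.json", "merges.txt")  # assumed files

    batch = [
        ["my", "name", "is", "john"],        # TokenizedSequence
        [("my", (0, 2)), ("name", (3, 7))],  # TokenizedSequenceWithOffsets
    ]

    # Skips the Normalizer and PreTokenizer and encodes the whole batch in one call.
    encodings = tokenizer.encode_tokenized_batch(batch, type_id=0)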
bindings/python/tokenizers/models/__init__.pyi

@@ -1,7 +1,8 @@
-from .. import Encoding
+from .. import Encoding, Offsets
 from typing import List, Optional, Union, Tuple
 
-Offsets = Tuple[int, int]
+TokenizedSequence = List[str]
+TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
 
 class Model:
     """ Base class for all models

@@ -19,14 +20,21 @@ class Model:
     """
     pass
     def encode(
-        self, sequence: Union[List[str], List[Tuple[str, Offsets]]], type_id: int = 0
+        self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
     ) -> Encoding:
-        """ Encode the given list of string or tuples (string, offsets)
+        """ Encode the given sequence.
 
+        A sequence can either be:
+            - `TokenizedSequence`: (`List[str]`)
+            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
+            a Tuple[int, int].
+
+        If the Offsets are not provided, they will be automatically generated, making the hypothesis
+        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
+
         Args:
-            sequence: Union[List[str], List[Tuple[str, Tuple[int, int]]]]:
-                Either a list of strings, or a list of tuples (string, offsets) where offset
-                is a tuple (int, int)
+            sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
+                Either a TokenizedSequence or a TokenizedSequenceWithOffsets
 
             type_id: int:
                 The type id of the given sequence

@@ -35,6 +43,33 @@ class Model:
             An Encoding
         """
         pass
+    def encode_batch(
+        self,
+        sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
+        type_id: int = 0,
+    ) -> List[Encoding]:
+        """ Encode the given batch of sequences.
+
+        A sequence can either be:
+            - `TokenizedSequence`: (`List[str]`)
+            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
+            a Tuple[int, int].
+
+        If the Offsets are not provided, they will be automatically generated, making the hypothesis
+        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
+
+        Args:
+            sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
+                A list of sequences. Each sequence is either a TokenizedSequence or a
+                TokenizedSequenceWithOffsets
+
+            type_id: int:
+                The type id of the given sequences
+
+        Returns:
+            A list of Encoding
+        """
+        pass
 
 class BPE(Model):
     """ BytePairEncoding model class """
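
To make the new stub aliases concrete, a tiny self-contained example (the aliases are re-declared locally here, mirroring the stub above):

    from typing import List, Tuple

    Offsets = Tuple[int, int]
    TokenizedSequence = List[str]
    TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]

    plain: TokenizedSequence = ["Hello", "world"]
    with_offsets: TokenizedSequenceWithOffsets = [("Hello", (0, 5)), ("world", (6, 11))]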