mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-31 04:29:21 +00:00
Update Python bindings with new interface
This commit is contained in:
12
bindings/node/native/Cargo.lock
generated
12
bindings/node/native/Cargo.lock
generated
@ -281,6 +281,15 @@ dependencies = [
|
|||||||
"either",
|
"either",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "itertools"
|
||||||
|
version = "0.9.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
|
||||||
|
dependencies = [
|
||||||
|
"either",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "itoa"
|
name = "itoa"
|
||||||
version = "0.4.5"
|
version = "0.4.5"
|
||||||
@ -584,7 +593,7 @@ version = "0.1.0"
|
|||||||
source = "git+https://github.com/n1t0/rayon-cond#c56e4f1ded0fcb92eac70e0533703bba3ca2983f"
|
source = "git+https://github.com/n1t0/rayon-cond#c56e4f1ded0fcb92eac70e0533703bba3ca2983f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"either",
|
"either",
|
||||||
"itertools",
|
"itertools 0.8.2",
|
||||||
"rayon",
|
"rayon",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -749,6 +758,7 @@ version = "0.10.1"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"clap",
|
"clap",
|
||||||
"indicatif",
|
"indicatif",
|
||||||
|
"itertools 0.9.0",
|
||||||
"lazy_static",
|
"lazy_static",
|
||||||
"onig",
|
"onig",
|
||||||
"rand",
|
"rand",
|
||||||
|
@ -1,5 +1,4 @@
|
|||||||
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
|
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
|
||||||
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
|
|
||||||
|
|
||||||
from typing import List, Union, Tuple, Optional, Dict
|
from typing import List, Union, Tuple, Optional, Dict
|
||||||
|
|
||||||
|
@ -2,9 +2,6 @@ from typing import List, Tuple
|
|||||||
|
|
||||||
from .. import models, Offsets
|
from .. import models, Offsets
|
||||||
|
|
||||||
TokenizedSequence = List[str]
|
|
||||||
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
|
|
||||||
|
|
||||||
Model = models.Model
|
Model = models.Model
|
||||||
BPE = models.BPE
|
BPE = models.BPE
|
||||||
WordPiece = models.WordPiece
|
WordPiece = models.WordPiece
|
||||||
|
@ -1,9 +1,6 @@
|
|||||||
from .. import Encoding, Offsets
|
from .. import Encoding, Offsets
|
||||||
from typing import List, Optional, Union, Tuple
|
from typing import List, Optional, Union, Tuple
|
||||||
|
|
||||||
TokenizedSequence = List[str]
|
|
||||||
TokenizedSequenceWithOffsets = List[Tuple[str, Offsets]]
|
|
||||||
|
|
||||||
class Model:
|
class Model:
|
||||||
""" Base class for all models
|
""" Base class for all models
|
||||||
|
|
||||||
@ -19,57 +16,6 @@ class Model:
|
|||||||
Any file with the same name that already exist in this folder will be overwritten.
|
Any file with the same name that already exist in this folder will be overwritten.
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
def encode(
|
|
||||||
self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
|
|
||||||
) -> Encoding:
|
|
||||||
""" Encode the given sequence.
|
|
||||||
|
|
||||||
A sequence can either be:
|
|
||||||
- `TokenizedSequence`: (`List[str]`)
|
|
||||||
- `TokenizedSequenceWithOffsets: (`List[Tuple[str, Offsets]]`) where Offsets is
|
|
||||||
a Tuple[int, int].
|
|
||||||
|
|
||||||
If the Offsets are not provided, they will be automatically generated, making the hypothesis
|
|
||||||
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
|
|
||||||
Either a TokenizedSequence or a TokenizedSequenceWithOffsets
|
|
||||||
|
|
||||||
type_id: int:
|
|
||||||
The type id of the given sequence
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
An Encoding
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
def encode_batch(
|
|
||||||
self,
|
|
||||||
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
|
|
||||||
type_id: int = 0,
|
|
||||||
) -> List[Encoding]:
|
|
||||||
""" Encode the given batch of sequence.
|
|
||||||
|
|
||||||
A sequence can either be:
|
|
||||||
- `TokenizedSequence`: (`List[str]`)
|
|
||||||
- `TokenizedSequenceWithOffsets: (`List[Tuple[str, Offsets]]`) where Offsets is
|
|
||||||
a Tuple[int, int].
|
|
||||||
|
|
||||||
If the Offsets are not provided, they will be automatically generated, making the hypothesis
|
|
||||||
that all the tokens in the `TokenizedSequence` are contiguous in the original string.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
|
|
||||||
A list of sequence. Each sequence is either a TokenizedSequence or a
|
|
||||||
TokenizedSequenceWithOffsets
|
|
||||||
|
|
||||||
type_id: int:
|
|
||||||
The type if of the given sequence
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A list of Encoding
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
class BPE(Model):
|
class BPE(Model):
|
||||||
"""BytePairEncoding model class
|
"""BytePairEncoding model class
|
||||||
|
@ -75,11 +75,7 @@ impl Encoding {
|
|||||||
#[args(growing_offsets = true)]
|
#[args(growing_offsets = true)]
|
||||||
fn merge(encodings: Vec<PyRef<Encoding>>, growing_offsets: bool) -> Encoding {
|
fn merge(encodings: Vec<PyRef<Encoding>>, growing_offsets: bool) -> Encoding {
|
||||||
tk::tokenizer::Encoding::merge(
|
tk::tokenizer::Encoding::merge(
|
||||||
encodings
|
encodings.into_iter().map(|e| e.encoding.clone()),
|
||||||
.into_iter()
|
|
||||||
.map(|e| e.encoding.clone())
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
.as_slice(),
|
|
||||||
growing_offsets,
|
growing_offsets,
|
||||||
)
|
)
|
||||||
.into()
|
.into()
|
||||||
|
@ -27,3 +27,8 @@ impl<T> std::convert::Into<PyResult<T>> for ToPyResult<T> {
|
|||||||
.map_err(|e| exceptions::Exception::py_err(format!("{}", e)))
|
.map_err(|e| exceptions::Exception::py_err(format!("{}", e)))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
impl<T> ToPyResult<T> {
|
||||||
|
pub fn into_py(self) -> PyResult<T> {
|
||||||
|
self.into()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1,78 +1,11 @@
|
|||||||
extern crate tokenizers as tk;
|
extern crate tokenizers as tk;
|
||||||
|
|
||||||
use super::encoding::Encoding;
|
|
||||||
use super::error::ToPyResult;
|
use super::error::ToPyResult;
|
||||||
use super::utils::Container;
|
use super::utils::Container;
|
||||||
use pyo3::exceptions;
|
use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::*;
|
use pyo3::types::*;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use tk::parallelism::*;
|
|
||||||
|
|
||||||
#[pyclass]
|
|
||||||
struct EncodeInput {
|
|
||||||
sequence: Vec<(String, (usize, usize))>,
|
|
||||||
}
|
|
||||||
impl EncodeInput {
|
|
||||||
pub fn into_input(self) -> Vec<(String, (usize, usize))> {
|
|
||||||
self.sequence
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'source> FromPyObject<'source> for EncodeInput {
|
|
||||||
fn extract(ob: &'source PyAny) -> PyResult<Self> {
|
|
||||||
let sequence: &PyList = ob.downcast()?;
|
|
||||||
|
|
||||||
enum Mode {
|
|
||||||
NoOffsets,
|
|
||||||
Offsets,
|
|
||||||
};
|
|
||||||
let mode = sequence
|
|
||||||
.iter()
|
|
||||||
.next()
|
|
||||||
.map(|item| {
|
|
||||||
if item.extract::<String>().is_ok() {
|
|
||||||
Ok(Mode::NoOffsets)
|
|
||||||
} else if item.extract::<(String, (usize, usize))>().is_ok() {
|
|
||||||
Ok(Mode::Offsets)
|
|
||||||
} else {
|
|
||||||
Err(exceptions::ValueError::py_err(
|
|
||||||
"Input must be a list[str] or list[(str, (int, int))]",
|
|
||||||
))
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.unwrap()?;
|
|
||||||
|
|
||||||
let mut total_len = 0;
|
|
||||||
let sequence = sequence
|
|
||||||
.iter()
|
|
||||||
.enumerate()
|
|
||||||
.map(|(i, item)| match mode {
|
|
||||||
Mode::NoOffsets => item
|
|
||||||
.extract::<String>()
|
|
||||||
.map_err(|_| {
|
|
||||||
exceptions::ValueError::py_err(format!(
|
|
||||||
"Value at index {} should be a `str`",
|
|
||||||
i
|
|
||||||
))
|
|
||||||
})
|
|
||||||
.map(|s| {
|
|
||||||
let len = s.chars().count();
|
|
||||||
total_len += len;
|
|
||||||
(s, (total_len - len, total_len))
|
|
||||||
}),
|
|
||||||
Mode::Offsets => item.extract::<(String, (usize, usize))>().map_err(|_| {
|
|
||||||
exceptions::ValueError::py_err(format!(
|
|
||||||
"Value at index {} should be a `(str, (int, int))`",
|
|
||||||
i
|
|
||||||
))
|
|
||||||
}),
|
|
||||||
})
|
|
||||||
.collect::<Result<Vec<_>, PyErr>>()?;
|
|
||||||
|
|
||||||
Ok(EncodeInput { sequence })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A Model represents some tokenization algorithm like BPE or Word
|
/// A Model represents some tokenization algorithm like BPE or Word
|
||||||
/// This class cannot be constructed directly. Please use one of the concrete models.
|
/// This class cannot be constructed directly. Please use one of the concrete models.
|
||||||
@ -133,42 +66,6 @@ impl Model {
|
|||||||
.map(|path| path.to_string_lossy().into_owned())
|
.map(|path| path.to_string_lossy().into_owned())
|
||||||
.collect())
|
.collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
#[args(type_id = 0)]
|
|
||||||
fn encode(&self, sequence: EncodeInput, type_id: u32) -> PyResult<Encoding> {
|
|
||||||
let sequence = sequence.into_input();
|
|
||||||
|
|
||||||
if sequence.is_empty() {
|
|
||||||
return Ok(tk::tokenizer::Encoding::default().into());
|
|
||||||
}
|
|
||||||
|
|
||||||
ToPyResult(self.model.execute(|model| {
|
|
||||||
model
|
|
||||||
.tokenize(sequence)
|
|
||||||
.map(|tokens| tk::tokenizer::Encoding::from_tokens(tokens, type_id).into())
|
|
||||||
}))
|
|
||||||
.into()
|
|
||||||
}
|
|
||||||
|
|
||||||
#[args(type_id = 0)]
|
|
||||||
fn encode_batch(&self, sequences: Vec<EncodeInput>, type_id: u32) -> PyResult<Vec<Encoding>> {
|
|
||||||
ToPyResult(self.model.execute(|model| {
|
|
||||||
sequences
|
|
||||||
.into_maybe_par_iter()
|
|
||||||
.map(|sequence| {
|
|
||||||
let sequence = sequence.into_input();
|
|
||||||
if sequence.is_empty() {
|
|
||||||
Ok(tk::tokenizer::Encoding::default().into())
|
|
||||||
} else {
|
|
||||||
model.tokenize(sequence).map(|tokens| {
|
|
||||||
tk::tokenizer::Encoding::from_tokens(tokens, type_id).into()
|
|
||||||
})
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect::<Result<_, _>>()
|
|
||||||
}))
|
|
||||||
.into()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// BPE Model
|
/// BPE Model
|
||||||
|
@ -1,12 +1,11 @@
|
|||||||
extern crate tokenizers as tk;
|
extern crate tokenizers as tk;
|
||||||
|
|
||||||
use super::error::{PyError, ToPyResult};
|
use super::error::ToPyResult;
|
||||||
use super::utils::Container;
|
use super::utils::Container;
|
||||||
use pyo3::exceptions;
|
use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::*;
|
use pyo3::types::*;
|
||||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
use tk::tokenizer::Offsets;
|
||||||
use tk::tokenizer::{Offsets, Result};
|
|
||||||
|
|
||||||
#[pyclass(dict, module = "tokenizers.pre_tokenizers")]
|
#[pyclass(dict, module = "tokenizers.pre_tokenizers")]
|
||||||
pub struct PreTokenizer {
|
pub struct PreTokenizer {
|
||||||
@ -14,13 +13,13 @@ pub struct PreTokenizer {
|
|||||||
}
|
}
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl PreTokenizer {
|
impl PreTokenizer {
|
||||||
#[staticmethod]
|
// #[staticmethod]
|
||||||
fn custom(pretok: PyObject) -> PyResult<Self> {
|
// fn custom(pretok: PyObject) -> PyResult<Self> {
|
||||||
let py_pretok = PyPreTokenizer::new(pretok)?;
|
// let py_pretok = PyPreTokenizer::new(pretok)?;
|
||||||
Ok(PreTokenizer {
|
// Ok(PreTokenizer {
|
||||||
pretok: Container::Owned(Box::new(py_pretok)),
|
// pretok: Container::Owned(Box::new(py_pretok)),
|
||||||
})
|
// })
|
||||||
}
|
// }
|
||||||
|
|
||||||
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
|
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
|
||||||
let data = self
|
let data = self
|
||||||
@ -52,13 +51,20 @@ impl PreTokenizer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
|
fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
|
||||||
// TODO: Expose the NormalizedString
|
// TODO: Expose the PreTokenizedString
|
||||||
let mut normalized = tk::tokenizer::NormalizedString::from(s);
|
let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s);
|
||||||
|
|
||||||
ToPyResult(
|
ToPyResult(
|
||||||
self.pretok
|
self.pretok
|
||||||
.execute(|pretok| pretok.pre_tokenize(&mut normalized)),
|
.execute(|pretok| pretok.pre_tokenize(&mut pretokenized)),
|
||||||
)
|
)
|
||||||
.into()
|
.into_py()?;
|
||||||
|
|
||||||
|
Ok(pretokenized
|
||||||
|
.get_normalized(true)
|
||||||
|
.into_iter()
|
||||||
|
.map(|(s, o)| (s.to_owned(), o))
|
||||||
|
.collect())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -108,7 +114,9 @@ impl Whitespace {
|
|||||||
Ok((
|
Ok((
|
||||||
Whitespace {},
|
Whitespace {},
|
||||||
PreTokenizer {
|
PreTokenizer {
|
||||||
pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace)),
|
pretok: Container::Owned(Box::new(
|
||||||
|
tk::pre_tokenizers::whitespace::Whitespace::default(),
|
||||||
|
)),
|
||||||
},
|
},
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
@ -209,64 +217,64 @@ impl Metaspace {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct PyPreTokenizer {
|
// struct PyPreTokenizer {
|
||||||
class: PyObject,
|
// class: PyObject,
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
impl PyPreTokenizer {
|
// impl PyPreTokenizer {
|
||||||
pub fn new(class: PyObject) -> PyResult<Self> {
|
// pub fn new(class: PyObject) -> PyResult<Self> {
|
||||||
Ok(PyPreTokenizer { class })
|
// Ok(PyPreTokenizer { class })
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
#[typetag::serde]
|
// #[typetag::serde]
|
||||||
impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
|
// impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
|
||||||
fn pre_tokenize(
|
// fn pre_tokenize(
|
||||||
&self,
|
// &self,
|
||||||
sentence: &mut tk::tokenizer::NormalizedString,
|
// sentence: &mut tk::tokenizer::NormalizedString,
|
||||||
) -> Result<Vec<(String, Offsets)>> {
|
// ) -> Result<Vec<(String, Offsets)>> {
|
||||||
let gil = Python::acquire_gil();
|
// let gil = Python::acquire_gil();
|
||||||
let py = gil.python();
|
// let py = gil.python();
|
||||||
|
//
|
||||||
let args = PyTuple::new(py, &[sentence.get()]);
|
// let args = PyTuple::new(py, &[sentence.get()]);
|
||||||
match self.class.call_method(py, "pre_tokenize", args, None) {
|
// match self.class.call_method(py, "pre_tokenize", args, None) {
|
||||||
Ok(res) => Ok(res
|
// Ok(res) => Ok(res
|
||||||
.cast_as::<PyList>(py)
|
// .cast_as::<PyList>(py)
|
||||||
.map_err(|_| {
|
// .map_err(|_| {
|
||||||
PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
|
// PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
|
||||||
})?
|
// })?
|
||||||
.extract::<Vec<(String, Offsets)>>()
|
// .extract::<Vec<(String, Offsets)>>()
|
||||||
.map_err(|_| {
|
// .map_err(|_| {
|
||||||
PyError::from(
|
// PyError::from(
|
||||||
"`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
|
// "`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
|
||||||
)
|
// )
|
||||||
})?),
|
// })?),
|
||||||
Err(e) => {
|
// Err(e) => {
|
||||||
e.print(py);
|
// e.print(py);
|
||||||
Err(Box::new(PyError::from(
|
// Err(Box::new(PyError::from(
|
||||||
"Error while calling `pre_tokenize`",
|
// "Error while calling `pre_tokenize`",
|
||||||
)))
|
// )))
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
impl Serialize for PyPreTokenizer {
|
// impl Serialize for PyPreTokenizer {
|
||||||
fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
|
// fn serialize<S>(&self, _serializer: S) -> std::result::Result<S::Ok, S::Error>
|
||||||
where
|
// where
|
||||||
S: Serializer,
|
// S: Serializer,
|
||||||
{
|
// {
|
||||||
Err(serde::ser::Error::custom(
|
// Err(serde::ser::Error::custom(
|
||||||
"Custom PyPreTokenizer cannot be serialized",
|
// "Custom PyPreTokenizer cannot be serialized",
|
||||||
))
|
// ))
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
impl<'de> Deserialize<'de> for PyPreTokenizer {
|
// impl<'de> Deserialize<'de> for PyPreTokenizer {
|
||||||
fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
|
// fn deserialize<D>(_deserializer: D) -> std::result::Result<Self, D::Error>
|
||||||
where
|
// where
|
||||||
D: Deserializer<'de>,
|
// D: Deserializer<'de>,
|
||||||
{
|
// {
|
||||||
unimplemented!("PyPreTokenizer cannot be deserialized")
|
// unimplemented!("PyPreTokenizer cannot be deserialized")
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
Reference in New Issue
Block a user