tokenizers/bindings/python/src/tokenizer.rs
Latest commit b4fcc9ce6e by Funtowicz Morgan (2023-05-17 11:18:15 +02:00): Makes `decode` and `decode_batch` work on borrowed content. (#1251)
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

use std::collections::{hash_map::DefaultHasher, HashMap};
use std::hash::{Hash, Hasher};
use numpy::{npyffi, PyArray1};
use pyo3::class::basic::CompareOp;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use pyo3::AsPyPointer;
use tk::models::bpe::BPE;
use tk::tokenizer::{
Model, PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl,
TruncationDirection, TruncationParams, TruncationStrategy,
};
use tk::utils::iter::ResultShunt;
use tokenizers as tk;
use super::decoders::PyDecoder;
use super::encoding::PyEncoding;
use super::error::{PyError, ToPyResult};
use super::models::PyModel;
use super::normalizers::PyNormalizer;
use super::pre_tokenizers::PyPreTokenizer;
use super::trainers::PyTrainer;
use crate::processors::PyPostProcessor;
use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
/// Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
/// It can have special options that define the way it should behave.
///
/// Args:
/// content (:obj:`str`): The content of the token
///
/// single_word (:obj:`bool`, defaults to :obj:`False`):
/// Defines whether this token should only match single words. If :obj:`True`, this
/// token will never match inside of a word. For example the token ``ing`` would match
/// on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
/// The notion of "`inside of a word`" is defined by the word boundaries pattern in
/// regular expressions (i.e. the token should start and end with word boundaries).
///
/// lstrip (:obj:`bool`, defaults to :obj:`False`):
/// Defines whether this token should strip all potential whitespaces on its left side.
/// If :obj:`True`, this token will greedily match any whitespace on its left. For
/// example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
/// ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
///
/// rstrip (:obj:`bool`, defaults to :obj:`False`):
/// Defines whether this token should strip all potential whitespaces on its right
/// side. If :obj:`True`, this token will greedily match any whitespace on its right.
/// It works just like :obj:`lstrip` but on the right.
///
/// normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
/// Defines whether this token should match against the normalized version of the input
/// text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
/// lowercasing the text, the token could be extracted from the input ``"I saw a lion
/// Yesterday"``.
///
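/// Example:
/// A minimal sketch of registering a customized token; the token content and the model are illustrative::
///
/// from tokenizers import Tokenizer, AddedToken
/// from tokenizers.models import BPE
///
/// tokenizer = Tokenizer(BPE())
/// tokenizer.add_special_tokens([AddedToken("[MASK]", lstrip=True)])
///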
#[pyclass(dict, module = "tokenizers", name = "AddedToken")]
#[pyo3(
text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"
)]
pub struct PyAddedToken {
pub content: String,
pub is_special_token: bool,
pub single_word: Option<bool>,
pub lstrip: Option<bool>,
pub rstrip: Option<bool>,
pub normalized: Option<bool>,
}
impl PyAddedToken {
pub fn from<S: Into<String>>(content: S, is_special_token: Option<bool>) -> Self {
Self {
content: content.into(),
is_special_token: is_special_token.unwrap_or(false),
single_word: None,
lstrip: None,
rstrip: None,
normalized: None,
}
}
pub fn get_token(&self) -> tk::tokenizer::AddedToken {
let mut token = tk::AddedToken::from(&self.content, self.is_special_token);
if let Some(sw) = self.single_word {
token = token.single_word(sw);
}
if let Some(ls) = self.lstrip {
token = token.lstrip(ls);
}
if let Some(rs) = self.rstrip {
token = token.rstrip(rs);
}
if let Some(n) = self.normalized {
token = token.normalized(n);
}
token
}
pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> {
let dict = PyDict::new(py);
let token = self.get_token();
dict.set_item("content", token.content)?;
dict.set_item("single_word", token.single_word)?;
dict.set_item("lstrip", token.lstrip)?;
dict.set_item("rstrip", token.rstrip)?;
dict.set_item("normalized", token.normalized)?;
Ok(dict)
}
}
impl From<tk::AddedToken> for PyAddedToken {
fn from(token: tk::AddedToken) -> Self {
Self {
content: token.content,
single_word: Some(token.single_word),
lstrip: Some(token.lstrip),
rstrip: Some(token.rstrip),
normalized: Some(token.normalized),
is_special_token: !token.normalized,
}
}
}
#[pymethods]
impl PyAddedToken {
#[new]
#[pyo3(signature = (content=None, **kwargs))]
fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
let mut token = PyAddedToken::from(content.unwrap_or(""), None);
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
"single_word" => token.single_word = Some(value.extract()?),
"lstrip" => token.lstrip = Some(value.extract()?),
"rstrip" => token.rstrip = Some(value.extract()?),
"normalized" => token.normalized = Some(value.extract()?),
_ => println!("Ignored unknown kwarg option {}", key),
}
}
}
Ok(token)
}
fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> {
self.as_pydict(py)
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
match state.extract::<&PyDict>(py) {
Ok(state) => {
for (key, value) in state {
let key: &str = key.extract()?;
match key {
"content" => self.content = value.extract()?,
"single_word" => self.single_word = Some(value.extract()?),
"lstrip" => self.lstrip = Some(value.extract()?),
"rstrip" => self.rstrip = Some(value.extract()?),
"normalized" => self.normalized = Some(value.extract()?),
_ => {}
}
}
Ok(())
}
Err(e) => Err(e),
}
}
/// Get the content of this :obj:`AddedToken`
#[getter]
fn get_content(&self) -> &str {
&self.content
}
/// Get the value of the :obj:`rstrip` option
#[getter]
fn get_rstrip(&self) -> bool {
self.get_token().rstrip
}
/// Get the value of the :obj:`lstrip` option
#[getter]
fn get_lstrip(&self) -> bool {
self.get_token().lstrip
}
/// Get the value of the :obj:`single_word` option
#[getter]
fn get_single_word(&self) -> bool {
self.get_token().single_word
}
/// Get the value of the :obj:`normalized` option
#[getter]
fn get_normalized(&self) -> bool {
self.get_token().normalized
}
fn __str__(&self) -> PyResult<&str> {
Ok(&self.content)
}
fn __repr__(&self) -> PyResult<String> {
let bool_to_python = |p| match p {
true => "True",
false => "False",
};
let token = self.get_token();
Ok(format!(
"AddedToken(\"{}\", rstrip={}, lstrip={}, single_word={}, normalized={})",
self.content,
bool_to_python(token.rstrip),
bool_to_python(token.lstrip),
bool_to_python(token.single_word),
bool_to_python(token.normalized)
))
}
fn __richcmp__(&self, other: Py<PyAddedToken>, op: CompareOp) -> bool {
use CompareOp::*;
Python::with_gil(|py| match op {
Lt | Le | Gt | Ge => false,
Eq => self.get_token() == other.borrow(py).get_token(),
Ne => self.get_token() != other.borrow(py).get_token(),
})
}
fn __hash__(&self) -> u64 {
let mut hasher = DefaultHasher::new();
self.get_token().hash(&mut hasher);
hasher.finish()
}
}
struct TextInputSequence<'s>(tk::InputSequence<'s>);
impl<'s> FromPyObject<'s> for TextInputSequence<'s> {
fn extract(ob: &'s PyAny) -> PyResult<Self> {
let err = exceptions::PyTypeError::new_err("TextInputSequence must be str");
if let Ok(s) = ob.downcast::<PyString>() {
Ok(Self(s.to_string_lossy().into()))
} else {
Err(err)
}
}
}
impl<'s> From<TextInputSequence<'s>> for tk::InputSequence<'s> {
fn from(s: TextInputSequence<'s>) -> Self {
s.0
}
}
struct PyArrayUnicode(Vec<String>);
impl FromPyObject<'_> for PyArrayUnicode {
fn extract(ob: &PyAny) -> PyResult<Self> {
// SAFETY Making sure the pointer is a valid numpy array requires calling numpy C code
if unsafe { npyffi::PyArray_Check(ob.py(), ob.as_ptr()) } == 0 {
return Err(exceptions::PyTypeError::new_err("Expected an np.array"));
}
let arr = ob.as_ptr() as *mut npyffi::PyArrayObject;
// SAFETY Getting all the metadata about the numpy array to check its sanity
let (type_num, elsize, alignment, data, nd, flags) = unsafe {
let desc = (*arr).descr;
(
(*desc).type_num,
(*desc).elsize as usize,
(*desc).alignment as usize,
(*arr).data,
(*arr).nd,
(*arr).flags,
)
};
if nd != 1 {
return Err(exceptions::PyTypeError::new_err(
"Expected a 1 dimensional np.array",
));
}
if flags & (npyffi::NPY_ARRAY_C_CONTIGUOUS | npyffi::NPY_ARRAY_F_CONTIGUOUS) == 0 {
return Err(exceptions::PyTypeError::new_err(
"Expected a contiguous np.array",
));
}
if type_num != npyffi::types::NPY_TYPES::NPY_UNICODE as i32 {
return Err(exceptions::PyTypeError::new_err(
"Expected a np.array[dtype='U']",
));
}
// SAFETY Looking at the raw numpy data to create new owned Rust strings via copies (so it's safe afterwards).
unsafe {
let n_elem = *(*arr).dimensions as usize;
let all_bytes = std::slice::from_raw_parts(data as *const u8, elsize * n_elem);
let seq = (0..n_elem)
.map(|i| {
let bytes = &all_bytes[i * elsize..(i + 1) * elsize];
let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
pyo3::ffi::PyUnicode_4BYTE_KIND as _,
bytes.as_ptr() as *const _,
elsize as isize / alignment as isize,
);
let py = ob.py();
let obj = PyObject::from_owned_ptr(py, unicode);
let s = obj.downcast::<PyString>(py)?;
Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
})
.collect::<PyResult<Vec<_>>>()?;
Ok(Self(seq))
}
}
}
impl From<PyArrayUnicode> for tk::InputSequence<'_> {
fn from(s: PyArrayUnicode) -> Self {
s.0.into()
}
}
struct PyArrayStr(Vec<String>);
impl FromPyObject<'_> for PyArrayStr {
fn extract(ob: &PyAny) -> PyResult<Self> {
let array = ob.downcast::<PyArray1<PyObject>>()?;
let seq = array
.readonly()
.as_array()
.iter()
.map(|obj| {
let s = obj.downcast::<PyString>(ob.py())?;
Ok(s.to_string_lossy().into_owned())
})
.collect::<PyResult<Vec<_>>>()?;
Ok(Self(seq))
}
}
impl From<PyArrayStr> for tk::InputSequence<'_> {
fn from(s: PyArrayStr) -> Self {
s.0.into()
}
}
struct PreTokenizedInputSequence<'s>(tk::InputSequence<'s>);
impl<'s> FromPyObject<'s> for PreTokenizedInputSequence<'s> {
fn extract(ob: &'s PyAny) -> PyResult<Self> {
if let Ok(seq) = ob.extract::<PyArrayUnicode>() {
return Ok(Self(seq.into()));
}
if let Ok(seq) = ob.extract::<PyArrayStr>() {
return Ok(Self(seq.into()));
}
if let Ok(s) = ob.downcast::<PyList>() {
if let Ok(seq) = s.extract::<Vec<&str>>() {
return Ok(Self(seq.into()));
}
}
if let Ok(s) = ob.downcast::<PyTuple>() {
if let Ok(seq) = s.extract::<Vec<&str>>() {
return Ok(Self(seq.into()));
}
}
Err(exceptions::PyTypeError::new_err(
"PreTokenizedInputSequence must be Union[List[str], Tuple[str]]",
))
}
}
impl<'s> From<PreTokenizedInputSequence<'s>> for tk::InputSequence<'s> {
fn from(s: PreTokenizedInputSequence<'s>) -> Self {
s.0
}
}
struct TextEncodeInput<'s>(tk::EncodeInput<'s>);
impl<'s> FromPyObject<'s> for TextEncodeInput<'s> {
fn extract(ob: &'s PyAny) -> PyResult<Self> {
if let Ok(i) = ob.extract::<TextInputSequence>() {
return Ok(Self(i.into()));
}
if let Ok((i1, i2)) = ob.extract::<(TextInputSequence, TextInputSequence)>() {
return Ok(Self((i1, i2).into()));
}
if let Ok(arr) = ob.extract::<Vec<&PyAny>>() {
if arr.len() == 2 {
let first = arr[0].extract::<TextInputSequence>()?;
let second = arr[1].extract::<TextInputSequence>()?;
return Ok(Self((first, second).into()));
}
}
Err(exceptions::PyTypeError::new_err(
"TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]",
))
}
}
impl<'s> From<TextEncodeInput<'s>> for tk::tokenizer::EncodeInput<'s> {
fn from(i: TextEncodeInput<'s>) -> Self {
i.0
}
}
struct PreTokenizedEncodeInput<'s>(tk::EncodeInput<'s>);
impl<'s> FromPyObject<'s> for PreTokenizedEncodeInput<'s> {
fn extract(ob: &'s PyAny) -> PyResult<Self> {
if let Ok(i) = ob.extract::<PreTokenizedInputSequence>() {
return Ok(Self(i.into()));
}
if let Ok((i1, i2)) = ob.extract::<(PreTokenizedInputSequence, PreTokenizedInputSequence)>()
{
return Ok(Self((i1, i2).into()));
}
if let Ok(arr) = ob.extract::<Vec<&PyAny>>() {
if arr.len() == 2 {
let first = arr[0].extract::<PreTokenizedInputSequence>()?;
let second = arr[1].extract::<PreTokenizedInputSequence>()?;
return Ok(Self((first, second).into()));
}
}
Err(exceptions::PyTypeError::new_err(
"PreTokenizedEncodeInput must be Union[PreTokenizedInputSequence, \
Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]]",
))
}
}
impl<'s> From<PreTokenizedEncodeInput<'s>> for tk::tokenizer::EncodeInput<'s> {
fn from(i: PreTokenizedEncodeInput<'s>) -> Self {
i.0
}
}
type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProcessor, PyDecoder>;
/// A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
/// and outputs an :class:`~tokenizers.Encoding`.
///
/// Args:
/// model (:class:`~tokenizers.models.Model`):
/// The core algorithm that this :obj:`Tokenizer` should be using.
///
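/// Example:
/// A minimal construction sketch; the model choice and its parameters are illustrative::
///
/// from tokenizers import Tokenizer
/// from tokenizers.models import BPE
///
/// tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
///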
#[pyclass(dict, module = "tokenizers", name = "Tokenizer")]
#[pyo3(text_signature = "(self, model)")]
#[derive(Clone)]
pub struct PyTokenizer {
tokenizer: Tokenizer,
}
impl PyTokenizer {
fn new(tokenizer: Tokenizer) -> Self {
PyTokenizer { tokenizer }
}
fn from_model(model: PyModel) -> Self {
PyTokenizer::new(TokenizerImpl::new(model))
}
}
#[pymethods]
impl PyTokenizer {
#[new]
fn __new__(model: PyRef<PyModel>) -> Self {
PyTokenizer::from_model(model.clone())
}
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
let data = serde_json::to_string(&self.tokenizer).map_err(|e| {
exceptions::PyException::new_err(format!(
"Error while attempting to pickle Tokenizer: {}",
e
))
})?;
Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
}
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
match state.extract::<&PyBytes>(py) {
Ok(s) => {
self.tokenizer = serde_json::from_slice(s.as_bytes()).map_err(|e| {
exceptions::PyException::new_err(format!(
"Error while attempting to unpickle Tokenizer: {}",
e
))
})?;
Ok(())
}
Err(e) => Err(e),
}
}
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
let model = PyModel::from(BPE::default()).into_py(py);
PyTuple::new(py, vec![model])
}
/// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
///
/// Args:
/// json (:obj:`str`):
/// A valid JSON string representing a previously serialized
/// :class:`~tokenizers.Tokenizer`
///
/// Returns:
/// :class:`~tokenizers.Tokenizer`: The new tokenizer
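///
/// Example:
/// A minimal round-trip sketch, assuming ``tokenizer`` is an existing :class:`~tokenizers.Tokenizer`::
///
/// serialized = tokenizer.to_str()
/// reloaded = Tokenizer.from_str(serialized)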
#[staticmethod]
#[pyo3(text_signature = "(json)")]
fn from_str(json: &str) -> PyResult<Self> {
let tokenizer: PyResult<_> = ToPyResult(json.parse()).into();
Ok(Self::new(tokenizer?))
}
/// Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
///
/// Args:
/// path (:obj:`str`):
/// A path to a local JSON file representing a previously serialized
/// :class:`~tokenizers.Tokenizer`
///
/// Returns:
/// :class:`~tokenizers.Tokenizer`: The new tokenizer
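///
/// Example:
/// A minimal sketch; the file path is illustrative::
///
/// tokenizer = Tokenizer.from_file("tokenizer.json")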
#[staticmethod]
#[pyo3(text_signature = "(path)")]
fn from_file(path: &str) -> PyResult<Self> {
let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
Ok(Self::new(tokenizer?))
}
/// Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
///
/// Args:
/// buffer (:obj:`bytes`):
/// A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
///
/// Returns:
/// :class:`~tokenizers.Tokenizer`: The new tokenizer
#[staticmethod]
#[pyo3(text_signature = "(buffer)")]
fn from_buffer(buffer: &PyBytes) -> PyResult<Self> {
let tokenizer = serde_json::from_slice(buffer.as_bytes()).map_err(|e| {
exceptions::PyValueError::new_err(format!(
"Cannot instantiate Tokenizer from buffer: {}",
e
))
})?;
Ok(Self { tokenizer })
}
/// Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
/// Hugging Face Hub.
///
/// Args:
/// identifier (:obj:`str`):
/// The identifier of a Model on the Hugging Face Hub, that contains
/// a tokenizer.json file
/// revision (:obj:`str`, defaults to `main`):
/// A branch or commit id
/// auth_token (:obj:`str`, `optional`, defaults to `None`):
/// An optional auth token used to access private repositories on the
/// Hugging Face Hub
///
/// Returns:
/// :class:`~tokenizers.Tokenizer`: The new tokenizer
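///
/// Example:
/// A minimal sketch; the identifier is illustrative and must point to a repo containing a ``tokenizer.json``::
///
/// tokenizer = Tokenizer.from_pretrained("bert-base-uncased")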
#[staticmethod]
#[pyo3(signature = (identifier, revision = String::from("main"), auth_token = None))]
#[pyo3(text_signature = "(identifier, revision=\"main\", auth_token=None)")]
fn from_pretrained(
identifier: &str,
revision: String,
auth_token: Option<String>,
) -> PyResult<Self> {
let params = tk::FromPretrainedParameters {
revision,
auth_token,
user_agent: [("bindings", "Python"), ("version", crate::VERSION)]
.iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect(),
};
let tokenizer: PyResult<_> =
ToPyResult(Tokenizer::from_pretrained(identifier, Some(params))).into();
Ok(Self::new(tokenizer?))
}
/// Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
///
/// Args:
/// pretty (:obj:`bool`, defaults to :obj:`False`):
/// Whether the JSON string should be pretty formatted.
///
/// Returns:
/// :obj:`str`: A string representing the serialized Tokenizer
#[pyo3(signature = (pretty = false))]
#[pyo3(text_signature = "(self, pretty=False)")]
fn to_str(&self, pretty: bool) -> PyResult<String> {
ToPyResult(self.tokenizer.to_string(pretty)).into()
}
/// Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
///
/// Args:
/// path (:obj:`str`):
/// A path to a file in which to save the serialized tokenizer.
///
/// pretty (:obj:`bool`, defaults to :obj:`True`):
/// Whether the JSON file should be pretty formatted.
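///
/// Example:
/// A minimal sketch; the path is illustrative::
///
/// tokenizer.save("tokenizer.json")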
#[pyo3(signature = (path, pretty = true))]
#[pyo3(text_signature = "(self, path, pretty=True)")]
fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
ToPyResult(self.tokenizer.save(path, pretty)).into()
}
/// Return the number of special tokens that would be added for single/pair sentences.
///
/// Args:
/// is_pair (:obj:`bool`):
/// Whether the input would be a single sentence or a pair
///
/// Returns:
/// :obj:`int`: The number of special tokens that would be added
#[pyo3(text_signature = "(self, is_pair)")]
fn num_special_tokens_to_add(&self, is_pair: bool) -> usize {
self.tokenizer
.get_post_processor()
.map_or(0, |p| p.added_tokens(is_pair))
}
/// Get the underlying vocabulary
///
/// Args:
/// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens
///
/// Returns:
/// :obj:`Dict[str, int]`: The vocabulary
#[pyo3(signature = (with_added_tokens = true))]
#[pyo3(text_signature = "(self, with_added_tokens=True)")]
fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32> {
self.tokenizer.get_vocab(with_added_tokens)
}
/// Get the size of the underlying vocabulary
///
/// Args:
/// with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to include the added tokens
///
/// Returns:
/// :obj:`int`: The size of the vocabulary
#[pyo3(signature = (with_added_tokens = true))]
#[pyo3(text_signature = "(self, with_added_tokens=True)")]
fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
self.tokenizer.get_vocab_size(with_added_tokens)
}
/// Enable truncation
///
/// Args:
/// max_length (:obj:`int`):
/// The max length at which to truncate
///
/// stride (:obj:`int`, `optional`):
/// The length of the previous first sequence to be included in the overflowing
/// sequence
///
/// strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
/// The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
/// ``only_second``.
///
/// direction (:obj:`str`, defaults to :obj:`right`):
/// The truncation direction. Can be either ``right`` or ``left``.
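///
/// Example:
/// A minimal sketch; the values are illustrative::
///
/// tokenizer.enable_truncation(max_length=512, stride=32, strategy="longest_first")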
#[pyo3(signature = (max_length, **kwargs))]
#[pyo3(
text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')"
)]
fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
let mut params = TruncationParams {
max_length,
..Default::default()
};
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
"stride" => params.stride = value.extract()?,
"strategy" => {
let value: &str = value.extract()?;
params.strategy = match value {
"longest_first" => Ok(TruncationStrategy::LongestFirst),
"only_first" => Ok(TruncationStrategy::OnlyFirst),
"only_second" => Ok(TruncationStrategy::OnlySecond),
_ => Err(PyError(format!(
"Unknown `strategy`: `{}`. Use \
one of `longest_first`, `only_first`, or `only_second`",
value
))
.into_pyerr::<exceptions::PyValueError>()),
}?
}
"direction" => {
let value: &str = value.extract()?;
params.direction = match value {
"left" => Ok(TruncationDirection::Left),
"right" => Ok(TruncationDirection::Right),
_ => Err(PyError(format!(
"Unknown `direction`: `{}`. Use \
one of `left` or `right`.",
value
))
.into_pyerr::<exceptions::PyValueError>()),
}?
}
_ => println!("Ignored unknown kwarg option {}", key),
}
}
}
self.tokenizer.with_truncation(Some(params));
Ok(())
}
/// Disable truncation
#[pyo3(text_signature = "(self)")]
fn no_truncation(&mut self) {
self.tokenizer.with_truncation(None);
}
/// Get the currently set truncation parameters
///
/// `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
///
/// Returns:
/// (:obj:`dict`, `optional`):
/// A dict with the current truncation parameters if truncation is enabled
#[getter]
fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
self.tokenizer.get_truncation().map_or(Ok(None), |params| {
let dict = PyDict::new(py);
dict.set_item("max_length", params.max_length)?;
dict.set_item("stride", params.stride)?;
dict.set_item("strategy", params.strategy.as_ref())?;
dict.set_item("direction", params.direction.as_ref())?;
Ok(Some(dict))
})
}
/// Enable the padding
///
/// Args:
/// direction (:obj:`str`, `optional`, defaults to :obj:`right`):
/// The direction in which to pad. Can be either ``right`` or ``left``
///
/// pad_to_multiple_of (:obj:`int`, `optional`):
/// If specified, the padding length should always snap to the next multiple of the
/// given value. For example if we were going to pad with a length of 250 but
/// ``pad_to_multiple_of=8`` then we will pad to 256.
///
/// pad_id (:obj:`int`, defaults to 0):
/// The id to be used when padding
///
/// pad_type_id (:obj:`int`, defaults to 0):
/// The type id to be used when padding
///
/// pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
/// The pad token to be used when padding
///
/// length (:obj:`int`, `optional`):
/// If specified, the length at which to pad. If not specified we pad using the size of
/// the longest sequence in a batch.
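///
/// Example:
/// A minimal sketch; the pad token and id are illustrative and should match the vocabulary::
///
/// tokenizer.enable_padding(pad_id=0, pad_token="[PAD]", length=128)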
#[pyo3(signature = (**kwargs))]
#[pyo3(
text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"
)]
fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
let mut params = PaddingParams::default();
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
"direction" => {
let value: &str = value.extract()?;
params.direction = match value {
"left" => Ok(PaddingDirection::Left),
"right" => Ok(PaddingDirection::Right),
other => Err(PyError(format!(
"Unknown `direction`: `{}`. Use \
one of `left` or `right`",
other
))
.into_pyerr::<exceptions::PyValueError>()),
}?;
}
"pad_to_multiple_of" => {
if let Some(multiple) = value.extract()? {
params.pad_to_multiple_of = multiple;
}
}
"pad_id" => params.pad_id = value.extract()?,
"pad_type_id" => params.pad_type_id = value.extract()?,
"pad_token" => params.pad_token = value.extract()?,
"max_length" => {
println!(
"enable_padding(max_length=X) is deprecated, \
use enable_padding(length=X) instead"
);
if let Some(l) = value.extract()? {
params.strategy = PaddingStrategy::Fixed(l);
} else {
params.strategy = PaddingStrategy::BatchLongest;
}
}
"length" => {
if let Some(l) = value.extract()? {
params.strategy = PaddingStrategy::Fixed(l);
} else {
params.strategy = PaddingStrategy::BatchLongest;
}
}
_ => println!("Ignored unknown kwarg option {}", key),
}
}
}
self.tokenizer.with_padding(Some(params));
Ok(())
}
/// Disable padding
#[pyo3(text_signature = "(self)")]
fn no_padding(&mut self) {
self.tokenizer.with_padding(None);
}
/// Get the current padding parameters
///
/// `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
///
/// Returns:
/// (:obj:`dict`, `optional`):
/// A dict with the current padding parameters if padding is enabled
#[getter]
fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
self.tokenizer.get_padding().map_or(Ok(None), |params| {
let dict = PyDict::new(py);
dict.set_item(
"length",
match params.strategy {
tk::PaddingStrategy::BatchLongest => None,
tk::PaddingStrategy::Fixed(size) => Some(size),
},
)?;
dict.set_item("pad_to_multiple_of", params.pad_to_multiple_of)?;
dict.set_item("pad_id", params.pad_id)?;
dict.set_item("pad_token", &params.pad_token)?;
dict.set_item("pad_type_id", params.pad_type_id)?;
dict.set_item("direction", params.direction.as_ref())?;
Ok(Some(dict))
})
}
/// Encode the given sequence and pair. This method can process raw text sequences
/// as well as already pre-tokenized sequences.
///
/// Example:
/// Here are some examples of the inputs that are accepted::
///
/// encode("A single sequence")`
/// encode("A sequence", "And its pair")`
/// encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)`
/// encode(
/// [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
/// is_pretokenized=True
/// )
///
/// Args:
/// sequence (:obj:`~tokenizers.InputSequence`):
/// The main input sequence we want to encode. This sequence can be either raw
/// text or pre-tokenized, according to the ``is_pretokenized`` argument:
///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
///
/// pair (:obj:`~tokenizers.InputSequence`, `optional`):
/// An optional input sequence. The expected format is the same as for ``sequence``.
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// Returns:
/// :class:`~tokenizers.Encoding`: The encoded result
///
#[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true))]
#[pyo3(
text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"
)]
fn encode(
&self,
sequence: &PyAny,
pair: Option<&PyAny>,
is_pretokenized: bool,
add_special_tokens: bool,
) -> PyResult<PyEncoding> {
let sequence: tk::InputSequence = if is_pretokenized {
sequence.extract::<PreTokenizedInputSequence>()?.into()
} else {
sequence.extract::<TextInputSequence>()?.into()
};
let input = match pair {
Some(pair) => {
let pair: tk::InputSequence = if is_pretokenized {
pair.extract::<PreTokenizedInputSequence>()?.into()
} else {
pair.extract::<TextInputSequence>()?.into()
};
tk::EncodeInput::Dual(sequence, pair)
}
None => tk::EncodeInput::Single(sequence),
};
ToPyResult(
self.tokenizer
.encode_char_offsets(input, add_special_tokens)
.map(|e| e.into()),
)
.into()
}
/// Encode the given batch of inputs. This method accepts both raw text sequences
/// and already pre-tokenized sequences.
///
/// Example:
/// Here are some examples of the inputs that are accepted::
///
/// encode_batch([
/// "A single sequence",
/// ("A tuple with a sequence", "And its pair"),
/// [ "A", "pre", "tokenized", "sequence" ],
/// ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
/// ])
///
/// Args:
/// input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
/// A list of single sequences or pair sequences to encode. Each sequence
/// can be either raw text or pre-tokenized, according to the ``is_pretokenized``
/// argument:
///
/// - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
/// - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
///
/// is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
/// Whether the input is already pre-tokenized
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add the special tokens
///
/// Returns:
/// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
///
#[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
#[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
fn encode_batch(
&self,
py: Python<'_>,
input: Vec<&PyAny>,
is_pretokenized: bool,
add_special_tokens: bool,
) -> PyResult<Vec<PyEncoding>> {
let input: Vec<tk::EncodeInput> = input
.into_iter()
.map(|o| {
let input: tk::EncodeInput = if is_pretokenized {
o.extract::<PreTokenizedEncodeInput>()?.into()
} else {
o.extract::<TextEncodeInput>()?.into()
};
Ok(input)
})
.collect::<PyResult<Vec<tk::EncodeInput>>>()?;
py.allow_threads(|| {
ToPyResult(
self.tokenizer
.encode_batch_char_offsets(input, add_special_tokens)
.map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
)
.into()
})
}
/// Decode the given list of ids back to a string
///
/// This is used to decode anything coming back from a Language Model
///
/// Args:
/// ids (A :obj:`List/Tuple` of :obj:`int`):
/// The list of ids that we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded string
///
/// Returns:
/// :obj:`str`: The decoded string
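///
/// Example:
/// A minimal sketch, feeding back the ids produced by :meth:`~tokenizers.Tokenizer.encode`; the input text is illustrative::
///
/// output = tokenizer.encode("Hello, y'all!")
/// text = tokenizer.decode(output.ids)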
#[pyo3(signature = (ids, skip_special_tokens = true))]
#[pyo3(text_signature = "(self, ids, skip_special_tokens=True)")]
fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
ToPyResult(self.tokenizer.decode(&ids, skip_special_tokens)).into()
}
/// Decode a batch of ids back to their corresponding string
///
/// Args:
/// sequences (:obj:`List` of :obj:`List[int]`):
/// The batch of sequences we want to decode
///
/// skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether the special tokens should be removed from the decoded strings
///
/// Returns:
/// :obj:`List[str]`: A list of decoded strings
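///
/// Example:
/// A minimal sketch, reusing the ids of a previously encoded batch; the inputs are illustrative::
///
/// encodings = tokenizer.encode_batch(["Hello", "World"])
/// texts = tokenizer.decode_batch([e.ids for e in encodings])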
#[pyo3(signature = (sequences, skip_special_tokens = true))]
#[pyo3(text_signature = "(self, sequences, skip_special_tokens=True)")]
fn decode_batch(
&self,
py: Python<'_>,
sequences: Vec<Vec<u32>>,
skip_special_tokens: bool,
) -> PyResult<Vec<String>> {
py.allow_threads(|| {
let slices = sequences.iter().map(|v| &v[..]).collect::<Vec<&[u32]>>();
ToPyResult(self.tokenizer.decode_batch(&slices, skip_special_tokens)).into()
})
}
/// Convert the given token to its corresponding id if it exists
///
/// Args:
/// token (:obj:`str`):
/// The token to convert
///
/// Returns:
/// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
#[pyo3(text_signature = "(self, token)")]
fn token_to_id(&self, token: &str) -> Option<u32> {
self.tokenizer.token_to_id(token)
}
/// Convert the given id to its corresponding token if it exists
///
/// Args:
/// id (:obj:`int`):
/// The id to convert
///
/// Returns:
/// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
#[pyo3(text_signature = "(self, id)")]
fn id_to_token(&self, id: u32) -> Option<String> {
self.tokenizer.id_to_token(id)
}
/// Add the given tokens to the vocabulary
///
/// The given tokens are added only if they don't already exist in the vocabulary.
/// Each token then gets a newly attributed id.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of tokens we want to add to the vocabulary. Each token can be either a
/// string or an instance of :class:`~tokenizers.AddedToken` for more customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
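///
/// Example:
/// A minimal sketch mixing plain strings and :class:`~tokenizers.AddedToken`; the token contents are illustrative::
///
/// from tokenizers import AddedToken
///
/// added = tokenizer.add_tokens(["my_token", AddedToken("my_other_token", single_word=True)])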
#[pyo3(text_signature = "(self, tokens)")]
fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
.map(|token| {
if let Ok(content) = token.extract::<String>() {
Ok(PyAddedToken::from(content, Some(false)).get_token())
} else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
token.is_special_token = false;
Ok(token.get_token())
} else {
Err(exceptions::PyTypeError::new_err(
"Input must be a List[Union[str, AddedToken]]",
))
}
})
.collect::<PyResult<Vec<_>>>()?;
Ok(self.tokenizer.add_tokens(&tokens))
}
/// Add the given special tokens to the Tokenizer.
///
/// If these tokens are already part of the vocabulary, this just lets the Tokenizer know about
/// them. If they don't exist, the Tokenizer creates them, giving them a new id.
///
/// These special tokens will never be processed by the model (i.e. won't be split into
/// multiple tokens), and they can be removed from the output when decoding.
///
/// Args:
/// tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
/// The list of special tokens we want to add to the vocabulary. Each token can either
/// be a string or an instance of :class:`~tokenizers.AddedToken` for more
/// customization.
///
/// Returns:
/// :obj:`int`: The number of tokens that were created in the vocabulary
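///
/// Example:
/// A minimal sketch; the token contents are illustrative::
///
/// from tokenizers import AddedToken
///
/// added = tokenizer.add_special_tokens(["[CLS]", "[SEP]", AddedToken("[MASK]", lstrip=True)])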
#[pyo3(text_signature = "(self, tokens)")]
fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
let tokens = tokens
.into_iter()
.map(|token| {
if let Ok(content) = token.extract::<String>() {
Ok(tk::tokenizer::AddedToken::from(content, true))
} else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
token.is_special_token = true;
Ok(token.get_token())
} else {
Err(exceptions::PyTypeError::new_err(
"Input must be a List[Union[str, AddedToken]]",
))
}
})
.collect::<PyResult<Vec<_>>>()?;
Ok(self.tokenizer.add_special_tokens(&tokens))
}
/// Train the Tokenizer using the given files.
///
/// Reads the files line by line, while keeping all the whitespace, even new lines.
/// If you want to train from data stored in memory, you can check
/// :meth:`~tokenizers.Tokenizer.train_from_iterator`
///
/// Args:
/// files (:obj:`List[str]`):
/// A list of paths to the files that we should use for training
///
/// trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
/// An optional trainer that should be used to train our Model
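///
/// Example:
/// A minimal sketch, assuming the tokenizer wraps a BPE model; the file paths and trainer settings are illustrative::
///
/// from tokenizers.trainers import BpeTrainer
///
/// trainer = BpeTrainer(vocab_size=30000, special_tokens=["[UNK]"])
/// tokenizer.train(["data/part-1.txt", "data/part-2.txt"], trainer)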
#[pyo3(signature = (files, trainer = None))]
#[pyo3(text_signature = "(self, files, trainer = None)")]
fn train(&mut self, files: Vec<String>, trainer: Option<&mut PyTrainer>) -> PyResult<()> {
let mut trainer =
trainer.map_or_else(|| self.tokenizer.get_model().get_trainer(), |t| t.clone());
Python::with_gil(|py| {
py.allow_threads(|| {
ToPyResult(
self.tokenizer
.train_from_files(&mut trainer, files)
.map(|_| {}),
)
.into()
})
})
}
/// Train the Tokenizer using the provided iterator.
///
/// You can provide anything that is a Python Iterator
///
/// * A list of sequences :obj:`List[str]`
/// * A generator that yields :obj:`str` or :obj:`List[str]`
/// * A Numpy array of strings
/// * ...
///
/// Args:
/// iterator (:obj:`Iterator`):
/// Any iterator over strings or list of strings
///
/// trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
/// An optional trainer that should be used to train our Model
///
/// length (:obj:`int`, `optional`):
/// The total number of sequences in the iterator. This is used to
/// provide meaningful progress tracking
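///
/// Example:
/// A minimal sketch training from an in-memory list; the data is illustrative, and a trainer can be passed as for :meth:`~tokenizers.Tokenizer.train`::
///
/// data = ["first sequence", "second sequence"]
/// tokenizer.train_from_iterator(data, length=len(data))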
#[pyo3(signature = (iterator, trainer = None, length = None))]
#[pyo3(text_signature = "(self, iterator, trainer=None, length=None)")]
fn train_from_iterator(
&mut self,
py: Python,
iterator: &PyAny,
trainer: Option<&mut PyTrainer>,
length: Option<usize>,
) -> PyResult<()> {
let mut trainer =
trainer.map_or_else(|| self.tokenizer.get_model().get_trainer(), |t| t.clone());
let buffered_iter = PyBufferedIterator::new(
iterator,
|element| {
// Each element of the iterator can either be:
// - An iterator, to allow batching
// - A string
if let Ok(s) = element.downcast::<PyString>() {
itertools::Either::Right(std::iter::once(s.to_str().map(|s| s.to_owned())))
} else {
match element.iter() {
Ok(iter) => itertools::Either::Left(
iter.map(|i| i?.extract::<String>())
.collect::<Vec<_>>()
.into_iter(),
),
Err(e) => itertools::Either::Right(std::iter::once(Err(e))),
}
}
},
256,
)?;
py.allow_threads(|| {
ResultShunt::process(buffered_iter, |iter| {
self.tokenizer
.train(&mut trainer, MaybeSizedIterator::new(iter, length))
.map(|_| {})
.map_err(|e| exceptions::PyException::new_err(e.to_string()))
})?
})
}
/// Apply all the post-processing steps to the given encodings.
///
/// The various steps are:
///
/// 1. Truncate according to the set truncation params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_truncation`)
/// 2. Apply the :class:`~tokenizers.processors.PostProcessor`
/// 3. Pad according to the set padding params (provided with
/// :meth:`~tokenizers.Tokenizer.enable_padding`)
///
/// Args:
/// encoding (:class:`~tokenizers.Encoding`):
/// The :class:`~tokenizers.Encoding` corresponding to the main sequence.
///
/// pair (:class:`~tokenizers.Encoding`, `optional`):
/// An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
///
/// add_special_tokens (:obj:`bool`):
/// Whether to add the special tokens
///
/// Returns:
/// :class:`~tokenizers.Encoding`: The final post-processed encoding
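///
/// Example:
/// A minimal sketch, post-processing a pair of encodings obtained without special tokens; the inputs are illustrative::
///
/// encoding = tokenizer.encode("A sequence", add_special_tokens=False)
/// pair = tokenizer.encode("Its pair", add_special_tokens=False)
/// final = tokenizer.post_process(encoding, pair)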
#[pyo3(signature = (encoding, pair = None, add_special_tokens = true))]
#[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")]
fn post_process(
&self,
encoding: &PyEncoding,
pair: Option<&PyEncoding>,
add_special_tokens: bool,
) -> PyResult<PyEncoding> {
ToPyResult(
self.tokenizer
.post_process(
encoding.encoding.clone(),
pair.map(|p| p.encoding.clone()),
add_special_tokens,
)
.map(|e| e.into()),
)
.into()
}
/// The :class:`~tokenizers.models.Model` in use by the Tokenizer
#[getter]
fn get_model(&self, py: Python<'_>) -> PyResult<PyObject> {
self.tokenizer.get_model().get_as_subtype(py)
}
/// Set the :class:`~tokenizers.models.Model`
#[setter]
fn set_model(&mut self, model: PyRef<PyModel>) {
self.tokenizer.with_model(model.clone());
}
/// The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
#[getter]
fn get_normalizer(&self, py: Python<'_>) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_normalizer() {
n.get_as_subtype(py)
} else {
Ok(py.None())
}
}
/// Set the :class:`~tokenizers.normalizers.Normalizer`
#[setter]
fn set_normalizer(&mut self, normalizer: PyRef<PyNormalizer>) {
self.tokenizer.with_normalizer(normalizer.clone());
}
/// The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
#[getter]
fn get_pre_tokenizer(&self, py: Python<'_>) -> PyResult<PyObject> {
if let Some(pt) = self.tokenizer.get_pre_tokenizer() {
pt.get_as_subtype(py)
} else {
Ok(py.None())
}
}
/// Set the :class:`~tokenizers.pre_tokenizers.PreTokenizer`
#[setter]
fn set_pre_tokenizer(&mut self, pretok: PyRef<PyPreTokenizer>) {
self.tokenizer.with_pre_tokenizer(pretok.clone());
}
/// The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
#[getter]
fn get_post_processor(&self, py: Python<'_>) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_post_processor() {
n.get_as_subtype(py)
} else {
Ok(py.None())
}
}
/// Set the :class:`~tokenizers.processors.PostProcessor`
#[setter]
fn set_post_processor(&mut self, processor: PyRef<PyPostProcessor>) {
self.tokenizer.with_post_processor(processor.clone());
}
/// The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
#[getter]
fn get_decoder(&self, py: Python<'_>) -> PyResult<PyObject> {
if let Some(dec) = self.tokenizer.get_decoder() {
dec.get_as_subtype(py)
} else {
Ok(py.None())
}
}
/// Set the :class:`~tokenizers.decoders.Decoder`
#[setter]
fn set_decoder(&mut self, decoder: PyRef<PyDecoder>) {
self.tokenizer.with_decoder(decoder.clone());
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::models::PyModel;
use crate::normalizers::{PyNormalizer, PyNormalizerTypeWrapper};
use std::sync::{Arc, RwLock};
use tempfile::NamedTempFile;
use tk::normalizers::{Lowercase, NFKC};
#[test]
fn serialize() {
let mut tokenizer = Tokenizer::new(PyModel::from(BPE::default()));
tokenizer.with_normalizer(PyNormalizer::new(PyNormalizerTypeWrapper::Sequence(vec![
Arc::new(RwLock::new(NFKC.into())),
Arc::new(RwLock::new(Lowercase.into())),
])));
let tmp = NamedTempFile::new().unwrap().into_temp_path();
tokenizer.save(&tmp, false).unwrap();
Tokenizer::from_file(&tmp).unwrap();
}
}