use std::collections::{hash_map::DefaultHasher, HashMap};
use std::hash::{Hash, Hasher};

use numpy::{npyffi, PyArray1};
use pyo3::class::basic::CompareOp;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use pyo3::AsPyPointer;
use tk::models::bpe::BPE;
use tk::tokenizer::{
    Model, PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl,
    TruncationDirection, TruncationParams, TruncationStrategy,
};
use tk::utils::iter::ResultShunt;
use tokenizers as tk;

use super::decoders::PyDecoder;
use super::encoding::PyEncoding;
use super::error::{PyError, ToPyResult};
use super::models::PyModel;
use super::normalizers::PyNormalizer;
use super::pre_tokenizers::PyPreTokenizer;
use super::trainers::PyTrainer;
use crate::processors::PyPostProcessor;
use crate::utils::{MaybeSizedIterator, PyBufferedIterator};

/// Represents a token that can be added to a :class:`~tokenizers.Tokenizer`.
/// It can have special options that define the way it should behave.
///
/// Args:
///     content (:obj:`str`): The content of the token
///
///     single_word (:obj:`bool`, defaults to :obj:`False`):
///         Defines whether this token should only match single words. If :obj:`True`, this
///         token will never match inside of a word. For example the token ``ing`` would match
///         on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`.
///         The notion of "`inside of a word`" is defined by the word boundaries pattern in
///         regular expressions (i.e. the token should start and end with word boundaries).
///
///     lstrip (:obj:`bool`, defaults to :obj:`False`):
///         Defines whether this token should strip all potential whitespaces on its left side.
///         If :obj:`True`, this token will greedily match any whitespace on its left. For
///         example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text
///         ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left).
///
///     rstrip (:obj:`bool`, defaults to :obj:`False`):
///         Defines whether this token should strip all potential whitespaces on its right
///         side. If :obj:`True`, this token will greedily match any whitespace on its right.
///         It works just like :obj:`lstrip` but on the right.
///
///     normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`):
///         Defines whether this token should match against the normalized version of the input
///         text. For example, with the added token ``"yesterday"``, and a normalizer in charge of
///         lowercasing the text, the token could be extracted from the input ``"I saw a lion
///         Yesterday"``.
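///
/// Example:
///     An illustrative use from Python (a sketch, assuming the published
///     :obj:`tokenizers` package; the token content is only an example)::
///
///         from tokenizers import AddedToken
///
///         mask = AddedToken("[MASK]", lstrip=True, normalized=False)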
///
#[pyclass(dict, module = "tokenizers", name = "AddedToken")]
pub struct PyAddedToken {
    pub content: String,
    pub is_special_token: bool,
    pub single_word: Option<bool>,
    pub lstrip: Option<bool>,
    pub rstrip: Option<bool>,
    pub normalized: Option<bool>,
}

impl PyAddedToken {
    pub fn from<S: Into<String>>(content: S, is_special_token: Option<bool>) -> Self {
        Self {
            content: content.into(),
            is_special_token: is_special_token.unwrap_or(false),
            single_word: None,
            lstrip: None,
            rstrip: None,
            normalized: None,
        }
    }

    pub fn get_token(&self) -> tk::tokenizer::AddedToken {
        let mut token = tk::AddedToken::from(&self.content, self.is_special_token);

        if let Some(sw) = self.single_word {
            token = token.single_word(sw);
        }
        if let Some(ls) = self.lstrip {
            token = token.lstrip(ls);
        }
        if let Some(rs) = self.rstrip {
            token = token.rstrip(rs);
        }
        if let Some(n) = self.normalized {
            token = token.normalized(n);
        }

        token
    }

    pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> {
        let dict = PyDict::new(py);
        let token = self.get_token();

        dict.set_item("content", token.content)?;
        dict.set_item("single_word", token.single_word)?;
        dict.set_item("lstrip", token.lstrip)?;
        dict.set_item("rstrip", token.rstrip)?;
        dict.set_item("normalized", token.normalized)?;

        Ok(dict)
    }
}

impl From<tk::AddedToken> for PyAddedToken {
    fn from(token: tk::AddedToken) -> Self {
        Self {
            content: token.content,
            single_word: Some(token.single_word),
            lstrip: Some(token.lstrip),
            rstrip: Some(token.rstrip),
            normalized: Some(token.normalized),
            is_special_token: !token.normalized,
        }
    }
}

#[pymethods]
impl PyAddedToken {
    #[new]
    #[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")]
    fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
        let mut token = PyAddedToken::from(content.unwrap_or(""), None);

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "single_word" => token.single_word = Some(value.extract()?),
                    "lstrip" => token.lstrip = Some(value.extract()?),
                    "rstrip" => token.rstrip = Some(value.extract()?),
                    "normalized" => token.normalized = Some(value.extract()?),
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        Ok(token)
    }

    fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> {
        self.as_pydict(py)
    }

    fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
        match state.extract::<&PyDict>(py) {
            Ok(state) => {
                for (key, value) in state {
                    let key: &str = key.extract()?;
                    match key {
                        "content" => self.content = value.extract()?,
                        "single_word" => self.single_word = Some(value.extract()?),
                        "lstrip" => self.lstrip = Some(value.extract()?),
                        "rstrip" => self.rstrip = Some(value.extract()?),
                        "normalized" => self.normalized = Some(value.extract()?),
                        _ => {}
                    }
                }
                Ok(())
            }
            Err(e) => Err(e),
        }
    }

    /// Get the content of this :obj:`AddedToken`
    #[getter]
    fn get_content(&self) -> &str {
        &self.content
    }

    /// Get the value of the :obj:`rstrip` option
    #[getter]
    fn get_rstrip(&self) -> bool {
        self.get_token().rstrip
    }

    /// Get the value of the :obj:`lstrip` option
    #[getter]
    fn get_lstrip(&self) -> bool {
        self.get_token().lstrip
    }

    /// Get the value of the :obj:`single_word` option
    #[getter]
    fn get_single_word(&self) -> bool {
        self.get_token().single_word
    }

    /// Get the value of the :obj:`normalized` option
    #[getter]
    fn get_normalized(&self) -> bool {
        self.get_token().normalized
    }

    fn __str__(&self) -> PyResult<&str> {
        Ok(&self.content)
    }

    fn __repr__(&self) -> PyResult<String> {
        let bool_to_python = |p| match p {
            true => "True",
            false => "False",
        };

        let token = self.get_token();
        Ok(format!(
            "AddedToken(\"{}\", rstrip={}, lstrip={}, single_word={}, normalized={})",
            self.content,
            bool_to_python(token.rstrip),
            bool_to_python(token.lstrip),
            bool_to_python(token.single_word),
            bool_to_python(token.normalized)
        ))
    }

    fn __richcmp__(&self, other: Py<PyAddedToken>, op: CompareOp) -> bool {
        use CompareOp::*;
        Python::with_gil(|py| match op {
            Lt | Le | Gt | Ge => false,
            Eq => self.get_token() == other.borrow(py).get_token(),
            Ne => self.get_token() != other.borrow(py).get_token(),
        })
    }

    fn __hash__(&self) -> u64 {
        let mut hasher = DefaultHasher::new();
        self.get_token().hash(&mut hasher);
        hasher.finish()
    }
}

struct TextInputSequence<'s>(tk::InputSequence<'s>);

impl<'s> FromPyObject<'s> for TextInputSequence<'s> {
    fn extract(ob: &'s PyAny) -> PyResult<Self> {
        let err = exceptions::PyTypeError::new_err("TextInputSequence must be str");
        if let Ok(s) = ob.downcast::<PyString>() {
            Ok(Self(s.to_string_lossy().into()))
        } else {
            Err(err)
        }
    }
}

impl<'s> From<TextInputSequence<'s>> for tk::InputSequence<'s> {
    fn from(s: TextInputSequence<'s>) -> Self {
        s.0
    }
}
// Fast path for numpy inputs: PyArrayUnicode copies the raw UCS-4 data of a 1-dimensional,
// contiguous `dtype='U'` array into owned Rust strings, while PyArrayStr below handles object
// arrays of Python strings element by element.
struct PyArrayUnicode(Vec<String>);

impl FromPyObject<'_> for PyArrayUnicode {
    fn extract(ob: &PyAny) -> PyResult<Self> {
        // SAFETY Making sure the pointer is a valid numpy array requires calling numpy C code
        if unsafe { npyffi::PyArray_Check(ob.py(), ob.as_ptr()) } == 0 {
            return Err(exceptions::PyTypeError::new_err("Expected an np.array"));
        }
        let arr = ob.as_ptr() as *mut npyffi::PyArrayObject;
        // SAFETY Getting all the metadata about the numpy array to check its sanity
        let (type_num, elsize, alignment, data, nd, flags) = unsafe {
            let desc = (*arr).descr;
            (
                (*desc).type_num,
                (*desc).elsize as usize,
                (*desc).alignment as usize,
                (*arr).data,
                (*arr).nd,
                (*arr).flags,
            )
        };
        if nd != 1 {
            return Err(exceptions::PyTypeError::new_err(
                "Expected a 1 dimensional np.array",
            ));
        }
        if flags & (npyffi::NPY_ARRAY_C_CONTIGUOUS | npyffi::NPY_ARRAY_F_CONTIGUOUS) == 0 {
            return Err(exceptions::PyTypeError::new_err(
                "Expected a contiguous np.array",
            ));
        }
        if type_num != npyffi::types::NPY_TYPES::NPY_UNICODE as i32 {
            return Err(exceptions::PyTypeError::new_err(
                "Expected a np.array[dtype='U']",
            ));
        }

        // SAFETY Looking at the raw numpy data to create new owned Rust strings via copies
        // (so it's safe afterwards).
        unsafe {
            let n_elem = *(*arr).dimensions as usize;
            let all_bytes = std::slice::from_raw_parts(data as *const u8, elsize * n_elem);

            let seq = (0..n_elem)
                .map(|i| {
                    let bytes = &all_bytes[i * elsize..(i + 1) * elsize];
                    let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
                        pyo3::ffi::PyUnicode_4BYTE_KIND as _,
                        bytes.as_ptr() as *const _,
                        elsize as isize / alignment as isize,
                    );
                    let py = ob.py();
                    let obj = PyObject::from_owned_ptr(py, unicode);
                    let s = obj.downcast::<PyString>(py)?;
                    Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
                })
                .collect::<PyResult<Vec<_>>>()?;

            Ok(Self(seq))
        }
    }
}

impl From<PyArrayUnicode> for tk::InputSequence<'_> {
    fn from(s: PyArrayUnicode) -> Self {
        s.0.into()
    }
}

struct PyArrayStr(Vec<String>);

impl FromPyObject<'_> for PyArrayStr {
    fn extract(ob: &PyAny) -> PyResult<Self> {
        let array = ob.downcast::<PyArray1<PyObject>>()?;
        let seq = array
            .readonly()
            .as_array()
            .iter()
            .map(|obj| {
                let s = obj.downcast::<PyString>(ob.py())?;
                Ok(s.to_string_lossy().into_owned())
            })
            .collect::<PyResult<Vec<_>>>()?;

        Ok(Self(seq))
    }
}

impl From<PyArrayStr> for tk::InputSequence<'_> {
    fn from(s: PyArrayStr) -> Self {
        s.0.into()
    }
}

struct PreTokenizedInputSequence<'s>(tk::InputSequence<'s>);

impl<'s> FromPyObject<'s> for PreTokenizedInputSequence<'s> {
    fn extract(ob: &'s PyAny) -> PyResult<Self> {
        if let Ok(seq) = ob.extract::<PyArrayUnicode>() {
            return Ok(Self(seq.into()));
        }
        if let Ok(seq) = ob.extract::<PyArrayStr>() {
            return Ok(Self(seq.into()));
        }
        if let Ok(s) = ob.downcast::<PyList>() {
            if let Ok(seq) = s.extract::<Vec<&str>>() {
                return Ok(Self(seq.into()));
            }
        }
        if let Ok(s) = ob.downcast::<PyTuple>() {
            if let Ok(seq) = s.extract::<Vec<&str>>() {
                return Ok(Self(seq.into()));
            }
        }
        Err(exceptions::PyTypeError::new_err(
            "PreTokenizedInputSequence must be Union[List[str], Tuple[str]]",
        ))
    }
}

impl<'s> From<PreTokenizedInputSequence<'s>> for tk::InputSequence<'s> {
    fn from(s: PreTokenizedInputSequence<'s>) -> Self {
        s.0
    }
}

struct TextEncodeInput<'s>(tk::EncodeInput<'s>);

impl<'s> FromPyObject<'s> for TextEncodeInput<'s> {
    fn extract(ob: &'s PyAny) -> PyResult<Self> {
        if let Ok(i) = ob.extract::<TextInputSequence>() {
            return Ok(Self(i.into()));
        }
        if let Ok((i1, i2)) = ob.extract::<(TextInputSequence, TextInputSequence)>() {
            return Ok(Self((i1, i2).into()));
        }
        if let Ok(arr) = ob.extract::<Vec<&PyAny>>() {
            if arr.len() == 2 {
                let first = arr[0].extract::<TextInputSequence>()?;
                let second = arr[1].extract::<TextInputSequence>()?;
                return Ok(Self((first, second).into()));
            }
        }
        Err(exceptions::PyTypeError::new_err(
            "TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]",
        ))
    }
}

impl<'s> From<TextEncodeInput<'s>> for tk::tokenizer::EncodeInput<'s> {
    fn from(i: TextEncodeInput<'s>) -> Self {
        i.0
    }
}

struct PreTokenizedEncodeInput<'s>(tk::EncodeInput<'s>);

impl<'s> FromPyObject<'s> for PreTokenizedEncodeInput<'s> {
    fn extract(ob: &'s PyAny) -> PyResult<Self> {
        if let Ok(i) = ob.extract::<PreTokenizedInputSequence>() {
            return Ok(Self(i.into()));
        }
        if let Ok((i1, i2)) = ob.extract::<(PreTokenizedInputSequence, PreTokenizedInputSequence)>()
        {
            return Ok(Self((i1, i2).into()));
        }
        if let Ok(arr) = ob.extract::<Vec<&PyAny>>() {
            if arr.len() == 2 {
                let first = arr[0].extract::<PreTokenizedInputSequence>()?;
                let second = arr[1].extract::<PreTokenizedInputSequence>()?;
                return Ok(Self((first, second).into()));
            }
        }
        Err(exceptions::PyTypeError::new_err(
            "PreTokenizedEncodeInput must be Union[PreTokenizedInputSequence, \
             Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]]",
        ))
    }
}

impl<'s> From<PreTokenizedEncodeInput<'s>> for tk::tokenizer::EncodeInput<'s> {
    fn from(i: PreTokenizedEncodeInput<'s>) -> Self {
        i.0
    }
}

type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProcessor, PyDecoder>;

/// A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input
/// and outputs an :class:`~tokenizers.Encoding`.
///
/// Args:
///     model (:class:`~tokenizers.models.Model`):
///         The core algorithm that this :obj:`Tokenizer` should be using.
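///
/// Example:
///     An illustrative way to build one from Python (a sketch, assuming the published
///     :obj:`tokenizers` package)::
///
///         from tokenizers import Tokenizer
///         from tokenizers.models import BPE
///
///         tokenizer = Tokenizer(BPE())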
///
#[pyclass(dict, module = "tokenizers", name = "Tokenizer")]
#[derive(Clone)]
pub struct PyTokenizer {
    tokenizer: Tokenizer,
}

impl PyTokenizer {
    fn new(tokenizer: Tokenizer) -> Self {
        PyTokenizer { tokenizer }
    }

    fn from_model(model: PyModel) -> Self {
        PyTokenizer::new(TokenizerImpl::new(model))
    }
}

#[pymethods]
impl PyTokenizer {
    #[new]
    #[pyo3(text_signature = "(self, model)")]
    fn __new__(model: PyRef<PyModel>) -> Self {
        PyTokenizer::from_model(model.clone())
    }

    fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
        let data = serde_json::to_string(&self.tokenizer).map_err(|e| {
            exceptions::PyException::new_err(format!(
                "Error while attempting to pickle Tokenizer: {}",
                e
            ))
        })?;
        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
    }

    fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
        match state.extract::<&PyBytes>(py) {
            Ok(s) => {
                self.tokenizer = serde_json::from_slice(s.as_bytes()).map_err(|e| {
                    exceptions::PyException::new_err(format!(
                        "Error while attempting to unpickle Tokenizer: {}",
                        e
                    ))
                })?;
                Ok(())
            }
            Err(e) => Err(e),
        }
    }

    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
        let model = PyModel::from(BPE::default()).into_py(py);
        PyTuple::new(py, vec![model])
    }

    /// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
    ///
    /// Args:
    ///     json (:obj:`str`):
    ///         A valid JSON string representing a previously serialized
    ///         :class:`~tokenizers.Tokenizer`
    ///
    /// Returns:
    ///     :class:`~tokenizers.Tokenizer`: The new tokenizer
    #[staticmethod]
    #[pyo3(text_signature = "(json)")]
    fn from_str(json: &str) -> PyResult<Self> {
        let tokenizer: PyResult<_> = ToPyResult(json.parse()).into();
        Ok(Self::new(tokenizer?))
    }

    /// Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path.
    ///
    /// Args:
    ///     path (:obj:`str`):
    ///         A path to a local JSON file representing a previously serialized
    ///         :class:`~tokenizers.Tokenizer`
    ///
    /// Returns:
    ///     :class:`~tokenizers.Tokenizer`: The new tokenizer
    #[staticmethod]
    #[pyo3(text_signature = "(path)")]
    fn from_file(path: &str) -> PyResult<Self> {
        let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
        Ok(Self::new(tokenizer?))
    }

    /// Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer.
    ///
    /// Args:
    ///     buffer (:obj:`bytes`):
    ///         A buffer containing a previously serialized :class:`~tokenizers.Tokenizer`
    ///
    /// Returns:
    ///     :class:`~tokenizers.Tokenizer`: The new tokenizer
    #[staticmethod]
    #[pyo3(text_signature = "(buffer)")]
    fn from_buffer(buffer: &PyBytes) -> PyResult<Self> {
        let tokenizer = serde_json::from_slice(buffer.as_bytes()).map_err(|e| {
            exceptions::PyValueError::new_err(format!(
                "Cannot instantiate Tokenizer from buffer: {}",
                e
            ))
        })?;
        Ok(Self { tokenizer })
    }

    /// Instantiate a new :class:`~tokenizers.Tokenizer` from an existing file on the
    /// Hugging Face Hub.
    ///
    /// Args:
    ///     identifier (:obj:`str`):
    ///         The identifier of a Model on the Hugging Face Hub, that contains
    ///         a tokenizer.json file
    ///     revision (:obj:`str`, defaults to `main`):
    ///         A branch or commit id
    ///     auth_token (:obj:`str`, `optional`, defaults to `None`):
    ///         An optional auth token used to access private repositories on the
    ///         Hugging Face Hub
    ///
    /// Returns:
    ///     :class:`~tokenizers.Tokenizer`: The new tokenizer
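    ///
    /// Example:
    ///     An illustrative call from Python (a sketch; the model identifier is only an
    ///     example)::
    ///
    ///         tokenizer = Tokenizer.from_pretrained("bert-base-uncased")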
    #[staticmethod]
    #[pyo3(signature = (identifier, revision = String::from("main"), auth_token = None))]
    #[pyo3(text_signature = "(identifier, revision=\"main\", auth_token=None)")]
    fn from_pretrained(
        identifier: &str,
        revision: String,
        auth_token: Option<String>,
    ) -> PyResult<Self> {
        let params = tk::FromPretrainedParameters {
            revision,
            auth_token,
            user_agent: [("bindings", "Python"), ("version", crate::VERSION)]
                .iter()
                .map(|(k, v)| (k.to_string(), v.to_string()))
                .collect(),
        };

        let tokenizer: PyResult<_> =
            ToPyResult(Tokenizer::from_pretrained(identifier, Some(params))).into();
        Ok(Self::new(tokenizer?))
    }

    /// Gets a serialized string representing this :class:`~tokenizers.Tokenizer`.
    ///
    /// Args:
    ///     pretty (:obj:`bool`, defaults to :obj:`False`):
    ///         Whether the JSON string should be pretty formatted.
    ///
    /// Returns:
    ///     :obj:`str`: A string representing the serialized Tokenizer
    #[pyo3(signature = (pretty = false))]
    #[pyo3(text_signature = "(self, pretty=False)")]
    fn to_str(&self, pretty: bool) -> PyResult<String> {
        ToPyResult(self.tokenizer.to_string(pretty)).into()
    }

    /// Save the :class:`~tokenizers.Tokenizer` to the file at the given path.
    ///
    /// Args:
    ///     path (:obj:`str`):
    ///         A path to a file in which to save the serialized tokenizer.
    ///
    ///     pretty (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether the JSON file should be pretty formatted.
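    ///
    /// Example:
    ///     An illustrative round trip from Python (a sketch; the file name is only an
    ///     example)::
    ///
    ///         tokenizer.save("tokenizer.json")
    ///         tokenizer = Tokenizer.from_file("tokenizer.json")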
    #[pyo3(signature = (path, pretty = true))]
    #[pyo3(text_signature = "(self, path, pretty=True)")]
    fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
        ToPyResult(self.tokenizer.save(path, pretty)).into()
    }

    /// Return the number of special tokens that would be added for single/pair sentences.
    ///
    /// :param is_pair: Boolean indicating if the input would be a single sentence or a pair
    /// :return: The number of special tokens that would be added
    #[pyo3(text_signature = "(self, is_pair)")]
    fn num_special_tokens_to_add(&self, is_pair: bool) -> usize {
        self.tokenizer
            .get_post_processor()
            .map_or(0, |p| p.added_tokens(is_pair))
    }

    /// Get the underlying vocabulary
    ///
    /// Args:
    ///     with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether to include the added tokens
    ///
    /// Returns:
    ///     :obj:`Dict[str, int]`: The vocabulary
    #[pyo3(signature = (with_added_tokens = true))]
    #[pyo3(text_signature = "(self, with_added_tokens=True)")]
    fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32> {
        self.tokenizer.get_vocab(with_added_tokens)
    }

    /// Get the size of the underlying vocabulary
    ///
    /// Args:
    ///     with_added_tokens (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether to include the added tokens
    ///
    /// Returns:
    ///     :obj:`int`: The size of the vocabulary
    #[pyo3(signature = (with_added_tokens = true))]
    #[pyo3(text_signature = "(self, with_added_tokens=True)")]
    fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
        self.tokenizer.get_vocab_size(with_added_tokens)
    }

    /// Enable truncation
    ///
    /// Args:
    ///     max_length (:obj:`int`):
    ///         The max length at which to truncate
    ///
    ///     stride (:obj:`int`, `optional`):
    ///         The length of the previous first sequence to be included in the overflowing
    ///         sequence
    ///
    ///     strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`):
    ///         The strategy used for truncation. Can be one of ``longest_first``, ``only_first`` or
    ///         ``only_second``.
    ///
    ///     direction (:obj:`str`, defaults to :obj:`right`):
    ///         Truncate direction
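    ///
    /// Example:
    ///     An illustrative call from Python (a sketch; the values are only examples)::
    ///
    ///         tokenizer.enable_truncation(max_length=512, stride=32, strategy="longest_first")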
    #[pyo3(signature = (max_length, **kwargs))]
    #[pyo3(
        text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')"
    )]
    fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
        let mut params = TruncationParams {
            max_length,
            ..Default::default()
        };

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "stride" => params.stride = value.extract()?,
                    "strategy" => {
                        let value: &str = value.extract()?;
                        params.strategy = match value {
                            "longest_first" => Ok(TruncationStrategy::LongestFirst),
                            "only_first" => Ok(TruncationStrategy::OnlyFirst),
                            "only_second" => Ok(TruncationStrategy::OnlySecond),
                            _ => Err(PyError(format!(
                                "Unknown `strategy`: `{}`. Use \
                                 one of `longest_first`, `only_first`, or `only_second`",
                                value
                            ))
                            .into_pyerr::<exceptions::PyValueError>()),
                        }?
                    }
                    "direction" => {
                        let value: &str = value.extract()?;
                        params.direction = match value {
                            "left" => Ok(TruncationDirection::Left),
                            "right" => Ok(TruncationDirection::Right),
                            _ => Err(PyError(format!(
                                "Unknown `direction`: `{}`. Use \
                                 one of `left` or `right`.",
                                value
                            ))
                            .into_pyerr::<exceptions::PyValueError>()),
                        }?
                    }
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        if let Err(error_message) = self.tokenizer.with_truncation(Some(params)) {
            return Err(PyError(error_message.to_string()).into_pyerr::<exceptions::PyValueError>());
        }
        Ok(())
    }

    /// Disable truncation
    #[pyo3(text_signature = "(self)")]
    fn no_truncation(&mut self) {
        self.tokenizer
            .with_truncation(None)
            .expect("Failed to set truncation to `None`! This should never happen");
    }

    /// Get the currently set truncation parameters
    ///
    /// `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead`
    ///
    /// Returns:
    ///     (:obj:`dict`, `optional`):
    ///         A dict with the current truncation parameters if truncation is enabled
    #[getter]
    fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
        self.tokenizer.get_truncation().map_or(Ok(None), |params| {
            let dict = PyDict::new(py);

            dict.set_item("max_length", params.max_length)?;
            dict.set_item("stride", params.stride)?;
            dict.set_item("strategy", params.strategy.as_ref())?;
            dict.set_item("direction", params.direction.as_ref())?;

            Ok(Some(dict))
        })
    }

    /// Enable the padding
    ///
    /// Args:
    ///     direction (:obj:`str`, `optional`, defaults to :obj:`right`):
    ///         The direction in which to pad. Can be either ``right`` or ``left``
    ///
    ///     pad_to_multiple_of (:obj:`int`, `optional`):
    ///         If specified, the padding length should always snap to the next multiple of the
    ///         given value. For example if we were going to pad with a length of 250 but
    ///         ``pad_to_multiple_of=8`` then we will pad to 256.
    ///
    ///     pad_id (:obj:`int`, defaults to 0):
    ///         The id to be used when padding
    ///
    ///     pad_type_id (:obj:`int`, defaults to 0):
    ///         The type id to be used when padding
    ///
    ///     pad_token (:obj:`str`, defaults to :obj:`[PAD]`):
    ///         The pad token to be used when padding
    ///
    ///     length (:obj:`int`, `optional`):
    ///         If specified, the length at which to pad. If not specified we pad using the size of
    ///         the longest sequence in a batch.
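    ///
    /// Example:
    ///     An illustrative call from Python (a sketch; the values are only examples)::
    ///
    ///         tokenizer.enable_padding(pad_id=3, pad_token="[PAD]", length=128)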
    #[pyo3(signature = (**kwargs))]
    #[pyo3(
        text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"
    )]
    fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
        let mut params = PaddingParams::default();

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "direction" => {
                        let value: &str = value.extract()?;
                        params.direction = match value {
                            "left" => Ok(PaddingDirection::Left),
                            "right" => Ok(PaddingDirection::Right),
                            other => Err(PyError(format!(
                                "Unknown `direction`: `{}`. Use \
                                 one of `left` or `right`",
                                other
                            ))
                            .into_pyerr::<exceptions::PyValueError>()),
                        }?;
                    }
                    "pad_to_multiple_of" => {
                        if let Some(multiple) = value.extract()? {
                            params.pad_to_multiple_of = multiple;
                        }
                    }
                    "pad_id" => params.pad_id = value.extract()?,
                    "pad_type_id" => params.pad_type_id = value.extract()?,
                    "pad_token" => params.pad_token = value.extract()?,
                    "max_length" => {
                        println!(
                            "enable_padding(max_length=X) is deprecated, \
                             use enable_padding(length=X) instead"
                        );
                        if let Some(l) = value.extract()? {
                            params.strategy = PaddingStrategy::Fixed(l);
                        } else {
                            params.strategy = PaddingStrategy::BatchLongest;
                        }
                    }
                    "length" => {
                        if let Some(l) = value.extract()? {
                            params.strategy = PaddingStrategy::Fixed(l);
                        } else {
                            params.strategy = PaddingStrategy::BatchLongest;
                        }
                    }
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        self.tokenizer.with_padding(Some(params));

        Ok(())
    }

    /// Disable padding
    #[pyo3(text_signature = "(self)")]
    fn no_padding(&mut self) {
        self.tokenizer.with_padding(None);
    }

    /// Get the current padding parameters
    ///
    /// `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead`
    ///
    /// Returns:
    ///     (:obj:`dict`, `optional`):
    ///         A dict with the current padding parameters if padding is enabled
    #[getter]
    fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
        self.tokenizer.get_padding().map_or(Ok(None), |params| {
            let dict = PyDict::new(py);

            dict.set_item(
                "length",
                match params.strategy {
                    tk::PaddingStrategy::BatchLongest => None,
                    tk::PaddingStrategy::Fixed(size) => Some(size),
                },
            )?;
            dict.set_item("pad_to_multiple_of", params.pad_to_multiple_of)?;
            dict.set_item("pad_id", params.pad_id)?;
            dict.set_item("pad_token", &params.pad_token)?;
            dict.set_item("pad_type_id", params.pad_type_id)?;
            dict.set_item("direction", params.direction.as_ref())?;

            Ok(Some(dict))
        })
    }

    /// Encode the given sequence and pair. This method can process raw text sequences
    /// as well as already pre-tokenized sequences.
    ///
    /// Example:
    ///     Here are some examples of the inputs that are accepted::
    ///
    ///         encode("A single sequence")
    ///         encode("A sequence", "And its pair")
    ///         encode([ "A", "pre", "tokenized", "sequence" ], is_pretokenized=True)
    ///         encode(
    ///             [ "A", "pre", "tokenized", "sequence" ], [ "And", "its", "pair" ],
    ///             is_pretokenized=True
    ///         )
    ///
    /// Args:
    ///     sequence (:obj:`~tokenizers.InputSequence`):
    ///         The main input sequence we want to encode. This sequence can be either raw
    ///         text or pre-tokenized, according to the ``is_pretokenized`` argument:
    ///
    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextInputSequence`
    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedInputSequence`
    ///
    ///     pair (:obj:`~tokenizers.InputSequence`, `optional`):
    ///         An optional input sequence. The expected format is the same as for ``sequence``.
    ///
    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
    ///         Whether the input is already pre-tokenized
    ///
    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether to add the special tokens
    ///
    /// Returns:
    ///     :class:`~tokenizers.Encoding`: The encoded result
    ///
    #[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true))]
    #[pyo3(
        text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"
    )]
    fn encode(
        &self,
        sequence: &PyAny,
        pair: Option<&PyAny>,
        is_pretokenized: bool,
        add_special_tokens: bool,
    ) -> PyResult<PyEncoding> {
        let sequence: tk::InputSequence = if is_pretokenized {
            sequence.extract::<PreTokenizedInputSequence>()?.into()
        } else {
            sequence.extract::<TextInputSequence>()?.into()
        };

        let input = match pair {
            Some(pair) => {
                let pair: tk::InputSequence = if is_pretokenized {
                    pair.extract::<PreTokenizedInputSequence>()?.into()
                } else {
                    pair.extract::<TextInputSequence>()?.into()
                };
                tk::EncodeInput::Dual(sequence, pair)
            }
            None => tk::EncodeInput::Single(sequence),
        };

        ToPyResult(
            self.tokenizer
                .encode_char_offsets(input, add_special_tokens)
                .map(|e| e.into()),
        )
        .into()
    }

    /// Encode the given batch of inputs. This method accepts both raw text sequences
    /// as well as already pre-tokenized sequences.
    ///
    /// Example:
    ///     Here are some examples of the inputs that are accepted::
    ///
    ///         encode_batch([
    ///             "A single sequence",
    ///             ("A tuple with a sequence", "And its pair"),
    ///             [ "A", "pre", "tokenized", "sequence" ],
    ///             ([ "A", "pre", "tokenized", "sequence" ], "And its pair")
    ///         ])
    ///
    /// Args:
    ///     input (A :obj:`List`/:obj:`Tuple` of :obj:`~tokenizers.EncodeInput`):
    ///         A list of single sequences or pair sequences to encode. Each sequence
    ///         can be either raw text or pre-tokenized, according to the ``is_pretokenized``
    ///         argument:
    ///
    ///         - If ``is_pretokenized=False``: :class:`~tokenizers.TextEncodeInput`
    ///         - If ``is_pretokenized=True``: :class:`~tokenizers.PreTokenizedEncodeInput`
    ///
    ///     is_pretokenized (:obj:`bool`, defaults to :obj:`False`):
    ///         Whether the input is already pre-tokenized
    ///
    ///     add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether to add the special tokens
    ///
    /// Returns:
    ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
    ///
    #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
    #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
    fn encode_batch(
        &self,
        py: Python<'_>,
        input: Vec<&PyAny>,
        is_pretokenized: bool,
        add_special_tokens: bool,
    ) -> PyResult<Vec<PyEncoding>> {
        let input: Vec<tk::EncodeInput> = input
            .into_iter()
            .map(|o| {
                let input: tk::EncodeInput = if is_pretokenized {
                    o.extract::<PreTokenizedEncodeInput>()?.into()
                } else {
                    o.extract::<TextEncodeInput>()?.into()
                };
                Ok(input)
            })
            .collect::<PyResult<Vec<tk::EncodeInput>>>()?;
        py.allow_threads(|| {
            ToPyResult(
                self.tokenizer
                    .encode_batch_char_offsets(input, add_special_tokens)
                    .map(|encodings| encodings.into_iter().map(|e| e.into()).collect()),
            )
            .into()
        })
    }

    /// Decode the given list of ids back to a string
    ///
    /// This is used to decode anything coming back from a Language Model
    ///
    /// Args:
    ///     ids (A :obj:`List/Tuple` of :obj:`int`):
    ///         The list of ids that we want to decode
    ///
    ///     skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether the special tokens should be removed from the decoded string
    ///
    /// Returns:
    ///     :obj:`str`: The decoded string
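    ///
    /// Example:
    ///     An illustrative round trip from Python (a sketch; the input text is only an
    ///     example)::
    ///
    ///         output = tokenizer.encode("Hello, y'all!")
    ///         decoded = tokenizer.decode(output.ids, skip_special_tokens=True)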
    #[pyo3(signature = (ids, skip_special_tokens = true))]
    #[pyo3(text_signature = "(self, ids, skip_special_tokens=True)")]
    fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
        ToPyResult(self.tokenizer.decode(&ids, skip_special_tokens)).into()
    }

    /// Decode a batch of ids back to their corresponding string
    ///
    /// Args:
    ///     sequences (:obj:`List` of :obj:`List[int]`):
    ///         The batch of sequences we want to decode
    ///
    ///     skip_special_tokens (:obj:`bool`, defaults to :obj:`True`):
    ///         Whether the special tokens should be removed from the decoded strings
    ///
    /// Returns:
    ///     :obj:`List[str]`: A list of decoded strings
    #[pyo3(signature = (sequences, skip_special_tokens = true))]
    #[pyo3(text_signature = "(self, sequences, skip_special_tokens=True)")]
    fn decode_batch(
        &self,
        py: Python<'_>,
        sequences: Vec<Vec<u32>>,
        skip_special_tokens: bool,
    ) -> PyResult<Vec<String>> {
        py.allow_threads(|| {
            let slices = sequences.iter().map(|v| &v[..]).collect::<Vec<&[u32]>>();
            ToPyResult(self.tokenizer.decode_batch(&slices, skip_special_tokens)).into()
        })
    }

    /// Convert the given token to its corresponding id if it exists
    ///
    /// Args:
    ///     token (:obj:`str`):
    ///         The token to convert
    ///
    /// Returns:
    ///     :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary
    #[pyo3(text_signature = "(self, token)")]
    fn token_to_id(&self, token: &str) -> Option<u32> {
        self.tokenizer.token_to_id(token)
    }

    /// Convert the given id to its corresponding token if it exists
    ///
    /// Args:
    ///     id (:obj:`int`):
    ///         The id to convert
    ///
    /// Returns:
    ///     :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary
    #[pyo3(text_signature = "(self, id)")]
    fn id_to_token(&self, id: u32) -> Option<String> {
        self.tokenizer.id_to_token(id)
    }

    /// Add the given tokens to the vocabulary
    ///
    /// The given tokens are added only if they don't already exist in the vocabulary.
    /// Each token is then attributed a new id.
    ///
    /// Args:
    ///     tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
    ///         The list of tokens we want to add to the vocabulary. Each token can be either a
    ///         string or an instance of :class:`~tokenizers.AddedToken` for more customization.
    ///
    /// Returns:
    ///     :obj:`int`: The number of tokens that were created in the vocabulary
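    ///
    /// Example:
    ///     An illustrative call from Python (a sketch; the token values are only examples)::
    ///
    ///         from tokenizers import AddedToken
    ///
    ///         tokenizer.add_tokens(["new_token", AddedToken("other_token", single_word=True)])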
    #[pyo3(text_signature = "(self, tokens)")]
    fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
        let tokens = tokens
            .into_iter()
            .map(|token| {
                if let Ok(content) = token.extract::<String>() {
                    Ok(PyAddedToken::from(content, Some(false)).get_token())
                } else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
                    token.is_special_token = false;
                    Ok(token.get_token())
                } else {
                    Err(exceptions::PyTypeError::new_err(
                        "Input must be a List[Union[str, AddedToken]]",
                    ))
                }
            })
            .collect::<PyResult<Vec<_>>>()?;

        Ok(self.tokenizer.add_tokens(&tokens))
    }

    /// Add the given special tokens to the Tokenizer.
    ///
    /// If these tokens are already part of the vocabulary, it just lets the Tokenizer know about
    /// them. If they don't exist, the Tokenizer creates them, giving them a new id.
    ///
    /// These special tokens will never be processed by the model (i.e. won't be split into
    /// multiple tokens), and they can be removed from the output when decoding.
    ///
    /// Args:
    ///     tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`):
    ///         The list of special tokens we want to add to the vocabulary. Each token can either
    ///         be a string or an instance of :class:`~tokenizers.AddedToken` for more
    ///         customization.
    ///
    /// Returns:
    ///     :obj:`int`: The number of tokens that were created in the vocabulary
    #[pyo3(text_signature = "(self, tokens)")]
    fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
        let tokens = tokens
            .into_iter()
            .map(|token| {
                if let Ok(content) = token.extract::<String>() {
                    Ok(tk::tokenizer::AddedToken::from(content, true))
                } else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
                    token.is_special_token = true;
                    Ok(token.get_token())
                } else {
                    Err(exceptions::PyTypeError::new_err(
                        "Input must be a List[Union[str, AddedToken]]",
                    ))
                }
            })
            .collect::<PyResult<Vec<_>>>()?;

        Ok(self.tokenizer.add_special_tokens(&tokens))
    }

    /// Train the Tokenizer using the given files.
    ///
    /// Reads the files line by line, while keeping all the whitespace, even new lines.
    /// If you want to train from data stored in memory, you can check
    /// :meth:`~tokenizers.Tokenizer.train_from_iterator`
    ///
    /// Args:
    ///     files (:obj:`List[str]`):
    ///         A list of paths to the files that we should use for training
    ///
    ///     trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
    ///         An optional trainer that should be used to train our Model
    #[pyo3(signature = (files, trainer = None))]
    #[pyo3(text_signature = "(self, files, trainer = None)")]
    fn train(&mut self, files: Vec<String>, trainer: Option<&mut PyTrainer>) -> PyResult<()> {
        let mut trainer =
            trainer.map_or_else(|| self.tokenizer.get_model().get_trainer(), |t| t.clone());
        Python::with_gil(|py| {
            py.allow_threads(|| {
                ToPyResult(
                    self.tokenizer
                        .train_from_files(&mut trainer, files)
                        .map(|_| {}),
                )
                .into()
            })
        })
    }

    /// Train the Tokenizer using the provided iterator.
    ///
    /// You can provide anything that is a Python Iterator
    ///
    ///     * A list of sequences :obj:`List[str]`
    ///     * A generator that yields :obj:`str` or :obj:`List[str]`
    ///     * A Numpy array of strings
    ///     * ...
    ///
    /// Args:
    ///     iterator (:obj:`Iterator`):
    ///         Any iterator over strings or list of strings
    ///
    ///     trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
    ///         An optional trainer that should be used to train our Model
    ///
    ///     length (:obj:`int`, `optional`):
    ///         The total number of sequences in the iterator. This is used to
    ///         provide meaningful progress tracking
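    ///
    /// Example:
    ///     An illustrative call from Python (a sketch; the data is only an example and the
    ///     model's default trainer is used)::
    ///
    ///         data = ["a first sentence", "another sentence", "and a last one"]
    ///         tokenizer.train_from_iterator(data, length=len(data))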
    #[pyo3(signature = (iterator, trainer = None, length = None))]
    #[pyo3(text_signature = "(self, iterator, trainer=None, length=None)")]
    fn train_from_iterator(
        &mut self,
        py: Python,
        iterator: &PyAny,
        trainer: Option<&mut PyTrainer>,
        length: Option<usize>,
    ) -> PyResult<()> {
        let mut trainer =
            trainer.map_or_else(|| self.tokenizer.get_model().get_trainer(), |t| t.clone());

        let buffered_iter = PyBufferedIterator::new(
            iterator,
            |element| {
                // Each element of the iterator can either be:
                // - An iterator, to allow batching
                // - A string
                if let Ok(s) = element.downcast::<PyString>() {
                    itertools::Either::Right(std::iter::once(s.to_str().map(|s| s.to_owned())))
                } else {
                    match element.iter() {
                        Ok(iter) => itertools::Either::Left(
                            iter.map(|i| i?.extract::<String>())
                                .collect::<Vec<_>>()
                                .into_iter(),
                        ),
                        Err(e) => itertools::Either::Right(std::iter::once(Err(e))),
                    }
                }
            },
            256,
        )?;

        py.allow_threads(|| {
            ResultShunt::process(buffered_iter, |iter| {
                self.tokenizer
                    .train(&mut trainer, MaybeSizedIterator::new(iter, length))
                    .map(|_| {})
                    .map_err(|e| exceptions::PyException::new_err(e.to_string()))
            })?
        })
    }

    /// Apply all the post-processing steps to the given encodings.
    ///
    /// The various steps are:
    ///
    ///     1. Truncate according to the set truncation params (provided with
    ///        :meth:`~tokenizers.Tokenizer.enable_truncation`)
    ///     2. Apply the :class:`~tokenizers.processors.PostProcessor`
    ///     3. Pad according to the set padding params (provided with
    ///        :meth:`~tokenizers.Tokenizer.enable_padding`)
    ///
    /// Args:
    ///     encoding (:class:`~tokenizers.Encoding`):
    ///         The :class:`~tokenizers.Encoding` corresponding to the main sequence.
    ///
    ///     pair (:class:`~tokenizers.Encoding`, `optional`):
    ///         An optional :class:`~tokenizers.Encoding` corresponding to the pair sequence.
    ///
    ///     add_special_tokens (:obj:`bool`):
    ///         Whether to add the special tokens
    ///
    /// Returns:
    ///     :class:`~tokenizers.Encoding`: The final post-processed encoding
    #[pyo3(signature = (encoding, pair = None, add_special_tokens = true))]
    #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")]
    fn post_process(
        &self,
        encoding: &PyEncoding,
        pair: Option<&PyEncoding>,
        add_special_tokens: bool,
    ) -> PyResult<PyEncoding> {
        ToPyResult(
            self.tokenizer
                .post_process(
                    encoding.encoding.clone(),
                    pair.map(|p| p.encoding.clone()),
                    add_special_tokens,
                )
                .map(|e| e.into()),
        )
        .into()
    }

    /// The :class:`~tokenizers.models.Model` in use by the Tokenizer
    #[getter]
    fn get_model(&self, py: Python<'_>) -> PyResult<PyObject> {
        self.tokenizer.get_model().get_as_subtype(py)
    }

    /// Set the :class:`~tokenizers.models.Model`
    #[setter]
    fn set_model(&mut self, model: PyRef<PyModel>) {
        self.tokenizer.with_model(model.clone());
    }

    /// The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer
    #[getter]
    fn get_normalizer(&self, py: Python<'_>) -> PyResult<PyObject> {
        if let Some(n) = self.tokenizer.get_normalizer() {
            n.get_as_subtype(py)
        } else {
            Ok(py.None())
        }
    }

    /// Set the :class:`~tokenizers.normalizers.Normalizer`
    #[setter]
    fn set_normalizer(&mut self, normalizer: PyRef<PyNormalizer>) {
        self.tokenizer.with_normalizer(normalizer.clone());
    }

    /// The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer
    #[getter]
    fn get_pre_tokenizer(&self, py: Python<'_>) -> PyResult<PyObject> {
        if let Some(pt) = self.tokenizer.get_pre_tokenizer() {
            pt.get_as_subtype(py)
        } else {
            Ok(py.None())
        }
    }

    /// Set the :class:`~tokenizers.pre_tokenizers.PreTokenizer`
    #[setter]
    fn set_pre_tokenizer(&mut self, pretok: PyRef<PyPreTokenizer>) {
        self.tokenizer.with_pre_tokenizer(pretok.clone());
    }

    /// The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer
    #[getter]
    fn get_post_processor(&self, py: Python<'_>) -> PyResult<PyObject> {
        if let Some(n) = self.tokenizer.get_post_processor() {
            n.get_as_subtype(py)
        } else {
            Ok(py.None())
        }
    }

    /// Set the :class:`~tokenizers.processors.PostProcessor`
    #[setter]
    fn set_post_processor(&mut self, processor: PyRef<PyPostProcessor>) {
        self.tokenizer.with_post_processor(processor.clone());
    }

    /// The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer
    #[getter]
    fn get_decoder(&self, py: Python<'_>) -> PyResult<PyObject> {
        if let Some(dec) = self.tokenizer.get_decoder() {
            dec.get_as_subtype(py)
        } else {
            Ok(py.None())
        }
    }

    /// Set the :class:`~tokenizers.decoders.Decoder`
    #[setter]
    fn set_decoder(&mut self, decoder: PyRef<PyDecoder>) {
        self.tokenizer.with_decoder(decoder.clone());
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::models::PyModel;
    use crate::normalizers::{PyNormalizer, PyNormalizerTypeWrapper};
    use std::sync::{Arc, RwLock};
    use tempfile::NamedTempFile;
    use tk::normalizers::{Lowercase, NFKC};

    #[test]
    fn serialize() {
        let mut tokenizer = Tokenizer::new(PyModel::from(BPE::default()));
        tokenizer.with_normalizer(PyNormalizer::new(PyNormalizerTypeWrapper::Sequence(vec![
            Arc::new(RwLock::new(NFKC.into())),
            Arc::new(RwLock::new(Lowercase.into())),
        ])));

        let tmp = NamedTempFile::new().unwrap().into_temp_path();
        tokenizer.save(&tmp, false).unwrap();

        Tokenizer::from_file(&tmp).unwrap();
    }
}