diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index a18b4b28..079fbc52 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -9,24 +9,24 @@ name = "tokenizers" crate-type = ["cdylib"] [dependencies] -rayon = "1.8" +rayon = "1.10" serde = { version = "1.0", features = [ "rc", "derive" ]} serde_json = "1.0" libc = "0.2" -env_logger = "0.10.0" -pyo3 = { version = "0.20" } -numpy = "0.20.0" +env_logger = "0.11" +pyo3 = { version = "0.21" } +numpy = "0.21" ndarray = "0.15" onig = { version = "6.4", default-features = false } -itertools = "0.11" +itertools = "0.12" [dependencies.tokenizers] version = "0.16.0-dev.0" path = "../../tokenizers" [dev-dependencies] -tempfile = "3.8" -pyo3 = { version = "0.20", features = ["auto-initialize"] } +tempfile = "3.10" +pyo3 = { version = "0.21", features = ["auto-initialize"] } [features] defaut = ["pyo3/extension-module"] diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index f3d36532..1344aa3f 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -1,7 +1,6 @@ use std::sync::{Arc, RwLock}; use crate::pre_tokenizers::from_string; -use crate::utils::PyChar; use crate::utils::PyPattern; use pyo3::exceptions; use pyo3::prelude::*; @@ -85,7 +84,7 @@ impl PyDecoder { e )) })?; - Ok(PyBytes::new(py, data.as_bytes()).to_object(py)) + Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) } fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { @@ -161,7 +160,7 @@ pub struct PyByteLevelDec {} impl PyByteLevelDec { #[new] #[pyo3(signature = (**_kwargs), text_signature = "(self)")] - fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) { + fn new(_kwargs: Option<&Bound<'_, PyDict>>) -> (Self, PyDecoder) { (PyByteLevelDec {}, ByteLevel::default().into()) } } @@ -318,8 +317,8 @@ impl PyMetaspaceDec { } #[setter] - fn set_replacement(self_: PyRef, replacement: PyChar) { - setter!(self_, Metaspace, @set_replacement, replacement.0); + fn set_replacement(self_: PyRef, replacement: char) { + setter!(self_, Metaspace, @set_replacement, replacement); } #[getter] @@ -352,16 +351,12 @@ impl PyMetaspaceDec { } #[new] - #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")] - fn new( - replacement: PyChar, - prepend_scheme: String, - split: bool, - ) -> PyResult<(Self, PyDecoder)> { + #[pyo3(signature = (replacement = '▁', prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")] + fn new(replacement: char, prepend_scheme: String, split: bool) -> PyResult<(Self, PyDecoder)> { let prepend_scheme = from_string(prepend_scheme)?; Ok(( PyMetaspaceDec {}, - Metaspace::new(replacement.0, prepend_scheme, split).into(), + Metaspace::new(replacement, prepend_scheme, split).into(), )) } } @@ -463,7 +458,7 @@ pub struct PySequenceDecoder {} impl PySequenceDecoder { #[new] #[pyo3(signature = (decoders_py), text_signature = "(self, decoders)")] - fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> { + fn new(decoders_py: &Bound<'_, PyList>) -> PyResult<(Self, PyDecoder)> { let mut decoders: Vec = Vec::with_capacity(decoders_py.len()); for decoder_py in decoders_py.iter() { let decoder: PyRef = decoder_py.extract()?; @@ -476,8 +471,8 @@ impl PySequenceDecoder { Ok((PySequenceDecoder {}, Sequence::new(decoders).into())) } - fn 
__getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple { - PyTuple::new(py, [PyList::empty(py)]) + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { + PyTuple::new_bound(py, [PyList::empty_bound(py)]) } } @@ -497,7 +492,7 @@ impl Decoder for CustomDecoder { Python::with_gil(|py| { let decoded = self .inner - .call_method(py, "decode", (tokens,), None)? + .call_method_bound(py, "decode", (tokens,), None)? .extract(py)?; Ok(decoded) }) @@ -507,7 +502,7 @@ impl Decoder for CustomDecoder { Python::with_gil(|py| { let decoded = self .inner - .call_method(py, "decode_chain", (tokens,), None)? + .call_method_bound(py, "decode_chain", (tokens,), None)? .extract(py)?; Ok(decoded) }) @@ -572,7 +567,7 @@ impl Decoder for PyDecoderWrapper { /// Decoders Module #[pymodule] -pub fn decoders(_py: Python, m: &PyModule) -> PyResult<()> { +pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -602,7 +597,7 @@ mod test { Python::with_gil(|py| { let py_dec = PyDecoder::new(Metaspace::default().into()); let py_meta = py_dec.get_as_subtype(py).unwrap(); - assert_eq!("Metaspace", py_meta.as_ref(py).get_type().name().unwrap()); + assert_eq!("Metaspace", py_meta.bind(py).get_type().qualname().unwrap()); }) } diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs index 1ee3aa06..3ca48d56 100644 --- a/bindings/python/src/encoding.rs +++ b/bindings/python/src/encoding.rs @@ -37,7 +37,7 @@ impl PyEncoding { e )) })?; - Ok(PyBytes::new(py, data.as_bytes()).to_object(py)) + Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) } fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { @@ -391,10 +391,10 @@ impl PyEncoding { #[pyo3( text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')" )] - fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> { + fn pad(&mut self, length: usize, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<()> { let mut pad_id = 0; let mut pad_type_id = 0; - let mut pad_token = "[PAD]"; + let mut pad_token = "[PAD]".to_string(); let mut direction = PaddingDirection::Right; if let Some(kwargs) = kwargs { @@ -422,7 +422,7 @@ impl PyEncoding { } } self.encoding - .pad(length, pad_id, pad_type_id, pad_token, direction); + .pad(length, pad_id, pad_type_id, &pad_token, direction); Ok(()) } diff --git a/bindings/python/src/error.rs b/bindings/python/src/error.rs index 307fed28..888c0d44 100644 --- a/bindings/python/src/error.rs +++ b/bindings/python/src/error.rs @@ -35,7 +35,7 @@ impl ToPyResult { } pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> { - let deprecation_warning = py.import("builtins")?.getattr("DeprecationWarning")?; + let deprecation_warning = py.import_bound("builtins")?.getattr("DeprecationWarning")?; let full_message = format!("Deprecated in {}: {}", version, message); - pyo3::PyErr::warn(py, deprecation_warning, &full_message, 0) + pyo3::PyErr::warn_bound(py, &deprecation_warning, &full_message, 0) } diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 8625944e..3f1e7136 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -47,7 +47,7 @@ extern "C" fn child_after_fork() { /// Tokenizers Module #[pymodule] -pub fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> { +pub fn tokenizers(m: &Bound<'_, PyModule>) -> PyResult<()> { let _ = env_logger::try_init_from_env("TOKENIZERS_LOG"); // Register the fork 
callback diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index 8fce02c9..846bb61c 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -105,7 +105,7 @@ impl PyModel { e )) })?; - Ok(PyBytes::new(py, data.as_bytes()).to_object(py)) + Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) } fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { @@ -260,7 +260,10 @@ impl PyModel { pub struct PyBPE {} impl PyBPE { - fn with_builder(mut builder: BpeBuilder, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> { + fn with_builder( + mut builder: BpeBuilder, + kwargs: Option<&Bound<'_, PyDict>>, + ) -> PyResult<(Self, PyModel)> { if let Some(kwargs) = kwargs { for (key, value) in kwargs { let key: &str = key.extract()?; @@ -321,14 +324,14 @@ macro_rules! setter { } #[derive(FromPyObject)] -enum PyVocab<'a> { +enum PyVocab { Vocab(Vocab), - Filename(&'a str), + Filename(String), } #[derive(FromPyObject)] -enum PyMerges<'a> { +enum PyMerges { Merges(Merges), - Filename(&'a str), + Filename(String), } #[pymethods] @@ -417,7 +420,7 @@ impl PyBPE { py: Python<'_>, vocab: Option, merges: Option, - kwargs: Option<&PyDict>, + kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult<(Self, PyModel)> { if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) { return Err(exceptions::PyValueError::new_err( @@ -502,11 +505,11 @@ impl PyBPE { #[pyo3(signature = (vocab, merges, **kwargs))] #[pyo3(text_signature = "(cls, vocab, merge, **kwargs)")] fn from_file( - _cls: &PyType, + _cls: &Bound<'_, PyType>, py: Python, vocab: &str, merges: &str, - kwargs: Option<&PyDict>, + kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult> { let (vocab, merges) = BPE::read_file(vocab, merges).map_err(|e| { exceptions::PyException::new_err(format!("Error while reading BPE files: {}", e)) @@ -540,7 +543,7 @@ pub struct PyWordPiece {} impl PyWordPiece { fn with_builder( mut builder: WordPieceBuilder, - kwargs: Option<&PyDict>, + kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult<(Self, PyModel)> { if let Some(kwargs) = kwargs { for (key, val) in kwargs { @@ -612,7 +615,7 @@ impl PyWordPiece { fn new( py: Python<'_>, vocab: Option, - kwargs: Option<&PyDict>, + kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult<(Self, PyModel)> { let mut builder = WordPiece::builder(); @@ -677,10 +680,10 @@ impl PyWordPiece { #[pyo3(signature = (vocab, **kwargs))] #[pyo3(text_signature = "(vocab, **kwargs)")] fn from_file( - _cls: &PyType, + _cls: &Bound<'_, PyType>, py: Python, vocab: &str, - kwargs: Option<&PyDict>, + kwargs: Option<&Bound<'_, PyDict>>, ) -> PyResult> { let vocab = WordPiece::read_file(vocab).map_err(|e| { exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e)) @@ -796,7 +799,7 @@ impl PyWordLevel { #[pyo3(signature = (vocab, unk_token = None))] #[pyo3(text_signature = "(vocab, unk_token)")] fn from_file( - _cls: &PyType, + _cls: &Bound<'_, PyType>, py: Python, vocab: &str, unk_token: Option, @@ -849,7 +852,7 @@ impl PyUnigram { /// Models Module #[pymodule] -pub fn models(_py: Python, m: &PyModule) -> PyResult<()> { +pub fn models(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -870,7 +873,7 @@ mod test { Python::with_gil(|py| { let py_model = PyModel::from(BPE::default()); let py_bpe = py_model.get_as_subtype(py).unwrap(); - assert_eq!("BPE", py_bpe.as_ref(py).get_type().name().unwrap()); + assert_eq!("BPE", 
py_bpe.bind(py).get_type().qualname().unwrap()); }) } diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs index 954ee5aa..645852fa 100644 --- a/bindings/python/src/normalizers.rs +++ b/bindings/python/src/normalizers.rs @@ -113,7 +113,7 @@ impl PyNormalizer { e )) })?; - Ok(PyBytes::new(py, data.as_bytes()).to_object(py)) + Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) } fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { @@ -345,7 +345,7 @@ pub struct PySequence {} impl PySequence { #[new] #[pyo3(text_signature = None)] - fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> { + fn new(normalizers: &Bound<'_, PyList>) -> PyResult<(Self, PyNormalizer)> { let mut sequence = Vec::with_capacity(normalizers.len()); for n in normalizers.iter() { let normalizer: PyRef = n.extract()?; @@ -360,8 +360,8 @@ impl PySequence { )) } - fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple { - PyTuple::new(py, [PyList::empty(py)]) + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { + PyTuple::new_bound(py, [PyList::empty_bound(py)]) } fn __len__(&self) -> usize { @@ -467,11 +467,11 @@ pub struct PyPrecompiled {} impl PyPrecompiled { #[new] #[pyo3(text_signature = "(self, precompiled_charsmap)")] - fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> { - let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?; + fn new(precompiled_charsmap: Vec) -> PyResult<(Self, PyNormalizer)> { + // let precompiled_charsmap: Vec = FromPyObject::extract(py_precompiled_charsmap)?; Ok(( PyPrecompiled {}, - Precompiled::from(precompiled_charsmap) + Precompiled::from(&precompiled_charsmap) .map_err(|e| { exceptions::PyException::new_err(format!( "Error while attempting to build Precompiled normalizer: {}", @@ -512,7 +512,7 @@ impl tk::tokenizer::Normalizer for CustomNormalizer { fn normalize(&self, normalized: &mut NormalizedString) -> tk::Result<()> { Python::with_gil(|py| { let normalized = PyNormalizedStringRefMut::new(normalized); - let py_normalized = self.inner.as_ref(py); + let py_normalized = self.inner.bind(py); py_normalized.call_method("normalize", (normalized.get(),), None)?; Ok(()) }) @@ -635,7 +635,7 @@ impl Normalizer for PyNormalizerWrapper { /// Normalizers Module #[pymodule] -pub fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> { +pub fn normalizers(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -667,7 +667,7 @@ mod test { Python::with_gil(|py| { let py_norm = PyNormalizer::new(NFC.into()); let py_nfc = py_norm.get_as_subtype(py).unwrap(); - assert_eq!("NFC", py_nfc.as_ref(py).get_type().name().unwrap()); + assert_eq!("NFC", py_nfc.bind(py).get_type().qualname().unwrap()); }) } diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 59cc394d..d6d332f0 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -118,7 +118,7 @@ impl PyPreTokenizer { e )) })?; - Ok(PyBytes::new(py, data.as_bytes()).to_object(py)) + Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) } fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { @@ -263,7 +263,7 @@ impl PyByteLevel { fn new( add_prefix_space: bool, use_regex: bool, - _kwargs: Option<&PyDict>, + _kwargs: Option<&Bound<'_, PyDict>>, ) -> (Self, PyPreTokenizer) { ( PyByteLevel {}, @@ -352,8 +352,8 @@ impl PySplit { )) } - fn __getnewargs__<'p>(&self, py: 
Python<'p>) -> &'p PyTuple { - PyTuple::new(py, [" ", "removed"]) + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { + PyTuple::new_bound(py, [" ", "removed"]) } } @@ -372,21 +372,21 @@ impl PyCharDelimiterSplit { } #[setter] - fn set_delimiter(self_: PyRef, delimiter: PyChar) { - setter!(self_, Delimiter, delimiter, delimiter.0); + fn set_delimiter(self_: PyRef, delimiter: char) { + setter!(self_, Delimiter, delimiter, delimiter); } #[new] #[pyo3(text_signature = None)] - pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> { + pub fn new(delimiter: char) -> PyResult<(Self, PyPreTokenizer)> { Ok(( PyCharDelimiterSplit {}, - CharDelimiterSplit::new(delimiter.0).into(), + CharDelimiterSplit::new(delimiter).into(), )) } - fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple { - PyTuple::new(py, [" "]) + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { + PyTuple::new_bound(py, [" "]) } } @@ -430,7 +430,7 @@ pub struct PySequence {} impl PySequence { #[new] #[pyo3(text_signature = "(self, pretokenizers)")] - fn new(pre_tokenizers: &PyList) -> PyResult<(Self, PyPreTokenizer)> { + fn new(pre_tokenizers: &Bound<'_, PyList>) -> PyResult<(Self, PyPreTokenizer)> { let mut sequence = Vec::with_capacity(pre_tokenizers.len()); for n in pre_tokenizers.iter() { let pretokenizer: PyRef = n.extract()?; @@ -447,8 +447,8 @@ impl PySequence { )) } - fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple { - PyTuple::new(py, [PyList::empty(py)]) + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { + PyTuple::new_bound(py, [PyList::empty_bound(py)]) } } @@ -490,8 +490,8 @@ impl PyMetaspace { } #[setter] - fn set_replacement(self_: PyRef, replacement: PyChar) { - setter!(self_, Metaspace, @set_replacement, replacement.0); + fn set_replacement(self_: PyRef, replacement: char) { + setter!(self_, Metaspace, @set_replacement, replacement); } #[getter] @@ -524,15 +524,15 @@ impl PyMetaspace { } #[new] - #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")] + #[pyo3(signature = (replacement = '▁', prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")] fn new( - replacement: PyChar, + replacement: char, prepend_scheme: String, split: bool, ) -> PyResult<(Self, PyPreTokenizer)> { // Create a new Metaspace instance let prepend_scheme = from_string(prepend_scheme)?; - let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme, split); + let new_instance: Metaspace = Metaspace::new(replacement, prepend_scheme, split); Ok((PyMetaspace {}, new_instance.into())) } } @@ -599,7 +599,7 @@ impl tk::tokenizer::PreTokenizer for CustomPreTokenizer { fn pre_tokenize(&self, sentence: &mut PreTokenizedString) -> tk::Result<()> { Python::with_gil(|py| { let pretok = PyPreTokenizedStringRefMut::new(sentence); - let py_pretok = self.inner.as_ref(py); + let py_pretok = self.inner.bind(py); py_pretok.call_method("pre_tokenize", (pretok.get(),), None)?; Ok(()) }) @@ -722,7 +722,7 @@ impl PreTokenizer for PyPreTokenizerWrapper { /// PreTokenizers Module #[pymodule] -pub fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> { +pub fn pre_tokenizers(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -754,7 +754,7 @@ mod test { Python::with_gil(|py| { let py_norm = 
PyPreTokenizer::new(Whitespace {}.into()); let py_wsp = py_norm.get_as_subtype(py).unwrap(); - assert_eq!("Whitespace", py_wsp.as_ref(py).get_type().name().unwrap()); + assert_eq!("Whitespace", py_wsp.bind(py).get_type().qualname().unwrap()); }) } diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index 55a69287..c46d8ea4 100644 --- a/bindings/python/src/processors.rs +++ b/bindings/python/src/processors.rs @@ -78,7 +78,7 @@ impl PyPostProcessor { e )) })?; - Ok(PyBytes::new(py, data.as_bytes()).to_object(py)) + Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) } fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { @@ -166,8 +166,8 @@ impl PyBertProcessing { ) } - fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple { - PyTuple::new(py, [("", 0), ("", 0)]) + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { + PyTuple::new_bound(py, [("", 0), ("", 0)]) } } @@ -216,8 +216,8 @@ impl PyRobertaProcessing { ) } - fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple { - PyTuple::new(py, [("", 0), ("", 0)]) + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { + PyTuple::new_bound(py, [("", 0), ("", 0)]) } } @@ -235,7 +235,10 @@ pub struct PyByteLevel {} impl PyByteLevel { #[new] #[pyo3(signature = (trim_offsets = None, **_kwargs), text_signature = "(self, trim_offsets=True)")] - fn new(trim_offsets: Option, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) { + fn new( + trim_offsets: Option, + _kwargs: Option<&Bound<'_, PyDict>>, + ) -> (Self, PyPostProcessor) { let mut byte_level = ByteLevel::default(); if let Some(to) = trim_offsets { @@ -304,7 +307,7 @@ impl FromPyObject<'_> for PyTemplate { Ok(Self( s.try_into().map_err(exceptions::PyValueError::new_err)?, )) - } else if let Ok(s) = ob.extract::>() { + } else if let Ok(s) = ob.extract::>() { Ok(Self( s.try_into().map_err(exceptions::PyValueError::new_err)?, )) @@ -424,7 +427,7 @@ pub struct PySequence {} impl PySequence { #[new] #[pyo3(signature = (processors_py), text_signature = "(self, processors)")] - fn new(processors_py: &PyList) -> (Self, PyPostProcessor) { + fn new(processors_py: &Bound<'_, PyList>) -> (Self, PyPostProcessor) { let mut processors: Vec = Vec::with_capacity(processors_py.len()); for n in processors_py.iter() { let processor: PyRef = n.extract().unwrap(); @@ -438,14 +441,14 @@ impl PySequence { ) } - fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple { - PyTuple::new(py, [PyList::empty(py)]) + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { + PyTuple::new_bound(py, [PyList::empty_bound(py)]) } } /// Processors Module #[pymodule] -pub fn processors(_py: Python, m: &PyModule) -> PyResult<()> { +pub fn processors(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -474,7 +477,7 @@ mod test { let py_bert = py_proc.get_as_subtype(py).unwrap(); assert_eq!( "BertProcessing", - py_bert.as_ref(py).get_type().name().unwrap() + py_bert.bind(py).get_type().qualname().unwrap() ); }) } diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 4e792ef5..c32619b5 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -98,8 +98,8 @@ impl PyAddedToken { token } - pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> { - let dict = PyDict::new(py); + pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult> { + let dict = PyDict::new_bound(py); let token = 
self.get_token(); dict.set_item("content", token.content)?; @@ -130,7 +130,7 @@ impl From for PyAddedToken { impl PyAddedToken { #[new] #[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False)")] - fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult { + fn __new__(content: Option<&str>, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult { let mut token = PyAddedToken::from(content.unwrap_or(""), None); if let Some(kwargs) = kwargs { @@ -150,7 +150,7 @@ impl PyAddedToken { Ok(token) } - fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> { + fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult> { self.as_pydict(py) } @@ -329,7 +329,7 @@ impl FromPyObject<'_> for PyArrayUnicode { ); let py = ob.py(); let obj = PyObject::from_owned_ptr(py, unicode); - let s = obj.downcast::(py)?; + let s = obj.downcast_bound::(py)?; Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned()) }) .collect::>>()?; @@ -353,7 +353,7 @@ impl FromPyObject<'_> for PyArrayStr { .as_array() .iter() .map(|obj| { - let s = obj.downcast::(ob.py())?; + let s = obj.downcast_bound::(ob.py())?; Ok(s.to_string_lossy().into_owned()) }) .collect::>>()?; @@ -377,12 +377,12 @@ impl<'s> FromPyObject<'s> for PreTokenizedInputSequence<'s> { return Ok(Self(seq.into())); } if let Ok(s) = ob.downcast::() { - if let Ok(seq) = s.extract::>() { + if let Ok(seq) = s.extract::>() { return Ok(Self(seq.into())); } } if let Ok(s) = ob.downcast::() { - if let Ok(seq) = s.extract::>() { + if let Ok(seq) = s.extract::>() { return Ok(Self(seq.into())); } } @@ -492,7 +492,7 @@ impl PyTokenizer { e )) })?; - Ok(PyBytes::new(py, data.as_bytes()).to_object(py)) + Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) } fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { @@ -510,9 +510,9 @@ impl PyTokenizer { } } - fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple { + fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { let model = PyModel::from(BPE::default()).into_py(py); - PyTuple::new(py, vec![model]) + PyTuple::new_bound(py, vec![model]) } /// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string. 
@@ -557,7 +557,7 @@ impl PyTokenizer { /// :class:`~tokenizers.Tokenizer`: The new tokenizer #[staticmethod] #[pyo3(text_signature = "(buffer)")] - fn from_buffer(buffer: &PyBytes) -> PyResult { + fn from_buffer(buffer: &Bound<'_, PyBytes>) -> PyResult { let tokenizer = serde_json::from_slice(buffer.as_bytes()).map_err(|e| { exceptions::PyValueError::new_err(format!( "Cannot instantiate Tokenizer from buffer: {}", @@ -591,18 +591,18 @@ impl PyTokenizer { auth_token: Option, ) -> PyResult { let path = Python::with_gil(|py| -> PyResult { - let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?; + let huggingface_hub = PyModule::import_bound(py, intern!(py, "huggingface_hub"))?; let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?; let kwargs = [ (intern!(py, "repo_id"), identifier), (intern!(py, "filename"), "tokenizer.json"), (intern!(py, "revision"), &revision), ] - .into_py_dict(py); + .into_py_dict_bound(py); if let Some(auth_token) = auth_token { kwargs.set_item(intern!(py, "token"), auth_token)?; } - let path: String = hf_hub_download.call((), Some(kwargs))?.extract()?; + let path: String = hf_hub_download.call((), Some(&kwargs))?.extract()?; Ok(path) })?; @@ -712,7 +712,11 @@ impl PyTokenizer { #[pyo3( text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')" )] - fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> { + fn enable_truncation( + &mut self, + max_length: usize, + kwargs: Option<&Bound<'_, PyDict>>, + ) -> PyResult<()> { let mut params = TruncationParams { max_length, ..Default::default() @@ -777,9 +781,9 @@ impl PyTokenizer { /// (:obj:`dict`, `optional`): /// A dict with the current truncation parameters if truncation is enabled #[getter] - fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult> { + fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult>> { self.tokenizer.get_truncation().map_or(Ok(None), |params| { - let dict = PyDict::new(py); + let dict = PyDict::new_bound(py); dict.set_item("max_length", params.max_length)?; dict.set_item("stride", params.stride)?; @@ -817,7 +821,7 @@ impl PyTokenizer { #[pyo3( text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)" )] - fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> { + fn enable_padding(&mut self, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<()> { let mut params = PaddingParams::default(); if let Some(kwargs) = kwargs { @@ -887,9 +891,9 @@ impl PyTokenizer { /// (:obj:`dict`, `optional`): /// A dict with the current padding parameters if padding is enabled #[getter] - fn get_padding<'py>(&self, py: Python<'py>) -> PyResult> { + fn get_padding<'py>(&self, py: Python<'py>) -> PyResult>> { self.tokenizer.get_padding().map_or(Ok(None), |params| { - let dict = PyDict::new(py); + let dict = PyDict::new_bound(py); dict.set_item( "length", @@ -948,8 +952,8 @@ impl PyTokenizer { )] fn encode( &self, - sequence: &PyAny, - pair: Option<&PyAny>, + sequence: &Bound<'_, PyAny>, + pair: Option<&Bound<'_, PyAny>>, is_pretokenized: bool, add_special_tokens: bool, ) -> PyResult { @@ -1141,7 +1145,7 @@ impl PyTokenizer { /// Returns: /// :obj:`int`: The number of tokens that were created in the vocabulary #[pyo3(text_signature = "(self, tokens)")] - fn add_tokens(&mut self, tokens: &PyList) -> PyResult { + fn add_tokens(&mut self, tokens: &Bound<'_, PyList>) -> PyResult { let tokens = tokens .into_iter() 
.map(|token| { @@ -1178,7 +1182,7 @@ impl PyTokenizer { /// Returns: /// :obj:`int`: The number of tokens that were created in the vocabulary #[pyo3(text_signature = "(self, tokens)")] - fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult { + fn add_special_tokens(&mut self, tokens: &Bound<'_, PyList>) -> PyResult { let tokens = tokens .into_iter() .map(|token| { @@ -1251,7 +1255,7 @@ impl PyTokenizer { fn train_from_iterator( &mut self, py: Python, - iterator: &PyAny, + iterator: &Bound<'_, PyAny>, trainer: Option<&mut PyTrainer>, length: Option, ) -> PyResult<()> { diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs index 707dc723..716e4cfe 100644 --- a/bindings/python/src/trainers.rs +++ b/bindings/python/src/trainers.rs @@ -2,7 +2,6 @@ use std::sync::{Arc, RwLock}; use crate::models::PyModel; use crate::tokenizer::PyAddedToken; -use crate::utils::PyChar; use pyo3::exceptions; use pyo3::prelude::*; use pyo3::types::*; @@ -52,7 +51,7 @@ impl PyTrainer { e )) })?; - Ok(PyBytes::new(py, data.as_bytes()).to_object(py)) + Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) } fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { @@ -215,7 +214,7 @@ impl PyBpeTrainer { } #[setter] - fn set_special_tokens(self_: PyRef, special_tokens: &PyList) -> PyResult<()> { + fn set_special_tokens(self_: PyRef, special_tokens: &Bound<'_, PyList>) -> PyResult<()> { setter!( self_, BpeTrainer, @@ -269,12 +268,12 @@ impl PyBpeTrainer { } #[setter] - fn set_initial_alphabet(self_: PyRef, alphabet: Vec) { + fn set_initial_alphabet(self_: PyRef, alphabet: Vec) { setter!( self_, BpeTrainer, initial_alphabet, - alphabet.into_iter().map(|c| c.0).collect() + alphabet.into_iter().collect() ); } @@ -300,7 +299,7 @@ impl PyBpeTrainer { #[new] #[pyo3(signature = (**kwargs), text_signature = None)] - pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { + pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::bpe::BpeTrainer::builder(); if let Some(kwargs) = kwargs { for (key, val) in kwargs { @@ -429,7 +428,7 @@ impl PyWordPieceTrainer { } #[setter] - fn set_special_tokens(self_: PyRef, special_tokens: &PyList) -> PyResult<()> { + fn set_special_tokens(self_: PyRef, special_tokens: &Bound<'_, PyList>) -> PyResult<()> { setter!( self_, WordPieceTrainer, @@ -473,12 +472,12 @@ impl PyWordPieceTrainer { } #[setter] - fn set_initial_alphabet(self_: PyRef, alphabet: Vec) { + fn set_initial_alphabet(self_: PyRef, alphabet: Vec) { setter!( self_, WordPieceTrainer, @set_initial_alphabet, - alphabet.into_iter().map(|c| c.0).collect() + alphabet.into_iter().collect() ); } @@ -507,7 +506,7 @@ impl PyWordPieceTrainer { signature = (** kwargs), text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)" )] - pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { + pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::wordpiece::WordPieceTrainer::builder(); if let Some(kwargs) = kwargs { for (key, val) in kwargs { @@ -621,7 +620,7 @@ impl PyWordLevelTrainer { } #[setter] - fn set_special_tokens(self_: PyRef, special_tokens: &PyList) -> PyResult<()> { + fn set_special_tokens(self_: PyRef, special_tokens: &Bound<'_, PyList>) -> PyResult<()> { setter!( self_, WordLevelTrainer, @@ -647,7 +646,7 @@ impl 
PyWordLevelTrainer { #[new] #[pyo3(signature = (**kwargs), text_signature = None)] - pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { + pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::wordlevel::WordLevelTrainer::builder(); if let Some(kwargs) = kwargs { @@ -767,7 +766,7 @@ impl PyUnigramTrainer { } #[setter] - fn set_special_tokens(self_: PyRef, special_tokens: &PyList) -> PyResult<()> { + fn set_special_tokens(self_: PyRef, special_tokens: &Bound<'_, PyList>) -> PyResult<()> { setter!( self_, UnigramTrainer, @@ -801,12 +800,12 @@ impl PyUnigramTrainer { } #[setter] - fn set_initial_alphabet(self_: PyRef, alphabet: Vec) { + fn set_initial_alphabet(self_: PyRef, alphabet: Vec) { setter!( self_, UnigramTrainer, initial_alphabet, - alphabet.into_iter().map(|c| c.0).collect() + alphabet.into_iter().collect() ); } @@ -815,7 +814,7 @@ impl PyUnigramTrainer { signature = (**kwargs), text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)" )] - pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { + pub fn new(kwargs: Option>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::unigram::UnigramTrainer::builder(); if let Some(kwargs) = kwargs { for (key, val) in kwargs { @@ -874,7 +873,7 @@ impl PyUnigramTrainer { /// Trainers Module #[pymodule] -pub fn trainers(_py: Python, m: &PyModule) -> PyResult<()> { +pub fn trainers(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; @@ -893,7 +892,7 @@ mod tests { Python::with_gil(|py| { let py_trainer = PyTrainer::new(Arc::new(RwLock::new(BpeTrainer::default().into()))); let py_bpe = py_trainer.get_as_subtype(py).unwrap(); - assert_eq!("BpeTrainer", py_bpe.as_ref(py).get_type().name().unwrap()); + assert_eq!("BpeTrainer", py_bpe.bind(py).get_type().qualname().unwrap()); }) } } diff --git a/bindings/python/src/utils/iterators.rs b/bindings/python/src/utils/iterators.rs index d19a54eb..ebb9ac36 100644 --- a/bindings/python/src/utils/iterators.rs +++ b/bindings/python/src/utils/iterators.rs @@ -50,7 +50,7 @@ pub struct PyBufferedIterator { impl PyBufferedIterator where - F: Fn(&PyAny) -> I, + F: Fn(Bound<'_, PyAny>) -> I, I: IntoIterator>, { /// Create a new PyBufferedIterator using the provided Python object. @@ -62,10 +62,10 @@ where /// /// The `buffer_size` represents the number of items that we buffer before we /// need to acquire the GIL again. - pub fn new(iter: &PyAny, converter: F, buffer_size: usize) -> PyResult { + pub fn new(iter: &Bound<'_, PyAny>, converter: F, buffer_size: usize) -> PyResult { let py = iter.py(); let iter: Py = unsafe { - py.from_borrowed_ptr_or_err::(pyo3::ffi::PyObject_GetIter(iter.as_ptr()))? + Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))? 
.to_object(py) }; @@ -89,9 +89,10 @@ where } match unsafe { - py.from_owned_ptr_or_opt::(pyo3::ffi::PyIter_Next( - self.iter.as_ref().unwrap().as_ref(py).as_ptr(), - )) + Bound::from_owned_ptr_or_opt( + py, + pyo3::ffi::PyIter_Next(self.iter.as_ref().unwrap().bind(py).as_ptr()), + ) } { Some(obj) => self.buffer.extend((self.converter)(obj)), None => { @@ -112,7 +113,7 @@ where impl Iterator for PyBufferedIterator where - F: Fn(&PyAny) -> I, + F: Fn(Bound<'_, PyAny>) -> I, I: IntoIterator>, { type Item = PyResult; diff --git a/bindings/python/src/utils/mod.rs b/bindings/python/src/utils/mod.rs index e7f95f03..1e409a50 100644 --- a/bindings/python/src/utils/mod.rs +++ b/bindings/python/src/utils/mod.rs @@ -1,6 +1,3 @@ -use pyo3::exceptions; -use pyo3::prelude::*; -use pyo3::types::*; use std::marker::PhantomData; use std::sync::{Arc, Mutex}; @@ -14,25 +11,6 @@ pub use normalization::*; pub use pretokenization::*; pub use regex::*; -// PyChar -// This type is a temporary hack to accept `char` as argument -// To be removed once https://github.com/PyO3/pyo3/pull/1282 has been released -pub struct PyChar(pub char); - -impl FromPyObject<'_> for PyChar { - fn extract(obj: &PyAny) -> PyResult { - let s = >::try_from(obj)?.to_str()?; - let mut iter = s.chars(); - if let (Some(ch), None) = (iter.next(), iter.next()) { - Ok(Self(ch)) - } else { - Err(exceptions::PyValueError::new_err( - "expected a string of length 1", - )) - } - } -} - // RefMut utils pub trait DestroyPtr { diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs index 11a06856..4cb3c7ce 100644 --- a/bindings/python/src/utils/normalization.rs +++ b/bindings/python/src/utils/normalization.rs @@ -9,15 +9,15 @@ use tk::pattern::Pattern; /// Represents a Pattern as used by `NormalizedString` #[derive(Clone, FromPyObject)] -pub enum PyPattern<'p> { +pub enum PyPattern { #[pyo3(annotation = "str")] - Str(&'p str), + Str(String), #[pyo3(annotation = "tokenizers.Regex")] Regex(Py), // TODO: Add the compatibility for Fn(char) -> bool } -impl Pattern for PyPattern<'_> { +impl Pattern for PyPattern { fn find_matches(&self, inside: &str) -> tk::Result> { match self { PyPattern::Str(s) => { @@ -35,8 +35,8 @@ impl Pattern for PyPattern<'_> { } } -impl From> for tk::normalizers::replace::ReplacePattern { - fn from(pattern: PyPattern<'_>) -> Self { +impl From for tk::normalizers::replace::ReplacePattern { + fn from(pattern: PyPattern) -> Self { match pattern { PyPattern::Str(s) => Self::String(s.to_owned()), PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())), @@ -44,8 +44,8 @@ impl From> for tk::normalizers::replace::ReplacePattern { } } -impl From> for tk::pre_tokenizers::split::SplitPattern { - fn from(pattern: PyPattern<'_>) -> Self { +impl From for tk::pre_tokenizers::split::SplitPattern { + fn from(pattern: PyPattern) -> Self { match pattern { PyPattern::Str(s) => Self::String(s.to_owned()), PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())), @@ -117,7 +117,7 @@ impl From for SplitDelimiterBehavior { } } -fn filter(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> { +fn filter(normalized: &mut NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> { let err = "`filter` expect a callable with the signature: `fn(char) -> bool`"; if !func.is_callable() { @@ -134,7 +134,7 @@ fn filter(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> { } } -fn for_each(normalized: &NormalizedString, func: &PyAny) -> 
PyResult<()> { +fn for_each(normalized: &NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> { let err = "`for_each` expect a callable with the signature: `fn(char)`"; if !func.is_callable() { @@ -148,14 +148,14 @@ fn for_each(normalized: &NormalizedString, func: &PyAny) -> PyResult<()> { } } -fn map(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> { +fn map(normalized: &mut NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> { let err = "`map` expect a callable with the signature: `fn(char) -> char`"; if !func.is_callable() { Err(exceptions::PyTypeError::new_err(err)) } else { normalized.map(|c| { - let c: &str = func + let c: String = func .call1((c.to_string(),)) .expect(err) .extract() @@ -296,13 +296,13 @@ impl PyNormalizedString { /// Filter each character of the string using the given func #[pyo3(text_signature = "(self, func)")] - fn filter(&mut self, func: &PyAny) -> PyResult<()> { + fn filter(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { filter(&mut self.normalized, func) } /// Calls the given function for each character of the string #[pyo3(text_signature = "(self, func)")] - fn for_each(&self, func: &PyAny) -> PyResult<()> { + fn for_each(&self, func: &Bound<'_, PyAny>) -> PyResult<()> { for_each(&self.normalized, func) } @@ -311,7 +311,7 @@ impl PyNormalizedString { /// Replaces each character of the string using the returned value. Each /// returned value **must** be a str of length 1 (ie a character). #[pyo3(text_signature = "(self, func)")] - fn map(&mut self, func: &PyAny) -> PyResult<()> { + fn map(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { map(&mut self.normalized, func) } @@ -551,21 +551,21 @@ impl PyNormalizedStringRefMut { .ok_or_else(PyNormalizedStringRefMut::destroyed_error)? } - fn filter(&mut self, func: &PyAny) -> PyResult<()> { + fn filter(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { self.inner .map_mut(|n| filter(n, func)) .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??; Ok(()) } - fn for_each(&self, func: &PyAny) -> PyResult<()> { + fn for_each(&self, func: &Bound<'_, PyAny>) -> PyResult<()> { self.inner .map(|n| for_each(n, func)) .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??; Ok(()) } - fn map(&mut self, func: &PyAny) -> PyResult<()> { + fn map(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { self.inner .map_mut(|n| map(n, func)) .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??; diff --git a/bindings/python/src/utils/pretokenization.rs b/bindings/python/src/utils/pretokenization.rs index a93560ab..88fdd19f 100644 --- a/bindings/python/src/utils/pretokenization.rs +++ b/bindings/python/src/utils/pretokenization.rs @@ -12,7 +12,7 @@ use crate::error::ToPyResult; use crate::token::PyToken; use tk::{OffsetReferential, OffsetType, Offsets, PreTokenizedString, Token}; -fn split(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> { +fn split(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> { if !func.is_callable() { Err(exceptions::PyTypeError::new_err( "`split` expect a callable with the signature: \ @@ -30,7 +30,7 @@ fn split(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> { } } -fn normalize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> { +fn normalize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> { if !func.is_callable() { Err(exceptions::PyTypeError::new_err( "`normalize` expect a callable with the signature: \ @@ -46,7 +46,7 @@ fn normalize(pretok: &mut PreTokenizedString, 
func: &PyAny) -> PyResult<()> { } } -fn tokenize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> { +fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> { if !func.is_callable() { Err(exceptions::PyTypeError::new_err( "`tokenize` expect a callable with the signature: \ @@ -183,7 +183,7 @@ impl PyPreTokenizedString { /// In order for the offsets to be tracked accurately, any returned `NormalizedString` /// should come from calling either `.split` or `.slice` on the received one. #[pyo3(text_signature = "(self, func)")] - fn split(&mut self, func: &PyAny) -> PyResult<()> { + fn split(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { split(&mut self.pretok, func) } @@ -195,7 +195,7 @@ impl PyPreTokenizedString { /// does not need to return anything, just calling the methods on the provided /// NormalizedString allow its modification. #[pyo3(text_signature = "(self, func)")] - fn normalize(&mut self, func: &PyAny) -> PyResult<()> { + fn normalize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { normalize(&mut self.pretok, func) } @@ -206,7 +206,7 @@ impl PyPreTokenizedString { /// The function used to tokenize each underlying split. This function must return /// a list of Token generated from the input str. #[pyo3(text_signature = "(self, func)")] - fn tokenize(&mut self, func: &PyAny) -> PyResult<()> { + fn tokenize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { tokenize(&mut self.pretok, func) } @@ -289,19 +289,19 @@ impl PyPreTokenizedStringRefMut { #[pymethods] impl PyPreTokenizedStringRefMut { - fn split(&mut self, func: &PyAny) -> PyResult<()> { + fn split(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { self.inner .map_mut(|pretok| split(pretok, func)) .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)? } - fn normalize(&mut self, func: &PyAny) -> PyResult<()> { + fn normalize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { self.inner .map_mut(|pretok| normalize(pretok, func)) .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)? } - fn tokenize(&mut self, func: &PyAny) -> PyResult<()> { + fn tokenize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> { self.inner .map_mut(|pretok| tokenize(pretok, func)) .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)? 
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index f948fe52..477ada53 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -40,19 +40,19 @@ harness = false lazy_static = "1.4" rand = "0.8" onig = { version = "6.4", default-features = false, optional = true } -regex = "1.9" +regex = "1.10" regex-syntax = "0.8" -rayon = "1.8" +rayon = "1.10" rayon-cond = "0.3" serde = { version = "1.0", features = [ "derive" ] } serde_json = "1.0" unicode-normalization-alignments = "0.1" unicode_categories = "0.1" -unicode-segmentation = "1.10" +unicode-segmentation = "1.11" indicatif = {version = "0.17", optional = true} itertools = "0.12" log = "0.4" -derive_builder = "0.13" +derive_builder = "0.20" spm_precompiled = "0.1" hf-hub = { version = "0.3.2", optional = true } aho-corasick = "1.1" @@ -62,7 +62,7 @@ thiserror = "1.0.49" fancy-regex = { version = "0.13", optional = true} getrandom = { version = "0.2.10" } esaxx-rs = { version = "0.1.10", default-features = false, features=[]} -monostate = "0.1.9" +monostate = "0.1.12" [features] default = ["progressbar", "onig", "esaxx_fast"] @@ -73,7 +73,7 @@ unstable_wasm = ["fancy-regex", "getrandom/js"] [dev-dependencies] criterion = "0.5" -tempfile = "3.8" +tempfile = "3.10" assert_approx_eq = "1.1" [profile.release]
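Every Rust change above is one instance of the same migration: moving the bindings from PyO3 0.20's GIL-ref API to the 0.21 Bound API (matching the `pyo3 = "0.21"` / `numpy = "0.21"` bumps in Cargo.toml). Borrowed GIL-ref arguments (`&PyDict`, `&PyList`, `&PyAny`, `&PyBytes`) become `&Bound<'_, T>`; constructors gain `_bound` variants that return owned smart pointers (`PyBytes::new_bound`, `PyDict::new_bound`, `PyTuple::new_bound`, `PyList::empty_bound`); `Py<T>::as_ref(py)` becomes `bind(py)`; `call_method`, `py.import`, and `PyErr::warn` move to their `_bound` counterparts; and `#[pymodule]` functions take `&Bound<'_, PyModule>` without an explicit `Python` token. The bump also retires the local `PyChar` shim in favor of PyO3's native `char` extraction, and lets `PyVocab`, `PyMerges`, and `PyPattern` own `String`s instead of borrowing `&str`, dropping their lifetime parameters. Below is a minimal, self-contained sketch of the pattern (hypothetical names, assuming pyo3 0.21; not code from this PR):

```rust
use pyo3::prelude::*;
use pyo3::types::{PyBytes, PyDict, PyTuple};

// Old GIL-ref style returned `&'py PyDict`; the Bound API returns an owned
// `Bound<'py, PyDict>` created with the `*_bound` constructor.
fn as_pydict<'py>(py: Python<'py>, content: &str) -> PyResult<Bound<'py, PyDict>> {
    let dict = PyDict::new_bound(py);
    dict.set_item("content", content)?;
    Ok(dict)
}

// Same shape as the `__getstate__` / `__getnewargs__` methods in the diff:
// build a bound object, then convert it to an owned `PyObject` where needed.
fn getstate(py: Python<'_>, data: &str) -> PyObject {
    PyBytes::new_bound(py, data.as_bytes()).to_object(py)
}

fn getnewargs<'py>(py: Python<'py>) -> Bound<'py, PyTuple> {
    PyTuple::new_bound(py, ["▁"])
}

// `#[pymodule]` entry points now take the module as `&Bound<'_, PyModule>`
// and no longer need a separate `Python` argument.
#[pymodule]
fn example_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add("MIGRATED", true)?;
    Ok(())
}
```

The Bound API makes ownership and reference counting explicit at the type level rather than tying borrows to the GIL lifetime, which is why return types such as `__getnewargs__` change from `&'p PyTuple` to `Bound<'p, PyTuple>` throughout the diff.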