From 3a6504d2740ef3892350ef074beffe4a1ac87a64 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 31 Dec 2024 18:36:01 +0100
Subject: [PATCH] Upgrade to PyO3 0.23 (#1708)

* Upgrade to PyO3 0.23

* Macos-12 deprecated?

* Clippy.

* Clippy auto elision.
---
 .github/workflows/CI.yml                |  2 +-
 bindings/python/Cargo.toml              |  8 +-
 bindings/python/pyproject.toml          |  1 +
 bindings/python/src/decoders.rs         | 73 ++++++++++++------
 bindings/python/src/encoding.rs         |  2 +-
 bindings/python/src/error.rs            |  5 +-
 bindings/python/src/models.rs           | 22 ++++--
 bindings/python/src/normalizers.rs      | 97 ++++++++++++++++--------
 bindings/python/src/pre_tokenizers.rs   | 99 ++++++++++++++++---------
 bindings/python/src/processors.rs       | 43 +++++++----
 bindings/python/src/tokenizer.rs        | 24 +++---
 bindings/python/src/trainers.rs         | 28 ++++---
 bindings/python/src/utils/iterators.rs  |  3 +-
 bindings/python/src/utils/serde_pyo3.rs | 16 ++--
 tokenizers/Cargo.toml                   |  6 +-
 tokenizers/src/models/mod.rs            |  2 +-
 tokenizers/src/tokenizer/encoding.rs    |  2 +-
 tokenizers/src/tokenizer/mod.rs         |  2 +-
 tokenizers/src/utils/fancy.rs           |  2 +-
 19 files changed, 283 insertions(+), 154 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 84da2199..2cc96438 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -117,7 +117,7 @@ jobs:
     strategy:
       matrix:
         platform:
-          - runner: macos-12
+          - runner: macos-13
             target: x86_64
           - runner: macos-14
             target: aarch64
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index 475b1fa2..e987716b 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -14,9 +14,9 @@ serde = { version = "1.0", features = ["rc", "derive"] }
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
-pyo3 = { version = "0.22", features = ["abi3", "abi3-py39"] }
-numpy = "0.22"
-ndarray = "0.15"
+pyo3 = { version = "0.23", features = ["abi3", "abi3-py39"] }
+numpy = "0.23"
+ndarray = "0.16"
 itertools = "0.12"

 [dependencies.tokenizers]
@@ -24,7 +24,7 @@ path = "../../tokenizers"

 [dev-dependencies]
 tempfile = "3.10"
-pyo3 = { version = "0.22", features = ["auto-initialize"] }
+pyo3 = { version = "0.23", features = ["auto-initialize"] }

 [features]
 defaut = ["pyo3/extension-module"]
diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml
index 681619a2..234765f6 100644
--- a/bindings/python/pyproject.toml
+++ b/bindings/python/pyproject.toml
@@ -25,6 +25,7 @@ dynamic = [
   'description',
   'license',
   'readme',
+  'version',
 ]
 dependencies = ["huggingface_hub>=0.16.4,<1.0"]
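The bulk of this patch is one mechanical migration: PyO3 0.23 replaces the infallible `IntoPy::into_py` with the fallible `IntoPyObject` trait, whose output is a `Bound<'py, T>` smart pointer rather than a `PyObject`. Every `get_as_subtype` below therefore grows the same four-step conversion chain. A minimal sketch of the pattern outside this codebase, using a hypothetical `MyDecoder` pyclass:

    use pyo3::prelude::*;

    // Hypothetical stand-in for the wrapper pyclasses in this patch.
    #[pyclass]
    struct MyDecoder {}

    fn as_pyobject(py: Python<'_>) -> PyResult<PyObject> {
        // 0.22: Py::new(py, MyDecoder {})?.into_py(py)
        // 0.23: conversion is fallible and yields a Bound, so the chain is
        // Py<MyDecoder> -> Bound<'_, MyDecoder> -> Bound<'_, PyAny> -> PyObject.
        Ok(Py::new(py, MyDecoder {})?
            .into_pyobject(py)?
            .into_any()
            .into())
    }
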
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 88e0a539..44f33326 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -43,22 +43,48 @@ impl PyDecoder {
     pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
         let base = self.clone();
         Ok(match &self.decoder {
-            PyDecoderWrapper::Custom(_) => Py::new(py, base)?.into_py(py),
+            PyDecoderWrapper::Custom(_) => Py::new(py, base)?.into_pyobject(py)?.into_any().into(),
             PyDecoderWrapper::Wrapped(inner) => match &*inner.as_ref().read().unwrap() {
-                DecoderWrapper::Metaspace(_) => Py::new(py, (PyMetaspaceDec {}, base))?.into_py(py),
-                DecoderWrapper::WordPiece(_) => Py::new(py, (PyWordPieceDec {}, base))?.into_py(py),
-                DecoderWrapper::ByteFallback(_) => {
-                    Py::new(py, (PyByteFallbackDec {}, base))?.into_py(py)
-                }
-                DecoderWrapper::Strip(_) => Py::new(py, (PyStrip {}, base))?.into_py(py),
-                DecoderWrapper::Fuse(_) => Py::new(py, (PyFuseDec {}, base))?.into_py(py),
-                DecoderWrapper::ByteLevel(_) => Py::new(py, (PyByteLevelDec {}, base))?.into_py(py),
-                DecoderWrapper::Replace(_) => Py::new(py, (PyReplaceDec {}, base))?.into_py(py),
-                DecoderWrapper::BPE(_) => Py::new(py, (PyBPEDecoder {}, base))?.into_py(py),
-                DecoderWrapper::CTC(_) => Py::new(py, (PyCTCDecoder {}, base))?.into_py(py),
-                DecoderWrapper::Sequence(_) => {
-                    Py::new(py, (PySequenceDecoder {}, base))?.into_py(py)
-                }
+                DecoderWrapper::Metaspace(_) => Py::new(py, (PyMetaspaceDec {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::WordPiece(_) => Py::new(py, (PyWordPieceDec {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::ByteFallback(_) => Py::new(py, (PyByteFallbackDec {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::Strip(_) => Py::new(py, (PyStrip {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::Fuse(_) => Py::new(py, (PyFuseDec {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::ByteLevel(_) => Py::new(py, (PyByteLevelDec {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::Replace(_) => Py::new(py, (PyReplaceDec {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::BPE(_) => Py::new(py, (PyBPEDecoder {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::CTC(_) => Py::new(py, (PyCTCDecoder {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
+                DecoderWrapper::Sequence(_) => Py::new(py, (PySequenceDecoder {}, base))?
+                    .into_pyobject(py)?
+                    .into_any()
+                    .into(),
             },
         })
     }
@@ -85,7 +111,7 @@ impl PyDecoder {
                 e
             ))
         })?;
-        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new(py, data.as_bytes()).into())
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -484,8 +510,8 @@ impl PySequenceDecoder {
         Ok((PySequenceDecoder {}, Sequence::new(decoders).into()))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        PyTuple::new_bound(py, [PyList::empty_bound(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

@@ -504,7 +530,7 @@ impl Decoder for CustomDecoder {
         Python::with_gil(|py| {
             let decoded = self
                 .inner
-                .call_method_bound(py, "decode", (tokens,), None)?
+                .call_method(py, "decode", (tokens,), None)?
                 .extract(py)?;
             Ok(decoded)
         })
@@ -514,7 +540,7 @@
         Python::with_gil(|py| {
             let decoded = self
                 .inner
-                .call_method_bound(py, "decode_chain", (tokens,), None)?
+                .call_method(py, "decode_chain", (tokens,), None)?
                 .extract(py)?;
             Ok(decoded)
         })
@@ -693,7 +719,12 @@ mod test {

         let obj = Python::with_gil(|py| {
             let py_msp = PyDecoder::new(Metaspace::default().into());
-            let obj: PyObject = Py::new(py, py_msp).unwrap().into_py(py);
+            let obj: PyObject = Py::new(py, py_msp)
+                .unwrap()
+                .into_pyobject(py)
+                .unwrap()
+                .into_any()
+                .into();
             obj
         });
         let py_seq = PyDecoderWrapper::Custom(Arc::new(RwLock::new(CustomDecoder::new(obj))));
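A second recurring change is visible above: container constructors such as `PyTuple::new` and `PyList::new` are fallible in 0.23, so the `__getnewargs__` pickle helpers now return `PyResult<Bound<'p, PyTuple>>` instead of a bare tuple. A standalone sketch of the new shape:

    use pyo3::prelude::*;
    use pyo3::types::{PyList, PyTuple};

    // Building a tuple now returns PyResult; callers propagate with `?`
    // exactly as the patched __getnewargs__ methods do.
    fn newargs<'py>(py: Python<'py>) -> PyResult<Bound<'py, PyTuple>> {
        PyTuple::new(py, [PyList::empty(py)])
    }
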
diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs
index dcad1b03..e157b800 100644
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@@ -37,7 +37,7 @@ impl PyEncoding {
                 e
             ))
         })?;
-        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new(py, data.as_bytes()).into())
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
diff --git a/bindings/python/src/error.rs b/bindings/python/src/error.rs
index 888c0d44..e6db1a6c 100644
--- a/bindings/python/src/error.rs
+++ b/bindings/python/src/error.rs
@@ -1,6 +1,7 @@
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::type_object::PyTypeInfo;
+use std::ffi::CString;
 use std::fmt::{Display, Formatter, Result as FmtResult};
 use tokenizers::tokenizer::Result;

@@ -35,7 +36,7 @@ impl ToPyResult {
 }

 pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> {
-    let deprecation_warning = py.import_bound("builtins")?.getattr("DeprecationWarning")?;
+    let deprecation_warning = py.import("builtins")?.getattr("DeprecationWarning")?;
     let full_message = format!("Deprecated in {}: {}", version, message);
-    pyo3::PyErr::warn_bound(py, &deprecation_warning, &full_message, 0)
+    pyo3::PyErr::warn(py, &deprecation_warning, &CString::new(full_message)?, 0)
 }
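`PyErr::warn` (previously `warn_bound`) now takes its message as `&CStr`, which is what the new `std::ffi::CString` import is for. A minimal sketch of raising a `DeprecationWarning` under 0.23; the message text here is illustrative:

    use std::ffi::CString;
    use pyo3::prelude::*;

    fn warn_deprecated(py: Python<'_>) -> PyResult<()> {
        let category = py.import("builtins")?.getattr("DeprecationWarning")?;
        // CString::new fails on interior NUL bytes; `?` converts that to a PyErr.
        let message = CString::new("Deprecated in 0.x: illustrative message")?;
        pyo3::PyErr::warn(py, &category, &message, 0)
    }
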
diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs
index 0d5c0ddc..2f4dba82 100644
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@@ -35,10 +35,22 @@ impl PyModel {
     pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
         let base = self.clone();
         Ok(match *self.model.as_ref().read().unwrap() {
-            ModelWrapper::BPE(_) => Py::new(py, (PyBPE {}, base))?.into_py(py),
-            ModelWrapper::WordPiece(_) => Py::new(py, (PyWordPiece {}, base))?.into_py(py),
-            ModelWrapper::WordLevel(_) => Py::new(py, (PyWordLevel {}, base))?.into_py(py),
-            ModelWrapper::Unigram(_) => Py::new(py, (PyUnigram {}, base))?.into_py(py),
+            ModelWrapper::BPE(_) => Py::new(py, (PyBPE {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            ModelWrapper::WordPiece(_) => Py::new(py, (PyWordPiece {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            ModelWrapper::WordLevel(_) => Py::new(py, (PyWordLevel {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            ModelWrapper::Unigram(_) => Py::new(py, (PyUnigram {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
         })
     }
 }
@@ -105,7 +117,7 @@ impl PyModel {
                 e
             ))
         })?;
-        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new(py, data.as_bytes()).into())
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
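Each `__getstate__` in these bindings serializes the wrapper with serde and hands the bytes to Python; under 0.23 only the constructor changes (`PyBytes::new_bound` loses its suffix, and the `Bound` result converts to a `PyObject` via `.into()`). A self-contained sketch, with the `State` struct assumed for illustration:

    use pyo3::exceptions::PyException;
    use pyo3::prelude::*;
    use pyo3::types::PyBytes;
    use serde::Serialize;

    #[derive(Serialize)]
    struct State {
        vocab_size: u32,
    }

    // The pickle-support half: serialize to JSON, expose the bytes to Python.
    fn getstate(py: Python<'_>, state: &State) -> PyResult<PyObject> {
        let data = serde_json::to_string(state)
            .map_err(|e| PyException::new_err(e.to_string()))?;
        Ok(PyBytes::new(py, data.as_bytes()).into())
    }
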
diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index 38041fc9..d8159637 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -54,38 +54,73 @@ impl PyNormalizer {
     pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
         let base = self.clone();
         Ok(match self.normalizer {
-            PyNormalizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?.into_py(py),
+            PyNormalizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
             PyNormalizerTypeWrapper::Single(ref inner) => match &*inner.as_ref().read().unwrap() {
-                PyNormalizerWrapper::Custom(_) => Py::new(py, base)?.into_py(py),
+                PyNormalizerWrapper::Custom(_) => {
+                    Py::new(py, base)?.into_pyobject(py)?.into_any().into()
+                }
                 PyNormalizerWrapper::Wrapped(ref inner) => match inner {
-                    NormalizerWrapper::Sequence(_) => {
-                        Py::new(py, (PySequence {}, base))?.into_py(py)
-                    }
+                    NormalizerWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
                     NormalizerWrapper::BertNormalizer(_) => {
-                        Py::new(py, (PyBertNormalizer {}, base))?.into_py(py)
+                        Py::new(py, (PyBertNormalizer {}, base))?
+                            .into_pyobject(py)?
+                            .into_any()
+                            .into()
                     }
-                    NormalizerWrapper::StripNormalizer(_) => {
-                        Py::new(py, (PyStrip {}, base))?.into_py(py)
-                    }
-                    NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?.into_py(py),
-                    NormalizerWrapper::ByteLevel(_) => {
-                        Py::new(py, (PyByteLevel {}, base))?.into_py(py)
-                    }
-                    NormalizerWrapper::StripAccents(_) => {
-                        Py::new(py, (PyStripAccents {}, base))?.into_py(py)
-                    }
-                    NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base))?.into_py(py),
-                    NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base))?.into_py(py),
-                    NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base))?.into_py(py),
-                    NormalizerWrapper::NFKD(_) => Py::new(py, (PyNFKD {}, base))?.into_py(py),
-                    NormalizerWrapper::Lowercase(_) => {
-                        Py::new(py, (PyLowercase {}, base))?.into_py(py)
-                    }
-                    NormalizerWrapper::Precompiled(_) => {
-                        Py::new(py, (PyPrecompiled {}, base))?.into_py(py)
-                    }
-                    NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base))?.into_py(py),
-                    NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base))?.into_py(py),
+                    NormalizerWrapper::StripNormalizer(_) => Py::new(py, (PyStrip {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::StripAccents(_) => Py::new(py, (PyStripAccents {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::NFKD(_) => Py::new(py, (PyNFKD {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::Lowercase(_) => Py::new(py, (PyLowercase {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::Precompiled(_) => Py::new(py, (PyPrecompiled {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
+                    NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base))?
+                        .into_pyobject(py)?
+                        .into_any()
+                        .into(),
                 },
             },
         })
@@ -114,7 +149,7 @@ impl PyNormalizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new(py, data.as_bytes()).into())
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -371,8 +406,8 @@ impl PySequence {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        PyTuple::new_bound(py, [PyList::empty_bound(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        PyTuple::new(py, [PyList::empty(py)])
     }

     fn __len__(&self) -> usize {
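Custom components written in Python are driven from Rust through `Py::call_method`; in 0.23 the `*_bound` variants (`call_method_bound`, `import_bound`, `PyDict::new_bound`, and so on) simply drop the suffix, as the decoder hunks earlier already showed. A sketch of the call-back pattern, assuming a user object that exposes a `normalize(text)` method:

    use pyo3::prelude::*;

    // `obj` wraps a user-supplied Python object expected to expose
    // a `normalize(text)` method returning a string.
    fn call_normalize(obj: &Py<PyAny>, text: &str) -> PyResult<String> {
        Python::with_gil(|py| {
            obj.call_method(py, "normalize", (text,), None)?.extract(py)
        })
    }
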
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index e58d1bee..fdc86230 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -49,45 +49,69 @@ impl PyPreTokenizer {
     pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
         let base = self.clone();
         Ok(match &self.pretok {
-            PyPreTokenizerTypeWrapper::Sequence(_) => {
-                Py::new(py, (PySequence {}, base))?.into_py(py)
-            }
+            PyPreTokenizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
             PyPreTokenizerTypeWrapper::Single(ref inner) => {
                 match &*inner.as_ref().read().unwrap() {
-                    PyPreTokenizerWrapper::Custom(_) => Py::new(py, base)?.into_py(py),
+                    PyPreTokenizerWrapper::Custom(_) => {
+                        Py::new(py, base)?.into_pyobject(py)?.into_any().into()
+                    }
                     PyPreTokenizerWrapper::Wrapped(inner) => match inner {
-                        PreTokenizerWrapper::Whitespace(_) => {
-                            Py::new(py, (PyWhitespace {}, base))?.into_py(py)
-                        }
-                        PreTokenizerWrapper::Split(_) => {
-                            Py::new(py, (PySplit {}, base))?.into_py(py)
-                        }
+                        PreTokenizerWrapper::Whitespace(_) => Py::new(py, (PyWhitespace {}, base))?
+                            .into_pyobject(py)?
+                            .into_any()
+                            .into(),
+                        PreTokenizerWrapper::Split(_) => Py::new(py, (PySplit {}, base))?
+                            .into_pyobject(py)?
+                            .into_any()
+                            .into(),
                         PreTokenizerWrapper::Punctuation(_) => {
-                            Py::new(py, (PyPunctuation {}, base))?.into_py(py)
-                        }
-                        PreTokenizerWrapper::Sequence(_) => {
-                            Py::new(py, (PySequence {}, base))?.into_py(py)
-                        }
-                        PreTokenizerWrapper::Metaspace(_) => {
-                            Py::new(py, (PyMetaspace {}, base))?.into_py(py)
+                            Py::new(py, (PyPunctuation {}, base))?
+                                .into_pyobject(py)?
+                                .into_any()
+                                .into()
                         }
+                        PreTokenizerWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+                            .into_pyobject(py)?
+                            .into_any()
+                            .into(),
+                        PreTokenizerWrapper::Metaspace(_) => Py::new(py, (PyMetaspace {}, base))?
+                            .into_pyobject(py)?
+                            .into_any()
+                            .into(),
                         PreTokenizerWrapper::Delimiter(_) => {
-                            Py::new(py, (PyCharDelimiterSplit {}, base))?.into_py(py)
+                            Py::new(py, (PyCharDelimiterSplit {}, base))?
+                                .into_pyobject(py)?
+                                .into_any()
+                                .into()
                         }
                         PreTokenizerWrapper::WhitespaceSplit(_) => {
-                            Py::new(py, (PyWhitespaceSplit {}, base))?.into_py(py)
-                        }
-                        PreTokenizerWrapper::ByteLevel(_) => {
-                            Py::new(py, (PyByteLevel {}, base))?.into_py(py)
+                            Py::new(py, (PyWhitespaceSplit {}, base))?
+                                .into_pyobject(py)?
+                                .into_any()
+                                .into()
                         }
+                        PreTokenizerWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
+                            .into_pyobject(py)?
+                            .into_any()
+                            .into(),
                         PreTokenizerWrapper::BertPreTokenizer(_) => {
-                            Py::new(py, (PyBertPreTokenizer {}, base))?.into_py(py)
-                        }
-                        PreTokenizerWrapper::Digits(_) => {
-                            Py::new(py, (PyDigits {}, base))?.into_py(py)
+                            Py::new(py, (PyBertPreTokenizer {}, base))?
+                                .into_pyobject(py)?
+                                .into_any()
+                                .into()
                         }
+                        PreTokenizerWrapper::Digits(_) => Py::new(py, (PyDigits {}, base))?
+                            .into_pyobject(py)?
+                            .into_any()
+                            .into(),
                         PreTokenizerWrapper::UnicodeScripts(_) => {
-                            Py::new(py, (PyUnicodeScripts {}, base))?.into_py(py)
+                            Py::new(py, (PyUnicodeScripts {}, base))?
+                                .into_pyobject(py)?
+                                .into_any()
+                                .into()
                         }
                     },
                 }
             }
@@ -118,7 +142,7 @@ impl PyPreTokenizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new(py, data.as_bytes()).into())
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -365,8 +389,8 @@ impl PySplit {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        PyTuple::new_bound(py, [" ", "removed"])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        PyTuple::new(py, [" ", "removed"])
     }
 }

@@ -398,8 +422,8 @@ impl PyCharDelimiterSplit {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        PyTuple::new_bound(py, [" "])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        PyTuple::new(py, [" "])
     }
 }

@@ -460,8 +484,8 @@ impl PySequence {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        PyTuple::new_bound(py, [PyList::empty_bound(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        PyTuple::new(py, [PyList::empty(py)])
     }

     fn __getitem__(self_: PyRef<'_, Self>, py: Python<'_>, index: usize) -> PyResult<Bound<'_, PyAny>> {
@@ -823,7 +847,12 @@ mod test {

         let obj = Python::with_gil(|py| {
             let py_wsp = PyPreTokenizer::new(Whitespace {}.into());
-            let obj: PyObject = Py::new(py, py_wsp).unwrap().into_py(py);
+            let obj: PyObject = Py::new(py, py_wsp)
+                .unwrap()
+                .into_pyobject(py)
+                .unwrap()
+                .into_any()
+                .into();
             obj
         });
         let py_seq: PyPreTokenizerWrapper =
diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 1e7520aa..d558c40b 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -41,15 +41,26 @@ impl PyPostProcessor {
     pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
         let base = self.clone();
         Ok(match self.processor.as_ref() {
-            PostProcessorWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?.into_py(py),
-            PostProcessorWrapper::Bert(_) => Py::new(py, (PyBertProcessing {}, base))?.into_py(py),
-            PostProcessorWrapper::Roberta(_) => {
-                Py::new(py, (PyRobertaProcessing {}, base))?.into_py(py)
-            }
-            PostProcessorWrapper::Template(_) => {
-                Py::new(py, (PyTemplateProcessing {}, base))?.into_py(py)
-            }
-            PostProcessorWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?.into_py(py),
+            PostProcessorWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            PostProcessorWrapper::Bert(_) => Py::new(py, (PyBertProcessing {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            PostProcessorWrapper::Roberta(_) => Py::new(py, (PyRobertaProcessing {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            PostProcessorWrapper::Template(_) => Py::new(py, (PyTemplateProcessing {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            PostProcessorWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
         })
     }
 }
@@ -78,7 +89,7 @@ impl PyPostProcessor {
                 e
             ))
         })?;
-        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new(py, data.as_bytes()).into())
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -176,8 +187,8 @@ impl PyBertProcessing {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        PyTuple::new_bound(py, [("", 0), ("", 0)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        PyTuple::new(py, [("", 0), ("", 0)])
     }
 }

@@ -226,8 +237,8 @@ impl PyRobertaProcessing {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        PyTuple::new_bound(py, [("", 0), ("", 0)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        PyTuple::new(py, [("", 0), ("", 0)])
     }
 }

@@ -451,8 +462,8 @@ impl PySequence {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        PyTuple::new_bound(py, [PyList::empty_bound(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }
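The tokenizer diff below also touches dict building: `PyDict::new_bound` and `into_py_dict_bound` lose their suffix, and `into_py_dict` becomes fallible. A sketch of assembling kwargs under 0.23; the keys and values are illustrative only:

    use pyo3::prelude::*;
    use pyo3::types::{IntoPyDict, PyDict};

    fn build_kwargs<'py>(py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
        // `into_py_dict` returns PyResult in 0.23, hence the trailing `?`.
        let kwargs = [("repo_id", "gpt2"), ("filename", "tokenizer.json")].into_py_dict(py)?;
        kwargs.set_item("revision", "main")?;
        Ok(kwargs)
    }
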
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 52b86d97..73a0dbbe 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -100,7 +100,7 @@ impl PyAddedToken {
     }

     pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
-        let dict = PyDict::new_bound(py);
+        let dict = PyDict::new(py);
         let token = self.get_token();

         dict.set_item("content", token.content)?;
@@ -347,6 +347,7 @@ impl From<PyArrayUnicode> for tk::InputSequence<'_> {
 }

 struct PyArrayStr(Vec<String>);
+
 impl FromPyObject<'_> for PyArrayStr {
     fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
         let array = ob.downcast::<PyArray1<PyObject>>()?;
@@ -495,7 +496,7 @@ impl PyTokenizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new(py, data.as_bytes()).into())
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -513,9 +514,12 @@ impl PyTokenizer {
         }
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
-        let model = PyModel::from(BPE::default()).into_py(py);
-        PyTuple::new_bound(py, vec![model])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
+        let model: PyObject = PyModel::from(BPE::default())
+            .into_pyobject(py)?
+            .into_any()
+            .into();
+        PyTuple::new(py, vec![model])
     }

     /// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
@@ -594,14 +598,14 @@ impl PyTokenizer {
         token: Option<String>,
     ) -> PyResult<Self> {
         let path = Python::with_gil(|py| -> PyResult<String> {
-            let huggingface_hub = PyModule::import_bound(py, intern!(py, "huggingface_hub"))?;
+            let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?;
             let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?;
             let kwargs = [
                 (intern!(py, "repo_id"), identifier),
                 (intern!(py, "filename"), "tokenizer.json"),
                 (intern!(py, "revision"), &revision),
             ]
-            .into_py_dict_bound(py);
+            .into_py_dict(py)?;
             if let Some(token) = token {
                 kwargs.set_item(intern!(py, "token"), token)?;
             }
@@ -796,7 +800,7 @@ impl PyTokenizer {
     #[getter]
     fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
         self.tokenizer.get_truncation().map_or(Ok(None), |params| {
-            let dict = PyDict::new_bound(py);
+            let dict = PyDict::new(py);

             dict.set_item("max_length", params.max_length)?;
             dict.set_item("stride", params.stride)?;
@@ -906,7 +910,7 @@ impl PyTokenizer {
     #[getter]
     fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
         self.tokenizer.get_padding().map_or(Ok(None), |params| {
-            let dict = PyDict::new_bound(py);
+            let dict = PyDict::new(py);

             dict.set_item(
                 "length",
@@ -1342,7 +1346,7 @@ impl PyTokenizer {
             if let Ok(s) = element.downcast::<PyString>() {
                 itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned())))
             } else {
-                match element.iter() {
+                match element.try_iter() {
                     Ok(iter) => itertools::Either::Left(
                         iter.map(|i| i?.extract::<String>())
                             .collect::<Vec<_>>()
diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index 45eabf0d..a3d2d556 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -29,16 +29,22 @@ impl PyTrainer {
     pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
         let base = self.clone();
         Ok(match *self.trainer.as_ref().read().unwrap() {
-            TrainerWrapper::BpeTrainer(_) => Py::new(py, (PyBpeTrainer {}, base))?.into_py(py),
-            TrainerWrapper::WordPieceTrainer(_) => {
-                Py::new(py, (PyWordPieceTrainer {}, base))?.into_py(py)
-            }
-            TrainerWrapper::WordLevelTrainer(_) => {
-                Py::new(py, (PyWordLevelTrainer {}, base))?.into_py(py)
-            }
-            TrainerWrapper::UnigramTrainer(_) => {
-                Py::new(py, (PyUnigramTrainer {}, base))?.into_py(py)
-            }
+            TrainerWrapper::BpeTrainer(_) => Py::new(py, (PyBpeTrainer {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            TrainerWrapper::WordPieceTrainer(_) => Py::new(py, (PyWordPieceTrainer {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            TrainerWrapper::WordLevelTrainer(_) => Py::new(py, (PyWordLevelTrainer {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
+            TrainerWrapper::UnigramTrainer(_) => Py::new(py, (PyUnigramTrainer {}, base))?
+                .into_pyobject(py)?
+                .into_any()
+                .into(),
         })
     }
 }
@@ -51,7 +57,7 @@ impl PyTrainer {
                 e
             ))
         })?;
-        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new(py, data.as_bytes()).into())
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
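One rename in the tokenizer hunk above is easy to miss: `Bound<PyAny>::iter` became `try_iter`, making the fallibility of requesting a Python iterator explicit. A small sketch that collects any Python iterable of strings:

    use pyo3::prelude::*;

    // Collect an arbitrary Python iterable of str into a Vec<String>.
    fn collect_strings(iterable: &Bound<'_, PyAny>) -> PyResult<Vec<String>> {
        iterable
            .try_iter()?
            .map(|item| item?.extract::<String>())
            .collect()
    }
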
diff --git a/bindings/python/src/utils/iterators.rs b/bindings/python/src/utils/iterators.rs
index ebb9ac36..d619b93d 100644
--- a/bindings/python/src/utils/iterators.rs
+++ b/bindings/python/src/utils/iterators.rs
@@ -65,8 +65,7 @@ where
     pub fn new(iter: &Bound<'_, PyAny>, converter: F, buffer_size: usize) -> PyResult<Self> {
         let py = iter.py();
         let iter: Py<PyAny> = unsafe {
-            Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?
-                .to_object(py)
+            Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?.into()
         };

         Ok(Self {
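The serde_pyo3 changes below carry no behavioral change; they appear to be what the commit message's "Clippy auto elision" refers to: the `needless_lifetimes` lint now asks for impl lifetimes that are never named to be elided. A toy before/after on an unrelated type:

    struct Wrapper<'a>(&'a str);

    // Before (flagged by clippy::needless_lifetimes):
    //     impl<'a> std::fmt::Display for Wrapper<'a> { ... }
    // After: the unnamed lifetime is elided with '_.
    impl std::fmt::Display for Wrapper<'_> {
        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
            f.write_str(self.0)
        }
    }
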
diff --git a/bindings/python/src/utils/serde_pyo3.rs b/bindings/python/src/utils/serde_pyo3.rs
index 47199361..d4f8f132 100644
--- a/bindings/python/src/utils/serde_pyo3.rs
+++ b/bindings/python/src/utils/serde_pyo3.rs
@@ -57,7 +57,7 @@ where
     Ok(serializer.output)
 }

-impl<'a> ser::Serializer for &'a mut Serializer {
+impl ser::Serializer for &mut Serializer {
     // The output type produced by this `Serializer` during successful
     // serialization. Most serializers that produce text or binary output should
     // set `Ok = ()` and serialize into an `io::Write` or buffer contained
@@ -355,7 +355,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
 //
 // This impl is SerializeSeq so these methods are called after `serialize_seq`
 // is called on the Serializer.
-impl<'a> ser::SerializeSeq for &'a mut Serializer {
+impl ser::SerializeSeq for &mut Serializer {
     // Must match the `Ok` type of the serializer.
     type Ok = ();
     // Must match the `Error` type of the serializer.
@@ -391,7 +391,7 @@ impl<'a> ser::SerializeSeq for &'a mut Serializer {
 }

 // Same thing but for tuples.
-impl<'a> ser::SerializeTuple for &'a mut Serializer {
+impl ser::SerializeTuple for &mut Serializer {
     type Ok = ();
     type Error = Error;

@@ -423,7 +423,7 @@ impl<'a> ser::SerializeTuple for &'a mut Serializer {
 }

 // Same thing but for tuple structs.
-impl<'a> ser::SerializeTupleStruct for &'a mut Serializer {
+impl ser::SerializeTupleStruct for &mut Serializer {
     type Ok = ();
     type Error = Error;

@@ -463,7 +463,7 @@ impl<'a> ser::SerializeTupleStruct for &'a mut Serializer {
 //
 // So the `end` method in this impl is responsible for closing both the `]` and
 // the `}`.
-impl<'a> ser::SerializeTupleVariant for &'a mut Serializer {
+impl ser::SerializeTupleVariant for &mut Serializer {
     type Ok = ();
     type Error = Error;

@@ -502,7 +502,7 @@ impl<'a> ser::SerializeTupleVariant for &'a mut Serializer {
 // `serialize_entry` method allows serializers to optimize for the case where
 // key and value are both available simultaneously. In JSON it doesn't make a
 // difference so the default behavior for `serialize_entry` is fine.
-impl<'a> ser::SerializeMap for &'a mut Serializer {
+impl ser::SerializeMap for &mut Serializer {
     type Ok = ();
     type Error = Error;

@@ -559,7 +559,7 @@ impl<'a> ser::SerializeMap for &'a mut Serializer {

 // Structs are like maps in which the keys are constrained to be compile-time
 // constant strings.
-impl<'a> ser::SerializeStruct for &'a mut Serializer {
+impl ser::SerializeStruct for &mut Serializer {
     type Ok = ();
     type Error = Error;

@@ -590,7 +590,7 @@ impl<'a> ser::SerializeStruct for &'a mut Serializer {

 // Similar to `SerializeTupleVariant`, here the `end` method is responsible for
 // closing both of the curly braces opened by `serialize_struct_variant`.
-impl<'a> ser::SerializeStructVariant for &'a mut Serializer {
+impl ser::SerializeStructVariant for &mut Serializer {
     type Ok = ();
     type Error = Error;

diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index cf3db78b..dacb9629 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -55,7 +55,7 @@ unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
 unicode-segmentation = "1.11"
 indicatif = {version = "0.17", optional = true}
-itertools = "0.12"
+itertools = "0.13"
 log = "0.4"
 derive_builder = "0.20"
 spm_precompiled = "0.1.3"
@@ -63,8 +63,8 @@ hf-hub = { version = "0.3.2", optional = true }
 aho-corasick = "1.1"
 paste = "1.0.14"
 macro_rules_attribute = "0.2.0"
-thiserror = "1.0.49"
-fancy-regex = { version = "0.13", optional = true}
+thiserror = "2"
+fancy-regex = { version = "0.14", optional = true}
 getrandom = { version = "0.2.10" }
 esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
 monostate = "0.1.12"
diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs
index 3ab3b495..3a3a91ad 100644
--- a/tokenizers/src/models/mod.rs
+++ b/tokenizers/src/models/mod.rs
@@ -28,7 +28,7 @@ impl<'a> OrderedVocabIter<'a> {
     }
 }

-impl<'a> Serialize for OrderedVocabIter<'a> {
+impl Serialize for OrderedVocabIter<'_> {
     fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
     where
         S: Serializer,
diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs
index 0693ad1e..1732686e 100644
--- a/tokenizers/src/tokenizer/encoding.rs
+++ b/tokenizers/src/tokenizer/encoding.rs
@@ -341,7 +341,7 @@ impl Encoding {
             .step_by(offset)
             .filter_map(|stop| {
                 let stop = stop + 1;
-                let start = if stop < max_len { 0 } else { stop - max_len };
+                let start = stop.saturating_sub(max_len);
                 if start < stop && !end {
                     end = start == 0;
                     Some((start, stop))
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index cc095c1f..893d2743 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -233,7 +233,7 @@ impl<'s> From<&'s [String]> for InputSequence<'s> {
     }
 }

-impl<'s> From<Vec<String>> for InputSequence<'s> {
+impl From<Vec<String>> for InputSequence<'_> {
     fn from(input: Vec<String>) -> Self {
         Self::PreTokenizedOwned(Cow::Owned(input))
     }
diff --git a/tokenizers/src/utils/fancy.rs b/tokenizers/src/utils/fancy.rs
index 9d44bc74..bbcf6531 100644
--- a/tokenizers/src/utils/fancy.rs
+++ b/tokenizers/src/utils/fancy.rs
@@ -22,7 +22,7 @@ impl SysRegex {

 pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>);

-impl<'r, 't> Iterator for Matches<'r, 't> {
+impl Iterator for Matches<'_, '_> {
     type Item = (usize, usize);

     fn next(&mut self) -> Option<Self::Item> {