Upgrade to PyO3 0.23 (#1708)

* Upgrade to PyO3 0.23

* macos-12 CI runner deprecated? Switch to macos-13.

* Clippy.

* Clippy auto elision.
This commit is contained in:
Nicolas Patry
2024-12-31 18:36:01 +01:00
committed by GitHub
parent 555d44c47a
commit 3a6504d274
19 changed files with 283 additions and 154 deletions

View File

@ -117,7 +117,7 @@ jobs:
strategy: strategy:
matrix: matrix:
platform: platform:
- runner: macos-12 - runner: macos-13
target: x86_64 target: x86_64
- runner: macos-14 - runner: macos-14
target: aarch64 target: aarch64

View File

@ -14,9 +14,9 @@ serde = { version = "1.0", features = ["rc", "derive"] }
serde_json = "1.0" serde_json = "1.0"
libc = "0.2" libc = "0.2"
env_logger = "0.11" env_logger = "0.11"
pyo3 = { version = "0.22", features = ["abi3", "abi3-py39"] } pyo3 = { version = "0.23", features = ["abi3", "abi3-py39"] }
numpy = "0.22" numpy = "0.23"
ndarray = "0.15" ndarray = "0.16"
itertools = "0.12" itertools = "0.12"
[dependencies.tokenizers] [dependencies.tokenizers]
@ -24,7 +24,7 @@ path = "../../tokenizers"
[dev-dependencies] [dev-dependencies]
tempfile = "3.10" tempfile = "3.10"
pyo3 = { version = "0.22", features = ["auto-initialize"] } pyo3 = { version = "0.23", features = ["auto-initialize"] }
[features] [features]
defaut = ["pyo3/extension-module"] defaut = ["pyo3/extension-module"]

View File

@ -25,6 +25,7 @@ dynamic = [
'description', 'description',
'license', 'license',
'readme', 'readme',
'version',
] ]
dependencies = ["huggingface_hub>=0.16.4,<1.0"] dependencies = ["huggingface_hub>=0.16.4,<1.0"]

View File

@ -43,22 +43,48 @@ impl PyDecoder {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> { pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone(); let base = self.clone();
Ok(match &self.decoder { Ok(match &self.decoder {
PyDecoderWrapper::Custom(_) => Py::new(py, base)?.into_py(py), PyDecoderWrapper::Custom(_) => Py::new(py, base)?.into_pyobject(py)?.into_any().into(),
PyDecoderWrapper::Wrapped(inner) => match &*inner.as_ref().read().unwrap() { PyDecoderWrapper::Wrapped(inner) => match &*inner.as_ref().read().unwrap() {
DecoderWrapper::Metaspace(_) => Py::new(py, (PyMetaspaceDec {}, base))?.into_py(py), DecoderWrapper::Metaspace(_) => Py::new(py, (PyMetaspaceDec {}, base))?
DecoderWrapper::WordPiece(_) => Py::new(py, (PyWordPieceDec {}, base))?.into_py(py), .into_pyobject(py)?
DecoderWrapper::ByteFallback(_) => { .into_any()
Py::new(py, (PyByteFallbackDec {}, base))?.into_py(py) .into(),
} DecoderWrapper::WordPiece(_) => Py::new(py, (PyWordPieceDec {}, base))?
DecoderWrapper::Strip(_) => Py::new(py, (PyStrip {}, base))?.into_py(py), .into_pyobject(py)?
DecoderWrapper::Fuse(_) => Py::new(py, (PyFuseDec {}, base))?.into_py(py), .into_any()
DecoderWrapper::ByteLevel(_) => Py::new(py, (PyByteLevelDec {}, base))?.into_py(py), .into(),
DecoderWrapper::Replace(_) => Py::new(py, (PyReplaceDec {}, base))?.into_py(py), DecoderWrapper::ByteFallback(_) => Py::new(py, (PyByteFallbackDec {}, base))?
DecoderWrapper::BPE(_) => Py::new(py, (PyBPEDecoder {}, base))?.into_py(py), .into_pyobject(py)?
DecoderWrapper::CTC(_) => Py::new(py, (PyCTCDecoder {}, base))?.into_py(py), .into_any()
DecoderWrapper::Sequence(_) => { .into(),
Py::new(py, (PySequenceDecoder {}, base))?.into_py(py) DecoderWrapper::Strip(_) => Py::new(py, (PyStrip {}, base))?
} .into_pyobject(py)?
.into_any()
.into(),
DecoderWrapper::Fuse(_) => Py::new(py, (PyFuseDec {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
DecoderWrapper::ByteLevel(_) => Py::new(py, (PyByteLevelDec {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
DecoderWrapper::Replace(_) => Py::new(py, (PyReplaceDec {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
DecoderWrapper::BPE(_) => Py::new(py, (PyBPEDecoder {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
DecoderWrapper::CTC(_) => Py::new(py, (PyCTCDecoder {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
DecoderWrapper::Sequence(_) => Py::new(py, (PySequenceDecoder {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
}, },
}) })
} }
@ -85,7 +111,7 @@ impl PyDecoder {
e e
)) ))
})?; })?;
Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) Ok(PyBytes::new(py, data.as_bytes()).into())
} }
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@ -484,8 +510,8 @@ impl PySequenceDecoder {
Ok((PySequenceDecoder {}, Sequence::new(decoders).into())) Ok((PySequenceDecoder {}, Sequence::new(decoders).into()))
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
PyTuple::new_bound(py, [PyList::empty_bound(py)]) PyTuple::new(py, [PyList::empty(py)])
} }
} }
@ -504,7 +530,7 @@ impl Decoder for CustomDecoder {
Python::with_gil(|py| { Python::with_gil(|py| {
let decoded = self let decoded = self
.inner .inner
.call_method_bound(py, "decode", (tokens,), None)? .call_method(py, "decode", (tokens,), None)?
.extract(py)?; .extract(py)?;
Ok(decoded) Ok(decoded)
}) })
@ -514,7 +540,7 @@ impl Decoder for CustomDecoder {
Python::with_gil(|py| { Python::with_gil(|py| {
let decoded = self let decoded = self
.inner .inner
.call_method_bound(py, "decode_chain", (tokens,), None)? .call_method(py, "decode_chain", (tokens,), None)?
.extract(py)?; .extract(py)?;
Ok(decoded) Ok(decoded)
}) })
@ -693,7 +719,12 @@ mod test {
let obj = Python::with_gil(|py| { let obj = Python::with_gil(|py| {
let py_msp = PyDecoder::new(Metaspace::default().into()); let py_msp = PyDecoder::new(Metaspace::default().into());
let obj: PyObject = Py::new(py, py_msp).unwrap().into_py(py); let obj: PyObject = Py::new(py, py_msp)
.unwrap()
.into_pyobject(py)
.unwrap()
.into_any()
.into();
obj obj
}); });
let py_seq = PyDecoderWrapper::Custom(Arc::new(RwLock::new(CustomDecoder::new(obj)))); let py_seq = PyDecoderWrapper::Custom(Arc::new(RwLock::new(CustomDecoder::new(obj))));

View File

@ -37,7 +37,7 @@ impl PyEncoding {
e e
)) ))
})?; })?;
Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) Ok(PyBytes::new(py, data.as_bytes()).into())
} }
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

View File

@ -1,6 +1,7 @@
use pyo3::exceptions; use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use pyo3::type_object::PyTypeInfo; use pyo3::type_object::PyTypeInfo;
use std::ffi::CString;
use std::fmt::{Display, Formatter, Result as FmtResult}; use std::fmt::{Display, Formatter, Result as FmtResult};
use tokenizers::tokenizer::Result; use tokenizers::tokenizer::Result;
@ -35,7 +36,7 @@ impl<T> ToPyResult<T> {
} }
pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> { pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> {
let deprecation_warning = py.import_bound("builtins")?.getattr("DeprecationWarning")?; let deprecation_warning = py.import("builtins")?.getattr("DeprecationWarning")?;
let full_message = format!("Deprecated in {}: {}", version, message); let full_message = format!("Deprecated in {}: {}", version, message);
pyo3::PyErr::warn_bound(py, &deprecation_warning, &full_message, 0) pyo3::PyErr::warn(py, &deprecation_warning, &CString::new(full_message)?, 0)
} }

View File

@ -35,10 +35,22 @@ impl PyModel {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> { pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone(); let base = self.clone();
Ok(match *self.model.as_ref().read().unwrap() { Ok(match *self.model.as_ref().read().unwrap() {
ModelWrapper::BPE(_) => Py::new(py, (PyBPE {}, base))?.into_py(py), ModelWrapper::BPE(_) => Py::new(py, (PyBPE {}, base))?
ModelWrapper::WordPiece(_) => Py::new(py, (PyWordPiece {}, base))?.into_py(py), .into_pyobject(py)?
ModelWrapper::WordLevel(_) => Py::new(py, (PyWordLevel {}, base))?.into_py(py), .into_any()
ModelWrapper::Unigram(_) => Py::new(py, (PyUnigram {}, base))?.into_py(py), .into(),
ModelWrapper::WordPiece(_) => Py::new(py, (PyWordPiece {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
ModelWrapper::WordLevel(_) => Py::new(py, (PyWordLevel {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
ModelWrapper::Unigram(_) => Py::new(py, (PyUnigram {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
}) })
} }
} }
@ -105,7 +117,7 @@ impl PyModel {
e e
)) ))
})?; })?;
Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) Ok(PyBytes::new(py, data.as_bytes()).into())
} }
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

View File

@ -54,38 +54,73 @@ impl PyNormalizer {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> { pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone(); let base = self.clone();
Ok(match self.normalizer { Ok(match self.normalizer {
PyNormalizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?.into_py(py), PyNormalizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
PyNormalizerTypeWrapper::Single(ref inner) => match &*inner.as_ref().read().unwrap() { PyNormalizerTypeWrapper::Single(ref inner) => match &*inner.as_ref().read().unwrap() {
PyNormalizerWrapper::Custom(_) => Py::new(py, base)?.into_py(py), PyNormalizerWrapper::Custom(_) => {
Py::new(py, base)?.into_pyobject(py)?.into_any().into()
}
PyNormalizerWrapper::Wrapped(ref inner) => match inner { PyNormalizerWrapper::Wrapped(ref inner) => match inner {
NormalizerWrapper::Sequence(_) => { NormalizerWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
Py::new(py, (PySequence {}, base))?.into_py(py) .into_pyobject(py)?
} .into_any()
.into(),
NormalizerWrapper::BertNormalizer(_) => { NormalizerWrapper::BertNormalizer(_) => {
Py::new(py, (PyBertNormalizer {}, base))?.into_py(py) Py::new(py, (PyBertNormalizer {}, base))?
.into_pyobject(py)?
.into_any()
.into()
} }
NormalizerWrapper::StripNormalizer(_) => { NormalizerWrapper::StripNormalizer(_) => Py::new(py, (PyStrip {}, base))?
Py::new(py, (PyStrip {}, base))?.into_py(py) .into_pyobject(py)?
} .into_any()
NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?.into_py(py), .into(),
NormalizerWrapper::ByteLevel(_) => { NormalizerWrapper::Prepend(_) => Py::new(py, (PyPrepend {}, base))?
Py::new(py, (PyByteLevel {}, base))?.into_py(py) .into_pyobject(py)?
} .into_any()
NormalizerWrapper::StripAccents(_) => { .into(),
Py::new(py, (PyStripAccents {}, base))?.into_py(py) NormalizerWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
} .into_pyobject(py)?
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base))?.into_py(py), .into_any()
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base))?.into_py(py), .into(),
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base))?.into_py(py), NormalizerWrapper::StripAccents(_) => Py::new(py, (PyStripAccents {}, base))?
NormalizerWrapper::NFKD(_) => Py::new(py, (PyNFKD {}, base))?.into_py(py), .into_pyobject(py)?
NormalizerWrapper::Lowercase(_) => { .into_any()
Py::new(py, (PyLowercase {}, base))?.into_py(py) .into(),
} NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base))?
NormalizerWrapper::Precompiled(_) => { .into_pyobject(py)?
Py::new(py, (PyPrecompiled {}, base))?.into_py(py) .into_any()
} .into(),
NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base))?.into_py(py), NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base))?
NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base))?.into_py(py), .into_pyobject(py)?
.into_any()
.into(),
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
NormalizerWrapper::NFKD(_) => Py::new(py, (PyNFKD {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
NormalizerWrapper::Lowercase(_) => Py::new(py, (PyLowercase {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
NormalizerWrapper::Precompiled(_) => Py::new(py, (PyPrecompiled {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
NormalizerWrapper::Replace(_) => Py::new(py, (PyReplace {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
NormalizerWrapper::Nmt(_) => Py::new(py, (PyNmt {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
}, },
}, },
}) })
@ -114,7 +149,7 @@ impl PyNormalizer {
e e
)) ))
})?; })?;
Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) Ok(PyBytes::new(py, data.as_bytes()).into())
} }
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@ -371,8 +406,8 @@ impl PySequence {
)) ))
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
PyTuple::new_bound(py, [PyList::empty_bound(py)]) PyTuple::new(py, [PyList::empty(py)])
} }
fn __len__(&self) -> usize { fn __len__(&self) -> usize {

View File

@ -49,45 +49,69 @@ impl PyPreTokenizer {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> { pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone(); let base = self.clone();
Ok(match &self.pretok { Ok(match &self.pretok {
PyPreTokenizerTypeWrapper::Sequence(_) => { PyPreTokenizerTypeWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
Py::new(py, (PySequence {}, base))?.into_py(py) .into_pyobject(py)?
} .into_any()
.into(),
PyPreTokenizerTypeWrapper::Single(ref inner) => { PyPreTokenizerTypeWrapper::Single(ref inner) => {
match &*inner.as_ref().read().unwrap() { match &*inner.as_ref().read().unwrap() {
PyPreTokenizerWrapper::Custom(_) => Py::new(py, base)?.into_py(py), PyPreTokenizerWrapper::Custom(_) => {
Py::new(py, base)?.into_pyobject(py)?.into_any().into()
}
PyPreTokenizerWrapper::Wrapped(inner) => match inner { PyPreTokenizerWrapper::Wrapped(inner) => match inner {
PreTokenizerWrapper::Whitespace(_) => { PreTokenizerWrapper::Whitespace(_) => Py::new(py, (PyWhitespace {}, base))?
Py::new(py, (PyWhitespace {}, base))?.into_py(py) .into_pyobject(py)?
} .into_any()
PreTokenizerWrapper::Split(_) => { .into(),
Py::new(py, (PySplit {}, base))?.into_py(py) PreTokenizerWrapper::Split(_) => Py::new(py, (PySplit {}, base))?
} .into_pyobject(py)?
.into_any()
.into(),
PreTokenizerWrapper::Punctuation(_) => { PreTokenizerWrapper::Punctuation(_) => {
Py::new(py, (PyPunctuation {}, base))?.into_py(py) Py::new(py, (PyPunctuation {}, base))?
} .into_pyobject(py)?
PreTokenizerWrapper::Sequence(_) => { .into_any()
Py::new(py, (PySequence {}, base))?.into_py(py) .into()
}
PreTokenizerWrapper::Metaspace(_) => {
Py::new(py, (PyMetaspace {}, base))?.into_py(py)
} }
PreTokenizerWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
PreTokenizerWrapper::Metaspace(_) => Py::new(py, (PyMetaspace {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
PreTokenizerWrapper::Delimiter(_) => { PreTokenizerWrapper::Delimiter(_) => {
Py::new(py, (PyCharDelimiterSplit {}, base))?.into_py(py) Py::new(py, (PyCharDelimiterSplit {}, base))?
.into_pyobject(py)?
.into_any()
.into()
} }
PreTokenizerWrapper::WhitespaceSplit(_) => { PreTokenizerWrapper::WhitespaceSplit(_) => {
Py::new(py, (PyWhitespaceSplit {}, base))?.into_py(py) Py::new(py, (PyWhitespaceSplit {}, base))?
} .into_pyobject(py)?
PreTokenizerWrapper::ByteLevel(_) => { .into_any()
Py::new(py, (PyByteLevel {}, base))?.into_py(py) .into()
} }
PreTokenizerWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
PreTokenizerWrapper::BertPreTokenizer(_) => { PreTokenizerWrapper::BertPreTokenizer(_) => {
Py::new(py, (PyBertPreTokenizer {}, base))?.into_py(py) Py::new(py, (PyBertPreTokenizer {}, base))?
} .into_pyobject(py)?
PreTokenizerWrapper::Digits(_) => { .into_any()
Py::new(py, (PyDigits {}, base))?.into_py(py) .into()
} }
PreTokenizerWrapper::Digits(_) => Py::new(py, (PyDigits {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
PreTokenizerWrapper::UnicodeScripts(_) => { PreTokenizerWrapper::UnicodeScripts(_) => {
Py::new(py, (PyUnicodeScripts {}, base))?.into_py(py) Py::new(py, (PyUnicodeScripts {}, base))?
.into_pyobject(py)?
.into_any()
.into()
} }
}, },
} }
@ -118,7 +142,7 @@ impl PyPreTokenizer {
e e
)) ))
})?; })?;
Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) Ok(PyBytes::new(py, data.as_bytes()).into())
} }
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@ -365,8 +389,8 @@ impl PySplit {
)) ))
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
PyTuple::new_bound(py, [" ", "removed"]) PyTuple::new(py, [" ", "removed"])
} }
} }
@ -398,8 +422,8 @@ impl PyCharDelimiterSplit {
)) ))
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
PyTuple::new_bound(py, [" "]) PyTuple::new(py, [" "])
} }
} }
@ -460,8 +484,8 @@ impl PySequence {
)) ))
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
PyTuple::new_bound(py, [PyList::empty_bound(py)]) PyTuple::new(py, [PyList::empty(py)])
} }
fn __getitem__(self_: PyRef<'_, Self>, py: Python<'_>, index: usize) -> PyResult<Py<PyAny>> { fn __getitem__(self_: PyRef<'_, Self>, py: Python<'_>, index: usize) -> PyResult<Py<PyAny>> {
@ -823,7 +847,12 @@ mod test {
let obj = Python::with_gil(|py| { let obj = Python::with_gil(|py| {
let py_wsp = PyPreTokenizer::new(Whitespace {}.into()); let py_wsp = PyPreTokenizer::new(Whitespace {}.into());
let obj: PyObject = Py::new(py, py_wsp).unwrap().into_py(py); let obj: PyObject = Py::new(py, py_wsp)
.unwrap()
.into_pyobject(py)
.unwrap()
.into_any()
.into();
obj obj
}); });
let py_seq: PyPreTokenizerWrapper = let py_seq: PyPreTokenizerWrapper =

View File

@ -41,15 +41,26 @@ impl PyPostProcessor {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> { pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone(); let base = self.clone();
Ok(match self.processor.as_ref() { Ok(match self.processor.as_ref() {
PostProcessorWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?.into_py(py), PostProcessorWrapper::ByteLevel(_) => Py::new(py, (PyByteLevel {}, base))?
PostProcessorWrapper::Bert(_) => Py::new(py, (PyBertProcessing {}, base))?.into_py(py), .into_pyobject(py)?
PostProcessorWrapper::Roberta(_) => { .into_any()
Py::new(py, (PyRobertaProcessing {}, base))?.into_py(py) .into(),
} PostProcessorWrapper::Bert(_) => Py::new(py, (PyBertProcessing {}, base))?
PostProcessorWrapper::Template(_) => { .into_pyobject(py)?
Py::new(py, (PyTemplateProcessing {}, base))?.into_py(py) .into_any()
} .into(),
PostProcessorWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?.into_py(py), PostProcessorWrapper::Roberta(_) => Py::new(py, (PyRobertaProcessing {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
PostProcessorWrapper::Template(_) => Py::new(py, (PyTemplateProcessing {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
PostProcessorWrapper::Sequence(_) => Py::new(py, (PySequence {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
}) })
} }
} }
@ -78,7 +89,7 @@ impl PyPostProcessor {
e e
)) ))
})?; })?;
Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) Ok(PyBytes::new(py, data.as_bytes()).into())
} }
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@ -176,8 +187,8 @@ impl PyBertProcessing {
) )
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
PyTuple::new_bound(py, [("", 0), ("", 0)]) PyTuple::new(py, [("", 0), ("", 0)])
} }
} }
@ -226,8 +237,8 @@ impl PyRobertaProcessing {
) )
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
PyTuple::new_bound(py, [("", 0), ("", 0)]) PyTuple::new(py, [("", 0), ("", 0)])
} }
} }
@ -451,8 +462,8 @@ impl PySequence {
) )
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
PyTuple::new_bound(py, [PyList::empty_bound(py)]) PyTuple::new(py, [PyList::empty(py)])
} }
} }

View File

@ -100,7 +100,7 @@ impl PyAddedToken {
} }
pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> { pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
let dict = PyDict::new_bound(py); let dict = PyDict::new(py);
let token = self.get_token(); let token = self.get_token();
dict.set_item("content", token.content)?; dict.set_item("content", token.content)?;
@ -347,6 +347,7 @@ impl From<PyArrayUnicode> for tk::InputSequence<'_> {
} }
struct PyArrayStr(Vec<String>); struct PyArrayStr(Vec<String>);
impl FromPyObject<'_> for PyArrayStr { impl FromPyObject<'_> for PyArrayStr {
fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> { fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
let array = ob.downcast::<PyArray1<PyObject>>()?; let array = ob.downcast::<PyArray1<PyObject>>()?;
@ -495,7 +496,7 @@ impl PyTokenizer {
e e
)) ))
})?; })?;
Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) Ok(PyBytes::new(py, data.as_bytes()).into())
} }
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@ -513,9 +514,12 @@ impl PyTokenizer {
} }
} }
fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> { fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<Bound<'p, PyTuple>> {
let model = PyModel::from(BPE::default()).into_py(py); let model: PyObject = PyModel::from(BPE::default())
PyTuple::new_bound(py, vec![model]) .into_pyobject(py)?
.into_any()
.into();
PyTuple::new(py, vec![model])
} }
/// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string. /// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
@ -594,14 +598,14 @@ impl PyTokenizer {
token: Option<String>, token: Option<String>,
) -> PyResult<Self> { ) -> PyResult<Self> {
let path = Python::with_gil(|py| -> PyResult<String> { let path = Python::with_gil(|py| -> PyResult<String> {
let huggingface_hub = PyModule::import_bound(py, intern!(py, "huggingface_hub"))?; let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?;
let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?; let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?;
let kwargs = [ let kwargs = [
(intern!(py, "repo_id"), identifier), (intern!(py, "repo_id"), identifier),
(intern!(py, "filename"), "tokenizer.json"), (intern!(py, "filename"), "tokenizer.json"),
(intern!(py, "revision"), &revision), (intern!(py, "revision"), &revision),
] ]
.into_py_dict_bound(py); .into_py_dict(py)?;
if let Some(token) = token { if let Some(token) = token {
kwargs.set_item(intern!(py, "token"), token)?; kwargs.set_item(intern!(py, "token"), token)?;
} }
@ -796,7 +800,7 @@ impl PyTokenizer {
#[getter] #[getter]
fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> { fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
self.tokenizer.get_truncation().map_or(Ok(None), |params| { self.tokenizer.get_truncation().map_or(Ok(None), |params| {
let dict = PyDict::new_bound(py); let dict = PyDict::new(py);
dict.set_item("max_length", params.max_length)?; dict.set_item("max_length", params.max_length)?;
dict.set_item("stride", params.stride)?; dict.set_item("stride", params.stride)?;
@ -906,7 +910,7 @@ impl PyTokenizer {
#[getter] #[getter]
fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> { fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
self.tokenizer.get_padding().map_or(Ok(None), |params| { self.tokenizer.get_padding().map_or(Ok(None), |params| {
let dict = PyDict::new_bound(py); let dict = PyDict::new(py);
dict.set_item( dict.set_item(
"length", "length",
@ -1342,7 +1346,7 @@ impl PyTokenizer {
if let Ok(s) = element.downcast::<PyString>() { if let Ok(s) = element.downcast::<PyString>() {
itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned()))) itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned())))
} else { } else {
match element.iter() { match element.try_iter() {
Ok(iter) => itertools::Either::Left( Ok(iter) => itertools::Either::Left(
iter.map(|i| i?.extract::<String>()) iter.map(|i| i?.extract::<String>())
.collect::<Vec<_>>() .collect::<Vec<_>>()

View File

@ -29,16 +29,22 @@ impl PyTrainer {
pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> { pub(crate) fn get_as_subtype(&self, py: Python<'_>) -> PyResult<PyObject> {
let base = self.clone(); let base = self.clone();
Ok(match *self.trainer.as_ref().read().unwrap() { Ok(match *self.trainer.as_ref().read().unwrap() {
TrainerWrapper::BpeTrainer(_) => Py::new(py, (PyBpeTrainer {}, base))?.into_py(py), TrainerWrapper::BpeTrainer(_) => Py::new(py, (PyBpeTrainer {}, base))?
TrainerWrapper::WordPieceTrainer(_) => { .into_pyobject(py)?
Py::new(py, (PyWordPieceTrainer {}, base))?.into_py(py) .into_any()
} .into(),
TrainerWrapper::WordLevelTrainer(_) => { TrainerWrapper::WordPieceTrainer(_) => Py::new(py, (PyWordPieceTrainer {}, base))?
Py::new(py, (PyWordLevelTrainer {}, base))?.into_py(py) .into_pyobject(py)?
} .into_any()
TrainerWrapper::UnigramTrainer(_) => { .into(),
Py::new(py, (PyUnigramTrainer {}, base))?.into_py(py) TrainerWrapper::WordLevelTrainer(_) => Py::new(py, (PyWordLevelTrainer {}, base))?
} .into_pyobject(py)?
.into_any()
.into(),
TrainerWrapper::UnigramTrainer(_) => Py::new(py, (PyUnigramTrainer {}, base))?
.into_pyobject(py)?
.into_any()
.into(),
}) })
} }
} }
@ -51,7 +57,7 @@ impl PyTrainer {
e e
)) ))
})?; })?;
Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py)) Ok(PyBytes::new(py, data.as_bytes()).into())
} }
fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> { fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

View File

@ -65,8 +65,7 @@ where
pub fn new(iter: &Bound<'_, PyAny>, converter: F, buffer_size: usize) -> PyResult<Self> { pub fn new(iter: &Bound<'_, PyAny>, converter: F, buffer_size: usize) -> PyResult<Self> {
let py = iter.py(); let py = iter.py();
let iter: Py<PyAny> = unsafe { let iter: Py<PyAny> = unsafe {
Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))? Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?.into()
.to_object(py)
}; };
Ok(Self { Ok(Self {

View File

@ -57,7 +57,7 @@ where
Ok(serializer.output) Ok(serializer.output)
} }
impl<'a> ser::Serializer for &'a mut Serializer { impl ser::Serializer for &mut Serializer {
// The output type produced by this `Serializer` during successful // The output type produced by this `Serializer` during successful
// serialization. Most serializers that produce text or binary output should // serialization. Most serializers that produce text or binary output should
// set `Ok = ()` and serialize into an `io::Write` or buffer contained // set `Ok = ()` and serialize into an `io::Write` or buffer contained
@ -355,7 +355,7 @@ impl<'a> ser::Serializer for &'a mut Serializer {
// //
// This impl is SerializeSeq so these methods are called after `serialize_seq` // This impl is SerializeSeq so these methods are called after `serialize_seq`
// is called on the Serializer. // is called on the Serializer.
impl<'a> ser::SerializeSeq for &'a mut Serializer { impl ser::SerializeSeq for &mut Serializer {
// Must match the `Ok` type of the serializer. // Must match the `Ok` type of the serializer.
type Ok = (); type Ok = ();
// Must match the `Error` type of the serializer. // Must match the `Error` type of the serializer.
@ -391,7 +391,7 @@ impl<'a> ser::SerializeSeq for &'a mut Serializer {
} }
// Same thing but for tuples. // Same thing but for tuples.
impl<'a> ser::SerializeTuple for &'a mut Serializer { impl ser::SerializeTuple for &mut Serializer {
type Ok = (); type Ok = ();
type Error = Error; type Error = Error;
@ -423,7 +423,7 @@ impl<'a> ser::SerializeTuple for &'a mut Serializer {
} }
// Same thing but for tuple structs. // Same thing but for tuple structs.
impl<'a> ser::SerializeTupleStruct for &'a mut Serializer { impl ser::SerializeTupleStruct for &mut Serializer {
type Ok = (); type Ok = ();
type Error = Error; type Error = Error;
@ -463,7 +463,7 @@ impl<'a> ser::SerializeTupleStruct for &'a mut Serializer {
// //
// So the `end` method in this impl is responsible for closing both the `]` and // So the `end` method in this impl is responsible for closing both the `]` and
// the `}`. // the `}`.
impl<'a> ser::SerializeTupleVariant for &'a mut Serializer { impl ser::SerializeTupleVariant for &mut Serializer {
type Ok = (); type Ok = ();
type Error = Error; type Error = Error;
@ -502,7 +502,7 @@ impl<'a> ser::SerializeTupleVariant for &'a mut Serializer {
// `serialize_entry` method allows serializers to optimize for the case where // `serialize_entry` method allows serializers to optimize for the case where
// key and value are both available simultaneously. In JSON it doesn't make a // key and value are both available simultaneously. In JSON it doesn't make a
// difference so the default behavior for `serialize_entry` is fine. // difference so the default behavior for `serialize_entry` is fine.
impl<'a> ser::SerializeMap for &'a mut Serializer { impl ser::SerializeMap for &mut Serializer {
type Ok = (); type Ok = ();
type Error = Error; type Error = Error;
@ -559,7 +559,7 @@ impl<'a> ser::SerializeMap for &'a mut Serializer {
// Structs are like maps in which the keys are constrained to be compile-time // Structs are like maps in which the keys are constrained to be compile-time
// constant strings. // constant strings.
impl<'a> ser::SerializeStruct for &'a mut Serializer { impl ser::SerializeStruct for &mut Serializer {
type Ok = (); type Ok = ();
type Error = Error; type Error = Error;
@ -590,7 +590,7 @@ impl<'a> ser::SerializeStruct for &'a mut Serializer {
// Similar to `SerializeTupleVariant`, here the `end` method is responsible for // Similar to `SerializeTupleVariant`, here the `end` method is responsible for
// closing both of the curly braces opened by `serialize_struct_variant`. // closing both of the curly braces opened by `serialize_struct_variant`.
impl<'a> ser::SerializeStructVariant for &'a mut Serializer { impl ser::SerializeStructVariant for &mut Serializer {
type Ok = (); type Ok = ();
type Error = Error; type Error = Error;

View File

@ -55,7 +55,7 @@ unicode-normalization-alignments = "0.1"
unicode_categories = "0.1" unicode_categories = "0.1"
unicode-segmentation = "1.11" unicode-segmentation = "1.11"
indicatif = {version = "0.17", optional = true} indicatif = {version = "0.17", optional = true}
itertools = "0.12" itertools = "0.13"
log = "0.4" log = "0.4"
derive_builder = "0.20" derive_builder = "0.20"
spm_precompiled = "0.1.3" spm_precompiled = "0.1.3"
@ -63,8 +63,8 @@ hf-hub = { version = "0.3.2", optional = true }
aho-corasick = "1.1" aho-corasick = "1.1"
paste = "1.0.14" paste = "1.0.14"
macro_rules_attribute = "0.2.0" macro_rules_attribute = "0.2.0"
thiserror = "1.0.49" thiserror = "2"
fancy-regex = { version = "0.13", optional = true} fancy-regex = { version = "0.14", optional = true}
getrandom = { version = "0.2.10" } getrandom = { version = "0.2.10" }
esaxx-rs = { version = "0.1.10", default-features = false, features=[]} esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
monostate = "0.1.12" monostate = "0.1.12"

View File

@ -28,7 +28,7 @@ impl<'a> OrderedVocabIter<'a> {
} }
} }
impl<'a> Serialize for OrderedVocabIter<'a> { impl Serialize for OrderedVocabIter<'_> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where where
S: Serializer, S: Serializer,

View File

@ -341,7 +341,7 @@ impl Encoding {
.step_by(offset) .step_by(offset)
.filter_map(|stop| { .filter_map(|stop| {
let stop = stop + 1; let stop = stop + 1;
let start = if stop < max_len { 0 } else { stop - max_len }; let start = stop.saturating_sub(max_len);
if start < stop && !end { if start < stop && !end {
end = start == 0; end = start == 0;
Some((start, stop)) Some((start, stop))

View File

@ -233,7 +233,7 @@ impl<'s> From<&'s [String]> for InputSequence<'s> {
} }
} }
impl<'s> From<Vec<String>> for InputSequence<'s> { impl From<Vec<String>> for InputSequence<'_> {
fn from(input: Vec<String>) -> Self { fn from(input: Vec<String>) -> Self {
Self::PreTokenizedOwned(Cow::Owned(input)) Self::PreTokenizedOwned(Cow::Owned(input))
} }

View File

@ -22,7 +22,7 @@ impl SysRegex {
pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>); pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>);
impl<'r, 't> Iterator for Matches<'r, 't> { impl Iterator for Matches<'_, '_> {
type Item = (usize, usize); type Item = (usize, usize);
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {