diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index a92d7362..45e040ce 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -10,12 +10,12 @@ crate-type = ["cdylib"]
 
 [dependencies]
 rayon = "1.10"
-serde = { version = "1.0", features = [ "rc", "derive" ]}
+serde = { version = "1.0", features = ["rc", "derive"] }
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
-pyo3 = { version = "0.21" }
-numpy = "0.21"
+pyo3 = { version = "0.22", features = ["py-clone"] }
+numpy = "0.22"
 ndarray = "0.15"
 itertools = "0.12"
 
@@ -24,7 +24,7 @@ path = "../../tokenizers"
 
 [dev-dependencies]
 tempfile = "3.10"
-pyo3 = { version = "0.21", features = ["auto-initialize"] }
+pyo3 = { version = "0.22", features = ["auto-initialize", "py-clone"] }
 
 [features]
 defaut = ["pyo3/extension-module"]
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 1a03a772..ab4ac066 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -88,9 +88,9 @@ impl PyDecoder {
     }
 
     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyBytes>(py) {
+        match state.extract::<&[u8]>(py) {
             Ok(s) => {
-                self.decoder = serde_json::from_slice(s.as_bytes()).map_err(|e| {
+                self.decoder = serde_json::from_slice(s).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to unpickle Decoder: {}",
                         e
diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs
index 3ca48d56..9d7fbde7 100644
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@@ -41,9 +41,9 @@ impl PyEncoding {
     }
 
     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyBytes>(py) {
+        match state.extract::<&[u8]>(py) {
             Ok(s) => {
-                self.encoding = serde_json::from_slice(s.as_bytes()).map_err(|e| {
+                self.encoding = serde_json::from_slice(s).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to unpickle Encoding: {}",
                         e
diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs
index 424be9f5..91b8fe1b 100644
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@@ -109,9 +109,9 @@ impl PyModel {
     }
 
     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyBytes>(py) {
+        match state.extract::<&[u8]>(py) {
             Ok(s) => {
-                self.model = serde_json::from_slice(s.as_bytes()).map_err(|e| {
+                self.model = serde_json::from_slice(s).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to unpickle Model: {}",
                         e
@@ -181,7 +181,7 @@
     ///
     /// Returns:
     ///     :obj:`List[str]`: The list of saved files
-    #[pyo3(text_signature = "(self, folder, prefix)")]
+    #[pyo3(signature = (folder, prefix=None, name=None), text_signature = "(self, folder, prefix)")]
     fn save<'a>(
         &self,
         py: Python<'_>,
@@ -835,7 +835,7 @@ pub struct PyUnigram {}
 #[pymethods]
 impl PyUnigram {
     #[new]
-    #[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
+    #[pyo3(signature = (vocab=None, unk_id=None, byte_fallback=None), text_signature = "(self, vocab, unk_id, byte_fallback)")]
     fn new(
         vocab: Option<Vec<(String, f64)>>,
         unk_id: Option<usize>,
diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index feff9811..7b592690 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -118,9 +118,9 @@ impl PyNormalizer {
     }
 
     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyBytes>(py) {
+        match state.extract::<&[u8]>(py) {
             Ok(s) => {
-                self.normalizer = serde_json::from_slice(s.as_bytes()).map_err(|e| {
+                self.normalizer = serde_json::from_slice(s).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to unpickle Normalizer: {}",
                         e
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 1c43f7eb..92a40fc7 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -122,9 +122,9 @@ impl PyPreTokenizer {
     }
 
     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyBytes>(py) {
+        match state.extract::<&[u8]>(py) {
             Ok(s) => {
-                let unpickled = serde_json::from_slice(s.as_bytes()).map_err(|e| {
+                let unpickled = serde_json::from_slice(s).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to unpickle PreTokenizer: {}",
                         e
diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 1d8e8dfa..474f1e8f 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -82,9 +82,9 @@ impl PyPostProcessor {
     }
 
     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyBytes>(py) {
+        match state.extract::<&[u8]>(py) {
             Ok(s) => {
-                self.processor = serde_json::from_slice(s.as_bytes()).map_err(|e| {
+                self.processor = serde_json::from_slice(s).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to unpickle PostProcessor: {}",
                         e
@@ -272,7 +272,7 @@ impl From<PySpecialToken> for SpecialToken {
 }
 
 impl FromPyObject<'_> for PySpecialToken {
-    fn extract(ob: &PyAny) -> PyResult<Self> {
+    fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
         if let Ok(v) = ob.extract::<(String, u32)>() {
             Ok(Self(v.into()))
         } else if let Ok(v) = ob.extract::<(u32, String)>() {
@@ -312,7 +312,7 @@ impl From<PyTemplate> for Template {
 }
 
 impl FromPyObject<'_> for PyTemplate {
-    fn extract(ob: &PyAny) -> PyResult<Self> {
+    fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
         if let Ok(s) = ob.extract::<&str>() {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index 00dacf41..7fd03ae8 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -2,7 +2,7 @@ use serde::Serialize;
 use std::collections::{hash_map::DefaultHasher, HashMap};
 use std::hash::{Hash, Hasher};
 
-use numpy::{npyffi, PyArray1};
+use numpy::{npyffi, PyArray1, PyArrayMethods};
 use pyo3::class::basic::CompareOp;
 use pyo3::exceptions;
 use pyo3::intern;
@@ -156,7 +156,7 @@ impl PyAddedToken {
     }
 
     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyDict>(py) {
+        match state.downcast_bound::<PyDict>(py) {
             Ok(state) => {
                 for (key, value) in state {
                     let key: &str = key.extract()?;
@@ -172,7 +172,7 @@
                 }
                 Ok(())
             }
-            Err(e) => Err(e),
+            Err(e) => Err(e.into()),
         }
     }
 
@@ -263,10 +263,10 @@
 
 struct TextInputSequence<'s>(tk::InputSequence<'s>);
 impl<'s> FromPyObject<'s> for TextInputSequence<'s> {
-    fn extract(ob: &'s PyAny) -> PyResult<Self> {
+    fn extract_bound(ob: &Bound<'s, PyAny>) -> PyResult<Self> {
         let err = exceptions::PyTypeError::new_err("TextInputSequence must be str");
-        if let Ok(s) = ob.downcast::<PyString>() {
-            Ok(Self(s.to_string_lossy().into()))
+        if let Ok(s) = ob.extract::<String>() {
+            Ok(Self(s.into()))
         } else {
             Err(err)
         }
@@ -280,7 +280,7 @@ impl<'s> From<TextInputSequence<'s>> for tk::InputSequence<'s> {
 
 struct PyArrayUnicode(Vec<String>);
 impl FromPyObject<'_> for PyArrayUnicode {
-    fn extract(ob: &PyAny) -> PyResult<Self> {
+    fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
         // SAFETY Making sure the pointer is a valid numpy array requires calling numpy C code
         if unsafe { npyffi::PyArray_Check(ob.py(), ob.as_ptr()) } == 0 {
             return Err(exceptions::PyTypeError::new_err("Expected an np.array"));
@@ -291,8 +291,8 @@
             let desc = (*arr).descr;
             (
                 (*desc).type_num,
-                (*desc).elsize as usize,
-                (*desc).alignment as usize,
+                npyffi::PyDataType_ELSIZE(ob.py(), desc) as usize,
+                npyffi::PyDataType_ALIGNMENT(ob.py(), desc) as usize,
                 (*arr).data,
                 (*arr).nd,
                 (*arr).flags,
@@ -347,7 +347,7 @@ impl From<PyArrayUnicode> for tk::InputSequence<'_> {
 
 struct PyArrayStr(Vec<String>);
 impl FromPyObject<'_> for PyArrayStr {
-    fn extract(ob: &PyAny) -> PyResult<Self> {
+    fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
         let array = ob.downcast::<PyArray1<PyObject>>()?;
         let seq = array
             .readonly()
@@ -370,7 +370,7 @@ impl From<PyArrayStr> for tk::InputSequence<'_> {
 
 struct PreTokenizedInputSequence<'s>(tk::InputSequence<'s>);
 impl<'s> FromPyObject<'s> for PreTokenizedInputSequence<'s> {
-    fn extract(ob: &'s PyAny) -> PyResult<Self> {
+    fn extract_bound(ob: &Bound<'s, PyAny>) -> PyResult<Self> {
         if let Ok(seq) = ob.extract::<Vec<TextInputSequence>>() {
             return Ok(Self(seq.into()));
         }
@@ -400,17 +400,17 @@ impl<'s> From<PreTokenizedInputSequence<'s>> for tk::InputSequence<'s> {
 
 struct TextEncodeInput<'s>(tk::EncodeInput<'s>);
 impl<'s> FromPyObject<'s> for TextEncodeInput<'s> {
-    fn extract(ob: &'s PyAny) -> PyResult<Self> {
+    fn extract_bound(ob: &Bound<'s, PyAny>) -> PyResult<Self> {
         if let Ok(i) = ob.extract::<TextInputSequence>() {
             return Ok(Self(i.into()));
         }
         if let Ok((i1, i2)) = ob.extract::<(TextInputSequence, TextInputSequence)>() {
             return Ok(Self((i1, i2).into()));
         }
-        if let Ok(arr) = ob.extract::<Vec<&PyAny>>() {
+        if let Ok(arr) = ob.downcast::<PyList>() {
             if arr.len() == 2 {
-                let first = arr[0].extract::<TextInputSequence>()?;
-                let second = arr[1].extract::<TextInputSequence>()?;
+                let first = arr.get_item(0)?.extract::<TextInputSequence>()?;
+                let second = arr.get_item(1)?.extract::<TextInputSequence>()?;
                 return Ok(Self((first, second).into()));
             }
         }
@@ -426,7 +426,7 @@ impl<'s> From<TextEncodeInput<'s>> for tk::tokenizer::EncodeInput<'s> {
 }
 struct PreTokenizedEncodeInput<'s>(tk::EncodeInput<'s>);
 impl<'s> FromPyObject<'s> for PreTokenizedEncodeInput<'s> {
-    fn extract(ob: &'s PyAny) -> PyResult<Self> {
+    fn extract_bound(ob: &Bound<'s, PyAny>) -> PyResult<Self> {
         if let Ok(i) = ob.extract::<PreTokenizedInputSequence>() {
             return Ok(Self(i.into()));
         }
@@ -434,10 +434,10 @@
         {
             return Ok(Self((i1, i2).into()));
         }
-        if let Ok(arr) = ob.extract::<Vec<&PyAny>>() {
+        if let Ok(arr) = ob.downcast::<PyList>() {
             if arr.len() == 2 {
-                let first = arr[0].extract::<PreTokenizedInputSequence>()?;
-                let second = arr[1].extract::<PreTokenizedInputSequence>()?;
+                let first = arr.get_item(0)?.extract::<PreTokenizedInputSequence>()?;
+                let second = arr.get_item(1)?.extract::<PreTokenizedInputSequence>()?;
                 return Ok(Self((first, second).into()));
             }
         }
@@ -498,9 +498,9 @@ impl PyTokenizer {
     }
 
    fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyBytes>(py) {
+        match state.extract::<&[u8]>(py) {
             Ok(s) => {
-                self.tokenizer = serde_json::from_slice(s.as_bytes()).map_err(|e| {
+                self.tokenizer = serde_json::from_slice(s).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to unpickle Tokenizer: {}",
                         e
@@ -1030,7 +1030,7 @@
     fn encode_batch(
         &self,
         py: Python<'_>,
-        input: Vec<&PyAny>,
+        input: Bound<'_, PyList>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<Vec<PyEncoding>> {
@@ -1091,7 +1091,7 @@
     fn encode_batch_fast(
         &self,
         py: Python<'_>,
-        input: Vec<&PyAny>,
+        input: Bound<'_, PyList>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<Vec<PyEncoding>> {
diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index c7144229..d4c7e615 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -55,9 +55,9 @@ impl PyTrainer {
     }
 
     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
-        match state.extract::<&PyBytes>(py) {
+        match state.extract::<&[u8]>(py) {
             Ok(s) => {
-                let unpickled = serde_json::from_slice(s.as_bytes()).map_err(|e| {
+                let unpickled = serde_json::from_slice(s).map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to unpickle PyTrainer: {}",
                         e
diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs
index b67dcff9..107d0a27 100644
--- a/bindings/python/src/utils/normalization.rs
+++ b/bindings/python/src/utils/normalization.rs
@@ -60,7 +60,7 @@ pub enum PyRange<'s> {
     #[pyo3(annotation = "Tuple[uint, uint]")]
     Range(usize, usize),
     #[pyo3(annotation = "slice")]
-    Slice(&'s PySlice),
+    Slice(Bound<'s, PySlice>),
 }
 impl PyRange<'_> {
     pub fn to_range(&self, max_len: usize) -> PyResult<std::ops::Range<usize>> {
@@ -83,7 +83,7 @@
             }
             PyRange::Range(s, e) => Ok(*s..*e),
             PyRange::Slice(s) => {
-                let r = s.indices(max_len as std::os::raw::c_long)?;
+                let r = s.indices(max_len.try_into()?)?;
                 Ok(r.start as usize..r.stop as usize)
             }
         }
@@ -94,7 +94,7 @@
 pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);
 
 impl FromPyObject<'_> for PySplitDelimiterBehavior {
-    fn extract(obj: &PyAny) -> PyResult<Self> {
+    fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
         let s = obj.extract::<&str>()?;
 
         Ok(Self(match s {
diff --git a/bindings/python/src/utils/pretokenization.rs b/bindings/python/src/utils/pretokenization.rs
index 70444aac..a9879ad9 100644
--- a/bindings/python/src/utils/pretokenization.rs
+++ b/bindings/python/src/utils/pretokenization.rs
@@ -56,7 +56,7 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     ToPyResult(pretok.tokenize(|normalized| {
         let output = func.call((normalized.get(),), None)?;
         Ok(output
-            .extract::<&PyList>()?
+            .extract::<Bound<'_, PyList>>()?
             .into_iter()
             .map(|obj| Ok(Token::from(obj.extract::<PyToken>()?)))
             .collect::<tk::Result<Vec<_>>>()?)
@@ -69,7 +69,7 @@
 #[derive(Clone)]
 pub struct PyOffsetReferential(OffsetReferential);
 impl FromPyObject<'_> for PyOffsetReferential {
-    fn extract(obj: &PyAny) -> PyResult<Self> {
+    fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
         let s = obj.extract::<&str>()?;
 
         Ok(Self(match s {
@@ -85,7 +85,7 @@
 #[derive(Clone)]
 pub struct PyOffsetType(OffsetType);
 impl FromPyObject<'_> for PyOffsetType {
-    fn extract(obj: &PyAny) -> PyResult<Self> {
+    fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
         let s = obj.extract::<&str>()?;
 
         Ok(Self(match s {
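The recurring change across these files is the pyo3 0.21 to 0.22 migration off the GIL-ref API: `FromPyObject::extract(&PyAny)` becomes `extract_bound(&Bound<'_, PyAny>)`, `Vec<&PyAny>` arguments become `Bound<'_, PyList>` traversed with `get_item`, and pickled state is read directly as `&[u8]` instead of going through `&PyBytes::as_bytes`. A minimal sketch of that pattern, assuming a hypothetical `PairOfStrings` type that is not part of this codebase:

```rust
use pyo3::prelude::*;
use pyo3::types::PyList;

/// Hypothetical example type, not part of the tokenizers bindings.
struct PairOfStrings(String, String);

impl FromPyObject<'_> for PairOfStrings {
    // pyo3 0.22: the trait entry point receives a `&Bound<'_, PyAny>` smart
    // pointer instead of the old GIL-ref `&PyAny`.
    fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
        // `downcast` yields `&Bound<'_, PyList>`; items are fetched with
        // `get_item(i)?` rather than indexing into an extracted `Vec<&PyAny>`.
        let list = ob.downcast::<PyList>()?;
        if list.len() == 2 {
            let first = list.get_item(0)?.extract::<String>()?;
            let second = list.get_item(1)?.extract::<String>()?;
            return Ok(Self(first, second));
        }
        Err(pyo3::exceptions::PyTypeError::new_err(
            "expected a list of two strings",
        ))
    }
}
```

The new `py-clone` feature in Cargo.toml belongs to the same migration: pyo3 0.22 only implements `Clone` for `Py<T>` when that feature is enabled.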