Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Upgrade pyo3 to 0.16 (#956)
* Upgrade pyo3 to 0.15

  Rebase-conflicts-fixed-by: H. Vetinari <h.vetinari@gmx.com>

* Upgrade pyo3 to 0.16

  Rebase-conflicts-fixed-by: H. Vetinari <h.vetinari@gmx.com>

* Install Python before running cargo clippy

* Fix clippy warnings

* Use `PyArray_Check` instead of downcasting to `PyArray1<u8>`

* Enable `auto-initialize` of pyo3 to fix `cargo test --no-default-features`

* Fix some test cases

  Why did they change?

* Refactor and add SAFETY comments to `PyArrayUnicode`

  Replace deprecated `PyUnicode_FromUnicode` with `PyUnicode_FromKindAndData`

Co-authored-by: messense <messense@icloud.com>
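Most of the diff below is a mechanical migration of pyo3's attribute syntax: inside `#[pyclass(...)]` the exposed class name becomes a string literal (`name=Normalizer` turns into `name = "Normalizer"`), and the free-standing `#[text_signature = "..."]` attribute moves into `#[pyo3(text_signature = "...")]`. A minimal sketch of the before/after shape, using a hypothetical `Example` class rather than any type from this file (assumes pyo3 0.16 as a dependency):

```rust
use pyo3::prelude::*;

// pyo3 0.15 style (the form removed throughout this diff):
//   #[pyclass(module = "tokenizers.normalizers", name=Example)]
//   #[text_signature = "(self)"]
//
// pyo3 0.16 style (the form added throughout this diff): the exposed name is
// a string literal and text_signature lives inside a #[pyo3(...)] attribute.
#[pyclass(module = "tokenizers.normalizers", name = "Example")]
#[pyo3(text_signature = "(self)")]
pub struct Example {}

#[pymethods]
impl Example {
    #[new]
    fn new() -> Self {
        Example {}
    }
}
```

The same rewrite is applied to every normalizer class in the file; the only variation is `PyBertNormalizer`, whose long signature is wrapped in a multi-line `#[pyo3(...)]` block.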
@@ -3,7 +3,6 @@ use std::sync::{Arc, RwLock};
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;
-use pyo3::PySequenceProtocol;
 
 use crate::error::ToPyResult;
 use crate::utils::{PyNormalizedString, PyNormalizedStringRefMut, PyPattern};
@@ -43,7 +42,7 @@ impl PyNormalizedStringMut<'_> {
 ///
 /// This class is not supposed to be instantiated directly. Instead, any implementation of a
 /// Normalizer will return an instance of this class when instantiated.
-#[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)]
+#[pyclass(dict, module = "tokenizers.normalizers", name = "Normalizer", subclass)]
 #[derive(Clone, Serialize, Deserialize)]
 pub struct PyNormalizer {
     #[serde(flatten)]
@@ -144,7 +143,7 @@ impl PyNormalizer {
     ///     normalized (:class:`~tokenizers.NormalizedString`):
     ///         The normalized string on which to apply this
     ///         :class:`~tokenizers.normalizers.Normalizer`
-    #[text_signature = "(self, normalized)"]
+    #[pyo3(text_signature = "(self, normalized)")]
     fn normalize(&self, mut normalized: PyNormalizedStringMut) -> PyResult<()> {
         normalized.normalize_with(&self.normalizer)
     }
@@ -162,7 +161,7 @@ impl PyNormalizer {
     ///
     /// Returns:
     ///     :obj:`str`: A string after normalization
-    #[text_signature = "(self, sequence)"]
+    #[pyo3(text_signature = "(self, sequence)")]
     fn normalize_str(&self, sequence: &str) -> PyResult<String> {
         let mut normalized = NormalizedString::from(sequence);
         ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?;
@@ -217,8 +216,10 @@ macro_rules! setter {
 ///
 ///     lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
 ///         Whether to lowercase.
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
-#[text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")]
+#[pyo3(
+    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"
+)]
 pub struct PyBertNormalizer {}
 #[pymethods]
 impl PyBertNormalizer {
@@ -287,8 +288,8 @@ impl PyBertNormalizer {
 }
 
 /// NFD Unicode Normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFD)]
-#[text_signature = "(self)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFD")]
+#[pyo3(text_signature = "(self)")]
 pub struct PyNFD {}
 #[pymethods]
 impl PyNFD {
@@ -299,8 +300,8 @@ impl PyNFD {
 }
 
 /// NFKD Unicode Normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKD)]
-#[text_signature = "(self)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKD")]
+#[pyo3(text_signature = "(self)")]
 pub struct PyNFKD {}
 #[pymethods]
 impl PyNFKD {
@@ -311,8 +312,8 @@ impl PyNFKD {
 }
 
 /// NFC Unicode Normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFC)]
-#[text_signature = "(self)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFC")]
+#[pyo3(text_signature = "(self)")]
 pub struct PyNFC {}
 #[pymethods]
 impl PyNFC {
@@ -323,8 +324,8 @@ impl PyNFC {
 }
 
 /// NFKC Unicode Normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKC)]
-#[text_signature = "(self)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKC")]
+#[pyo3(text_signature = "(self)")]
 pub struct PyNFKC {}
 #[pymethods]
 impl PyNFKC {
@@ -340,7 +341,7 @@ impl PyNFKC {
 /// Args:
 ///     normalizers (:obj:`List[Normalizer]`):
 ///         A list of Normalizer to be run as a sequence
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Sequence")]
 pub struct PySequence {}
 #[pymethods]
 impl PySequence {
@@ -363,18 +364,15 @@ impl PySequence {
     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
         PyTuple::new(py, &[PyList::empty(py)])
     }
-}
 
-#[pyproto]
-impl PySequenceProtocol for PySequence {
     fn __len__(&self) -> usize {
         0
     }
 }
 
 /// Lowercase Normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Lowercase)]
-#[text_signature = "(self)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Lowercase")]
+#[pyo3(text_signature = "(self)")]
 pub struct PyLowercase {}
 #[pymethods]
 impl PyLowercase {
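Besides the attribute rewrite, the hunk above drops the `#[pyproto]`-based `PySequenceProtocol` implementation (its import is removed in the first hunk) and keeps `__len__` inside the regular `#[pymethods]` block instead. A minimal sketch of the resulting shape, with a hypothetical `SequenceSketch` type standing in for `PySequence` (assumes pyo3 0.16):

```rust
use pyo3::prelude::*;
use pyo3::types::{PyList, PyTuple};

#[pyclass(module = "tokenizers.normalizers", name = "Sequence")]
pub struct SequenceSketch {}

#[pymethods]
impl SequenceSketch {
    // Unchanged context from the diff: pickling support via __getnewargs__.
    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
        PyTuple::new(py, &[PyList::empty(py)])
    }

    // Previously declared under `#[pyproto] impl PySequenceProtocol for ...`;
    // pyo3 0.16 accepts this slot method directly in #[pymethods].
    fn __len__(&self) -> usize {
        0
    }
}
```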
@@ -385,8 +383,8 @@ impl PyLowercase {
 }
 
 /// Strip normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Strip)]
-#[text_signature = "(self, left=True, right=True)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Strip")]
+#[pyo3(text_signature = "(self, left=True, right=True)")]
 pub struct PyStrip {}
 #[pymethods]
 impl PyStrip {
@@ -418,8 +416,8 @@ impl PyStrip {
 }
 
 /// StripAccents normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
-#[text_signature = "(self)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")]
+#[pyo3(text_signature = "(self)")]
 pub struct PyStripAccents {}
 #[pymethods]
 impl PyStripAccents {
@@ -430,8 +428,8 @@ impl PyStripAccents {
 }
 
 /// Nmt normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)]
-#[text_signature = "(self)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Nmt")]
+#[pyo3(text_signature = "(self)")]
 pub struct PyNmt {}
 #[pymethods]
 impl PyNmt {
@@ -443,8 +441,8 @@ impl PyNmt {
 
 /// Precompiled normalizer
 /// Don't use manually it is used for compatiblity for SentencePiece.
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)]
-#[text_signature = "(self, precompiled_charsmap)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
+#[pyo3(text_signature = "(self, precompiled_charsmap)")]
 pub struct PyPrecompiled {}
 #[pymethods]
 impl PyPrecompiled {
@@ -466,8 +464,8 @@ impl PyPrecompiled {
 }
 
 /// Replace normalizer
-#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)]
-#[text_signature = "(self, pattern, content)"]
+#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Replace")]
+#[pyo3(text_signature = "(self, pattern, content)")]
 pub struct PyReplace {}
 #[pymethods]
 impl PyReplace {
@@ -630,8 +628,8 @@ mod test {
         let py_nfc = py_norm.get_as_subtype().unwrap();
         let gil = Python::acquire_gil();
         assert_eq!(
-            "tokenizers.normalizers.NFC",
-            py_nfc.as_ref(gil.python()).get_type().name()
+            "NFC",
+            py_nfc.as_ref(gil.python()).get_type().name().unwrap()
         );
     }
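The test hunk above reflects two pyo3 0.16 behaviours: `get_type().name()` returns a `PyResult` and therefore gets unwrapped, and the expected value is the bare class name rather than a module-qualified path. A standalone sketch of the same check against a built-in type (assumes pyo3 0.16 with its `auto-initialize` feature, which this commit enables so that `cargo test --no-default-features` can start an interpreter):

```rust
use pyo3::prelude::*;
use pyo3::types::PyDict;

fn main() {
    // An embedded interpreter is required; pyo3's `auto-initialize` feature
    // takes care of starting it.
    Python::with_gil(|py| {
        let dict = PyDict::new(py);
        // In pyo3 0.16, name() is fallible and yields the short type name.
        assert_eq!("dict", dict.get_type().name().unwrap());
    });
}
```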