Upgrade pyo3 to 0.16 (#956)

* Upgrade pyo3 to 0.15

Rebase-conflicts-fixed-by: H. Vetinari <h.vetinari@gmx.com>

* Upgrade pyo3 to 0.16

Rebase-conflicts-fixed-by: H. Vetinari <h.vetinari@gmx.com>
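
Most of the churn in the diff below is this attribute-syntax migration: pyo3 0.16 requires `name` to be a string literal and moves `text_signature` into a `#[pyo3(...)]` attribute. A minimal before/after sketch of the pattern (the empty struct body and the `(self)` signature are placeholders, not the real trainer code):

```rust
use pyo3::prelude::*;

// pyo3 0.15 style, rejected by 0.16:
//   #[pyclass(module = "tokenizers.trainers", name=Trainer, subclass)]
//   #[text_signature = "(self)"]
//
// pyo3 0.16 style: `name` is a string literal and `text_signature` lives
// inside a `#[pyo3(...)]` attribute.
#[pyclass(module = "tokenizers.trainers", name = "Trainer", subclass)]
#[pyo3(text_signature = "(self)")]
pub struct PyTrainer {}

#[pymethods]
impl PyTrainer {
    #[new]
    fn new() -> Self {
        PyTrainer {}
    }
}
```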

* Install Python before running cargo clippy

* Fix clippy warnings

* Use `PyArray_Check` instead of downcasting to `PyArray1<u8>`
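
The dtype-specific downcast only recognises 1-D `u8` arrays, while `PyArray_Check` matches any ndarray regardless of dtype or shape. A rough sketch of why the old check was too narrow; the helper `looks_like_u8_array` and the `main` driver are illustrative, and the actual replacement goes through rust-numpy's C-API wrapper rather than a dtype-specific extract:

```rust
use numpy::PyArray1;
use pyo3::prelude::*;

// Old-style check: only a 1-D uint8 array passes, so any other dtype or shape
// is rejected even though it is still a numpy array. The replacement asks
// numpy's C-level `PyArray_Check` instead, which accepts every ndarray.
fn looks_like_u8_array(obj: &PyAny) -> bool {
    obj.extract::<&PyArray1<u8>>().is_ok()
}

fn main() -> PyResult<()> {
    // Relies on pyo3's `auto-initialize` feature; otherwise call
    // `pyo3::prepare_freethreaded_python()` first.
    Python::with_gil(|py| {
        let np = py.import("numpy")?;
        let as_u8 = np.getattr("zeros")?.call1((3, "uint8"))?;
        let as_i64 = np.getattr("zeros")?.call1((3, "int64"))?;
        assert!(looks_like_u8_array(as_u8)); // exact dtype: accepted
        assert!(!looks_like_u8_array(as_i64)); // still an ndarray, but rejected
        Ok(())
    })
}
```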

* Enable pyo3's `auto-initialize` feature to fix `cargo test --no-default-features`
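
Without `auto-initialize`, a program embedding Python must start the interpreter by hand before `Python::with_gil` can be used, which is what broke `cargo test --no-default-features`; with the feature enabled, pyo3 initializes the interpreter on first use. A minimal sketch (the test name and body are illustrative):

```rust
// Cargo.toml sketch: pyo3 = { version = "0.16", features = ["auto-initialize"] }
use pyo3::prelude::*;

#[test]
fn interpreter_is_available() -> PyResult<()> {
    // With `auto-initialize`, `with_gil` starts the interpreter on demand;
    // without it, `pyo3::prepare_freethreaded_python()` must be called first.
    Python::with_gil(|py| {
        let sum: i64 = py.eval("1 + 2", None, None)?.extract()?;
        assert_eq!(sum, 3);
        Ok(())
    })
}
```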

* Fix some test cases

Why did these tests change?

* Refactor and add SAFETY comments to `PyArrayUnicode`

Replace deprecated `PyUnicode_FromUnicode` with `PyUnicode_FromKindAndData`
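
`PyUnicode_FromUnicode` has long been deprecated by CPython, so the UTF-32 buffers backing `PyArrayUnicode` are now turned into Python strings with `PyUnicode_FromKindAndData`, and the unsafe blocks got SAFETY comments. A rough sketch of the pattern; the helper `str_from_utf32` and the local `PY_UNICODE_4BYTE_KIND` constant are illustrative, not code from this repository:

```rust
use std::os::raw::{c_int, c_void};

use pyo3::{ffi, PyObject, PyResult, Python};

// Value of CPython's `PyUnicode_4BYTE_KIND`: each buffer element is one
// UTF-32 code point.
const PY_UNICODE_4BYTE_KIND: c_int = 4;

/// Build a Python `str` from a UTF-32 buffer.
fn str_from_utf32(py: Python, chars: &[char]) -> PyResult<PyObject> {
    // SAFETY: `chars` is a valid, initialised buffer of `chars.len()` UTF-32
    // code points (every Rust `char` is a valid Unicode scalar value), and the
    // pointer is only read for the duration of this call while the GIL is held.
    let ptr = unsafe {
        ffi::PyUnicode_FromKindAndData(
            PY_UNICODE_4BYTE_KIND,
            chars.as_ptr() as *const c_void,
            chars.len() as ffi::Py_ssize_t,
        )
    };
    // SAFETY: `ptr` is either NULL (mapped to Err) or a new owned reference.
    unsafe { PyObject::from_owned_ptr_or_err(py, ptr) }
}
```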

Co-authored-by: messense <messense@icloud.com>
h-vetinari
2022-05-06 00:48:40 +11:00
committed by GitHub
parent 6533bf0fad
commit 519cc13be0
19 changed files with 620 additions and 620 deletions

@@ -15,7 +15,7 @@ use tokenizers as tk;
///
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
/// Trainer will return an instance of this class when instantiated.
#[pyclass(module = "tokenizers.trainers", name=Trainer, subclass)]
#[pyclass(module = "tokenizers.trainers", name = "Trainer", subclass)]
#[derive(Clone, Deserialize, Serialize)]
pub struct PyTrainer {
#[serde(flatten)]
@@ -164,7 +164,7 @@ macro_rules! setter {
///
/// end_of_word_suffix (:obj:`str`, `optional`):
/// A suffix to be used for every subword that is a end-of-word.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=BpeTrainer)]
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "BpeTrainer")]
pub struct PyBpeTrainer {}
#[pymethods]
impl PyBpeTrainer {
@@ -367,8 +367,10 @@ impl PyBpeTrainer {
///
/// end_of_word_suffix (:obj:`str`, `optional`):
/// A suffix to be used for every subword that is a end-of-word.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordPieceTrainer)]
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
#[pyo3(
text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
)]
pub struct PyWordPieceTrainer {}
#[pymethods]
impl PyWordPieceTrainer {
@@ -557,7 +559,7 @@ impl PyWordPieceTrainer {
///
/// special_tokens (:obj:`List[Union[str, AddedToken]]`):
/// A list of special tokens the model should know of.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordLevelTrainer)]
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordLevelTrainer")]
pub struct PyWordLevelTrainer {}
#[pymethods]
impl PyWordLevelTrainer {
@@ -713,8 +715,10 @@ impl PyWordLevelTrainer {
/// n_sub_iterations (:obj:`int`):
/// The number of iterations of the EM algorithm to perform before
/// pruning the vocabulary.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=UnigramTrainer)]
#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"]
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
#[pyo3(
text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
)]
pub struct PyUnigramTrainer {}
#[pymethods]
impl PyUnigramTrainer {
@@ -864,8 +868,8 @@ mod tests {
let py_bpe = py_trainer.get_as_subtype().unwrap();
let gil = Python::acquire_gil();
assert_eq!(
"tokenizers.trainers.BpeTrainer",
py_bpe.as_ref(gil.python()).get_type().name()
"BpeTrainer",
py_bpe.as_ref(gil.python()).get_type().name().unwrap()
);
}
}