mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Upgrade pyo3 to 0.16 (#956)
* Upgrade pyo3 to 0.15 Rebase-conflicts-fixed-by: H. Vetinari <h.vetinari@gmx.com> * Upgrade pyo3 to 0.16 Rebase-conflicts-fixed-by: H. Vetinari <h.vetinari@gmx.com> * Install Python before running cargo clippy * Fix clippy warnings * Use `PyArray_Check` instead of downcasting to `PyArray1<u8>` * Enable `auto-initialize` of pyo3 to fix `cargo test --no-default-features` * Fix some test cases Why do they change? * Refactor and add SAFETY comments to `PyArrayUnicode` Replace deprecated `PyUnicode_FromUnicode` with `PyUnicode_FromKindAndData` Co-authored-by: messense <messense@icloud.com>
This commit is contained in:
@ -15,7 +15,7 @@ use tokenizers as tk;
|
||||
///
|
||||
/// This class is not supposed to be instantiated directly. Instead, any implementation of a
|
||||
/// Trainer will return an instance of this class when instantiated.
|
||||
#[pyclass(module = "tokenizers.trainers", name=Trainer)]
|
||||
#[pyclass(module = "tokenizers.trainers", name = "Trainer", subclass)]
|
||||
#[derive(Clone, Deserialize, Serialize)]
|
||||
pub struct PyTrainer {
|
||||
#[serde(flatten)]
|
||||
@ -164,7 +164,7 @@ macro_rules! setter {
|
||||
///
|
||||
/// end_of_word_suffix (:obj:`str`, `optional`):
|
||||
/// A suffix to be used for every subword that is an end-of-word.
|
||||
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=BpeTrainer)]
|
||||
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "BpeTrainer")]
|
||||
pub struct PyBpeTrainer {}
|
||||
#[pymethods]
|
||||
impl PyBpeTrainer {
|
||||
@ -367,8 +367,10 @@ impl PyBpeTrainer {
|
||||
///
|
||||
/// end_of_word_suffix (:obj:`str`, `optional`):
|
||||
/// A suffix to be used for every subword that is an end-of-word.
|
||||
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordPieceTrainer)]
|
||||
#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"]
|
||||
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
|
||||
#[pyo3(
|
||||
text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
|
||||
)]
|
||||
pub struct PyWordPieceTrainer {}
|
||||
#[pymethods]
|
||||
impl PyWordPieceTrainer {
|
||||
@ -557,7 +559,7 @@ impl PyWordPieceTrainer {
|
||||
///
|
||||
/// special_tokens (:obj:`List[Union[str, AddedToken]]`):
|
||||
/// A list of special tokens the model should know of.
|
||||
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=WordLevelTrainer)]
|
||||
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordLevelTrainer")]
|
||||
pub struct PyWordLevelTrainer {}
|
||||
#[pymethods]
|
||||
impl PyWordLevelTrainer {
|
||||
@ -713,8 +715,10 @@ impl PyWordLevelTrainer {
|
||||
/// n_sub_iterations (:obj:`int`):
|
||||
/// The number of iterations of the EM algorithm to perform before
|
||||
/// pruning the vocabulary.
|
||||
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name=UnigramTrainer)]
|
||||
#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"]
|
||||
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
|
||||
#[pyo3(
|
||||
text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
|
||||
)]
|
||||
pub struct PyUnigramTrainer {}
|
||||
#[pymethods]
|
||||
impl PyUnigramTrainer {
|
||||
@ -864,8 +868,8 @@ mod tests {
|
||||
let py_bpe = py_trainer.get_as_subtype().unwrap();
|
||||
let gil = Python::acquire_gil();
|
||||
assert_eq!(
|
||||
"tokenizers.trainers.BpeTrainer",
|
||||
py_bpe.as_ref(gil.python()).get_type().name()
|
||||
"BpeTrainer",
|
||||
py_bpe.as_ref(gil.python()).get_type().name().unwrap()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user