From 5c18ec5ff5088a1d9eef77086321f94052cf7af0 Mon Sep 17 00:00:00 2001 From: mert-kurttutan Date: Wed, 8 Mar 2023 11:27:47 +0100 Subject: [PATCH] pyo3 v0.18 migration (#1173) * pyo v0.18 migration * Fix formatting issues of black --- bindings/python/Cargo.toml | 6 +- bindings/python/examples/custom_components.py | 2 +- .../python/examples/train_with_datasets.py | 1 + .../implementations/bert_wordpiece.py | 1 - .../py_src/tokenizers/models/__init__.pyi | 55 +++++++++++++++++++ bindings/python/src/decoders.rs | 21 +++---- bindings/python/src/encoding.rs | 15 +++-- bindings/python/src/models.rs | 13 +++-- bindings/python/src/normalizers.rs | 14 ++--- bindings/python/src/pre_tokenizers.rs | 10 ++-- bindings/python/src/processors.rs | 10 ++-- bindings/python/src/tokenizer.rs | 34 ++++++------ bindings/python/src/trainers.rs | 16 +++--- bindings/python/src/utils/pretokenization.rs | 20 +++---- bindings/python/tests/test_serialization.py | 2 +- 15 files changed, 138 insertions(+), 82 deletions(-) diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 86b1b036..5f7c6c55 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]} serde_json = "1.0" libc = "0.2" env_logger = "0.7.1" -pyo3 = "0.17.2" -numpy = "0.17.2" +pyo3 = "0.18.1" +numpy = "0.18.0" ndarray = "0.13" onig = { version = "6.0", default-features = false } itertools = "0.9" @@ -26,7 +26,7 @@ path = "../../tokenizers" [dev-dependencies] tempfile = "3.1" -pyo3 = { version = "0.17.2", features = ["auto-initialize"] } +pyo3 = { version = "0.18.1", features = ["auto-initialize"] } [features] default = ["pyo3/extension-module"] diff --git a/bindings/python/examples/custom_components.py b/bindings/python/examples/custom_components.py index 0570ec2b..cdb97309 100644 --- a/bindings/python/examples/custom_components.py +++ b/bindings/python/examples/custom_components.py @@ -24,7 +24,7 @@ class JiebaPreTokenizer: # Just an odd example... 
splits = [] last = 0 - for (i, char) in enumerate(str(normalized_string)): + for i, char in enumerate(str(normalized_string)): if char.isnumeric() and int(char) % 2 == 1: splits.append(normalized_string[last:i]) last = i diff --git a/bindings/python/examples/train_with_datasets.py b/bindings/python/examples/train_with_datasets.py index e169b502..7f95ccd2 100644 --- a/bindings/python/examples/train_with_datasets.py +++ b/bindings/python/examples/train_with_datasets.py @@ -11,6 +11,7 @@ bpe_tokenizer.normalizer = normalizers.Lowercase() # Initialize a dataset dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1") + # Build an iterator over this dataset def batch_iterator(): batch_length = 1000 diff --git a/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py b/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py index c7fe6762..1f34e3ca 100644 --- a/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py +++ b/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py @@ -26,7 +26,6 @@ class BertWordPieceTokenizer(BaseTokenizer): lowercase: bool = True, wordpieces_prefix: str = "##", ): - if vocab is not None: tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token))) else: diff --git a/bindings/python/py_src/tokenizers/models/__init__.pyi b/bindings/python/py_src/tokenizers/models/__init__.pyi index a471ba81..d75c7c7e 100644 --- a/bindings/python/py_src/tokenizers/models/__init__.pyi +++ b/bindings/python/py_src/tokenizers/models/__init__.pyi @@ -9,6 +9,17 @@ class Model: This class cannot be constructed directly. Please use one of the concrete models. """ + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID @@ -134,6 +145,17 @@ class BPE(Model): :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files """ pass + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID @@ -222,6 +244,17 @@ class Unigram(Model): def __init__(self, vocab): pass + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID @@ -316,6 +349,17 @@ class WordLevel(Model): :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file """ pass + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. 
+ + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID @@ -428,6 +472,17 @@ class WordPiece(Model): :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file """ pass + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index b6f8c031..5c3e4487 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -149,7 +149,8 @@ pub struct PyByteLevelDec {} #[pymethods] impl PyByteLevelDec { #[new] - fn new() -> (Self, PyDecoder) { + #[pyo3(signature = (**_kwargs))] + fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) { (PyByteLevelDec {}, ByteLevel::default().into()) } } @@ -189,7 +190,7 @@ impl PyWordPieceDec { } #[new] - #[args(prefix = "String::from(\"##\")", cleanup = "true")] + #[pyo3(signature = (prefix = String::from("##"), cleanup = true))] fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) { (PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into()) } @@ -231,7 +232,7 @@ impl PyMetaspaceDec { } #[new] - #[args(replacement = "PyChar('▁')", add_prefix_space = "true")] + #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true))] fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) { ( PyMetaspaceDec {}, @@ -262,7 +263,7 @@ impl PyBPEDecoder { } #[new] - #[args(suffix = "String::from(\"\")")] + #[pyo3(signature = (suffix = String::from("")))] fn new(suffix: String) -> (Self, PyDecoder) { (PyBPEDecoder {}, BPEDecoder::new(suffix).into()) } @@ -314,11 +315,11 @@ impl PyCTCDecoder { } #[new] - #[args( - pad_token = "String::from(\"\")", - word_delimiter_token = "String::from(\"|\")", - cleanup = "true" - )] + #[pyo3(signature = ( + pad_token = String::from(""), + word_delimiter_token = String::from("|"), + cleanup = true + ))] fn new(pad_token: String, word_delimiter_token: String, cleanup: bool) -> (Self, PyDecoder) { ( PyCTCDecoder {}, @@ -338,7 +339,7 @@ pub struct PySequenceDecoder {} #[pymethods] impl PySequenceDecoder { #[new] - #[args(decoders)] + #[pyo3(signature = (decoders_py))] fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> { let mut decoders: Vec = Vec::with_capacity(decoders_py.len()); for decoder_py in decoders_py.iter() { diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs index 1b5c0b85..4cbd65b8 100644 --- a/bindings/python/src/encoding.rs +++ b/bindings/python/src/encoding.rs @@ -78,7 +78,7 @@ impl PyEncoding { /// Returns: /// :class:`~tokenizers.Encoding`: The resulting Encoding #[staticmethod] - #[args(growing_offsets = true)] + #[pyo3(signature = (encodings, growing_offsets = true))] #[pyo3(text_signature = "(encodings, growing_offsets=True)")] fn merge(encodings: Vec>, growing_offsets: bool) -> PyEncoding { tk::tokenizer::Encoding::merge( @@ -263,7 +263,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` - #[args(sequence_index = 0)] + #[pyo3(signature = (word_index, sequence_index = 0))] #[pyo3(text_signature = "(self, word_index, sequence_index=0)")] fn word_to_tokens(&self, 
word_index: u32, sequence_index: usize) -> Option<(usize, usize)> { self.encoding.word_to_tokens(word_index, sequence_index) @@ -279,7 +279,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` - #[args(sequence_index = 0)] + #[pyo3(signature = (word_index, sequence_index = 0))] #[pyo3(text_signature = "(self, word_index, sequence_index=0)")] fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option { self.encoding.word_to_chars(word_index, sequence_index) @@ -347,7 +347,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`int`: The index of the token that contains this char in the encoded sequence - #[args(sequence_index = 0)] + #[pyo3(signature = (char_pos, sequence_index = 0))] #[pyo3(text_signature = "(self, char_pos, sequence_index=0)")] fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option { self.encoding.char_to_token(char_pos, sequence_index) @@ -363,7 +363,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`int`: The index of the word that contains this char in the input sequence - #[args(sequence_index = 0)] + #[pyo3(signature = (char_pos, sequence_index = 0))] #[pyo3(text_signature = "(self, char_pos, sequence_index=0)")] fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option { self.encoding.char_to_word(char_pos, sequence_index) @@ -386,7 +386,7 @@ impl PyEncoding { /// /// pad_token (:obj:`str`, defaults to `[PAD]`): /// The pad token to use - #[args(kwargs = "**")] + #[pyo3(signature = (length, **kwargs))] #[pyo3( text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')" )] @@ -439,8 +439,7 @@ impl PyEncoding { /// /// direction (:obj:`str`, defaults to :obj:`right`): /// Truncate direction - #[args(stride = "0")] - #[args(direction = "\"right\"")] + #[pyo3(signature = (max_length, stride = 0, direction = "right"))] #[pyo3(text_signature = "(self, max_length, stride=0, direction='right')")] fn truncate(&mut self, max_length: usize, stride: usize, direction: &str) -> PyResult<()> { let tdir = match direction { diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index c1848ed7..22a518dd 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -215,6 +215,7 @@ impl PyModel { /// /// Returns: /// :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + #[pyo3(text_signature = "(self)")] fn get_trainer(&self, py: Python<'_>) -> PyResult { PyTrainer::from(self.model.read().unwrap().get_trainer()).get_as_subtype(py) } @@ -385,7 +386,7 @@ impl PyBPE { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (vocab=None, merges=None, **kwargs))] fn new( py: Python<'_>, vocab: Option, @@ -472,7 +473,7 @@ impl PyBPE { /// Returns: /// :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files #[classmethod] - #[args(kwargs = "**")] + #[pyo3(signature = (vocab, merges, **kwargs))] #[pyo3(text_signature = "(cls, vocab, merge, **kwargs)")] fn from_file( _cls: &PyType, @@ -582,7 +583,7 @@ impl PyWordPiece { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (vocab=None, **kwargs))] fn new( py: Python<'_>, vocab: Option, @@ -648,7 +649,7 @@ impl PyWordPiece { /// Returns: /// :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file #[classmethod] - #[args(kwargs = "**")] + #[pyo3(signature = (vocab, **kwargs))] #[pyo3(text_signature = "(vocab, **kwargs)")] fn from_file( _cls: &PyType, @@ -693,7 +694,7 @@ impl PyWordLevel { } 
#[new] - #[args(unk_token = "None")] + #[pyo3(signature = (vocab=None, unk_token = None))] fn new( py: Python<'_>, vocab: Option, @@ -768,7 +769,7 @@ impl PyWordLevel { /// Returns: /// :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file #[classmethod] - #[args(unk_token = "None")] + #[pyo3(signature = (vocab, unk_token = None))] #[pyo3(text_signature = "(vocab, unk_token)")] fn from_file( _cls: &PyType, diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs index 956f865b..d825482d 100644 --- a/bindings/python/src/normalizers.rs +++ b/bindings/python/src/normalizers.rs @@ -267,12 +267,12 @@ impl PyBertNormalizer { } #[new] - #[args( - clean_text = "true", - handle_chinese_chars = "true", - strip_accents = "None", - lowercase = "true" - )] + #[pyo3(signature = ( + clean_text = true, + handle_chinese_chars = true, + strip_accents = None, + lowercase = true + ))] fn new( clean_text: bool, handle_chinese_chars: bool, @@ -407,7 +407,7 @@ impl PyStrip { } #[new] - #[args(left = "true", right = "true")] + #[pyo3(signature = (left = true, right = true))] fn new(left: bool, right: bool) -> (Self, PyNormalizer) { (PyStrip {}, Strip::new(left, right).into()) } diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 71f05d7e..18af23d5 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -260,7 +260,7 @@ impl PyByteLevel { } #[new] - #[args(add_prefix_space = "true", use_regex = "true", _kwargs = "**")] + #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs))] fn new( add_prefix_space: bool, use_regex: bool, @@ -340,7 +340,7 @@ pub struct PySplit {} #[pymethods] impl PySplit { #[new] - #[args(invert = false)] + #[pyo3(signature = (pattern, behavior, invert = false))] fn new( pattern: PyPattern, behavior: PySplitDelimiterBehavior, @@ -419,7 +419,7 @@ pub struct PyPunctuation {} #[pymethods] impl PyPunctuation { #[new] - #[args(behavior = "PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)")] + #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)))] fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) { (PyPunctuation {}, Punctuation::new(behavior.into()).into()) } @@ -493,7 +493,7 @@ impl PyMetaspace { } #[new] - #[args(replacement = "PyChar('▁')", add_prefix_space = "true", _kwargs = "**")] + #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs))] fn new( replacement: PyChar, add_prefix_space: bool, @@ -533,7 +533,7 @@ impl PyDigits { } #[new] - #[args(individual_digits = false)] + #[pyo3(signature = (individual_digits = false))] fn new(individual_digits: bool) -> (Self, PyPreTokenizer) { (PyDigits {}, Digits::new(individual_digits).into()) } diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index 442fa034..3a2cbdde 100644 --- a/bindings/python/src/processors.rs +++ b/bindings/python/src/processors.rs @@ -123,7 +123,7 @@ impl PyPostProcessor { /// /// Return: /// :class:`~tokenizers.Encoding`: The final encoding - #[args(pair = "None", add_special_tokens = "true")] + #[pyo3(signature = (encoding, pair = None, add_special_tokens = true))] #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")] fn process( &self, @@ -201,7 +201,7 @@ pub struct PyRobertaProcessing {} #[pymethods] impl PyRobertaProcessing { #[new] - #[args(trim_offsets = true, add_prefix_space = true)] + #[pyo3(signature = (sep, 
cls, trim_offsets = true, add_prefix_space = true))] fn new( sep: (String, u32), cls: (String, u32), @@ -236,7 +236,7 @@ pub struct PyByteLevel {} #[pymethods] impl PyByteLevel { #[new] - #[args(trim_offsets = "None", _kwargs = "**")] + #[pyo3(signature = (trim_offsets = None, **_kwargs))] fn new(trim_offsets: Option, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) { let mut byte_level = ByteLevel::default(); @@ -388,7 +388,7 @@ pub struct PyTemplateProcessing {} #[pymethods] impl PyTemplateProcessing { #[new] - #[args(single = "None", pair = "None", special_tokens = "None")] + #[pyo3(signature = (single = None, pair = None, special_tokens = None))] fn new( single: Option, pair: Option, @@ -427,7 +427,7 @@ pub struct PySequence {} #[pymethods] impl PySequence { #[new] - #[args(processors)] + #[pyo3(signature = (processors_py))] fn new(processors_py: &PyList) -> (Self, PyPostProcessor) { let mut processors: Vec = Vec::with_capacity(processors_py.len()); for n in processors_py.iter() { diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 3a5a8e60..95a954a2 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -128,7 +128,7 @@ impl From for PyAddedToken { #[pymethods] impl PyAddedToken { #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (content=None, **kwargs))] fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult { let mut token = PyAddedToken::from(content.unwrap_or(""), None); @@ -308,7 +308,7 @@ impl FromPyObject<'_> for PyArrayUnicode { ); let py = ob.py(); let obj = PyObject::from_owned_ptr(py, unicode); - let s = obj.cast_as::(py)?; + let s = obj.downcast::(py)?; Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned()) }) .collect::>>()?; @@ -332,7 +332,7 @@ impl FromPyObject<'_> for PyArrayStr { .as_array() .iter() .map(|obj| { - let s = obj.cast_as::(ob.py())?; + let s = obj.downcast::(ob.py())?; Ok(s.to_string_lossy().into_owned()) }) .collect::>>()?; @@ -562,7 +562,7 @@ impl PyTokenizer { /// Returns: /// :class:`~tokenizers.Tokenizer`: The new tokenizer #[staticmethod] - #[args(revision = "String::from(\"main\")", auth_token = "None")] + #[pyo3(signature = (identifier, revision = String::from("main"), auth_token = None))] #[pyo3(text_signature = "(identifier, revision=\"main\", auth_token=None)")] fn from_pretrained( identifier: &str, @@ -591,7 +591,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`str`: A string representing the serialized Tokenizer - #[args(pretty = false)] + #[pyo3(signature = (pretty = false))] #[pyo3(text_signature = "(self, pretty=False)")] fn to_str(&self, pretty: bool) -> PyResult { ToPyResult(self.tokenizer.to_string(pretty)).into() @@ -605,7 +605,7 @@ impl PyTokenizer { /// /// pretty (:obj:`bool`, defaults to :obj:`True`): /// Whether the JSON file should be pretty formatted. 
- #[args(pretty = true)] + #[pyo3(signature = (path, pretty = true))] #[pyo3(text_signature = "(self, path, pretty=True)")] fn save(&self, path: &str, pretty: bool) -> PyResult<()> { ToPyResult(self.tokenizer.save(path, pretty)).into() @@ -629,7 +629,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`Dict[str, int]`: The vocabulary - #[args(with_added_tokens = true)] + #[pyo3(signature = (with_added_tokens = true))] #[pyo3(text_signature = "(self, with_added_tokens=True)")] fn get_vocab(&self, with_added_tokens: bool) -> HashMap { self.tokenizer.get_vocab(with_added_tokens) @@ -643,7 +643,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`int`: The size of the vocabulary - #[args(with_added_tokens = true)] + #[pyo3(signature = (with_added_tokens = true))] #[pyo3(text_signature = "(self, with_added_tokens=True)")] fn get_vocab_size(&self, with_added_tokens: bool) -> usize { self.tokenizer.get_vocab_size(with_added_tokens) @@ -665,7 +665,7 @@ impl PyTokenizer { /// /// direction (:obj:`str`, defaults to :obj:`right`): /// Truncate direction - #[args(kwargs = "**")] + #[pyo3(signature = (max_length, **kwargs))] #[pyo3( text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')" )] @@ -767,7 +767,7 @@ impl PyTokenizer { /// length (:obj:`int`, `optional`): /// If specified, the length at which to pad. If not specified we pad using the size of /// the longest sequence in a batch. - #[args(kwargs = "**")] + #[pyo3(signature = (**kwargs))] #[pyo3( text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)" )] @@ -896,7 +896,7 @@ impl PyTokenizer { /// Returns: /// :class:`~tokenizers.Encoding`: The encoded result /// - #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")] + #[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true))] #[pyo3( text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)" )] @@ -963,7 +963,7 @@ impl PyTokenizer { /// Returns: /// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch /// - #[args(is_pretokenized = "false", add_special_tokens = "true")] + #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))] #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")] fn encode_batch( &self, @@ -1006,7 +1006,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`str`: The decoded string - #[args(skip_special_tokens = true)] + #[pyo3(signature = (ids, skip_special_tokens = true))] #[pyo3(text_signature = "(self, ids, skip_special_tokens=True)")] fn decode(&self, ids: Vec, skip_special_tokens: bool) -> PyResult { ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into() @@ -1023,7 +1023,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`List[str]`: A list of decoded strings - #[args(skip_special_tokens = true)] + #[pyo3(signature = (sequences, skip_special_tokens = true))] #[pyo3(text_signature = "(self, sequences, skip_special_tokens=True)")] fn decode_batch( &self, @@ -1144,7 +1144,7 @@ impl PyTokenizer { /// /// trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`): /// An optional trainer that should be used to train our Model - #[args(trainer = "None")] + #[pyo3(signature = (files, trainer = None))] #[pyo3(text_signature = "(self, files, trainer = None)")] fn train(&mut self, files: Vec, trainer: Option<&mut PyTrainer>) -> PyResult<()> { let mut trainer = @@ -1180,7 +1180,7 @@ impl PyTokenizer { /// 
length (:obj:`int`, `optional`): /// The total number of sequences in the iterator. This is used to /// provide meaningful progress tracking - #[args(trainer = "None", length = "None")] + #[pyo3(signature = (iterator, trainer = None, length = None))] #[pyo3(text_signature = "(self, iterator, trainer=None, length=None)")] fn train_from_iterator( &mut self, @@ -1246,7 +1246,7 @@ impl PyTokenizer { /// /// Returns: /// :class:`~tokenizers.Encoding`: The final post-processed encoding - #[args(pair = "None", add_special_tokens = true)] + #[pyo3(signature = (encoding, pair = None, add_special_tokens = true))] #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")] fn post_process( &self, diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs index 9892c3f8..b72720f4 100644 --- a/bindings/python/src/trainers.rs +++ b/bindings/python/src/trainers.rs @@ -283,7 +283,7 @@ impl PyBpeTrainer { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (**kwargs))] pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::bpe::BpeTrainer::builder(); if let Some(kwargs) = kwargs { @@ -295,7 +295,7 @@ impl PyBpeTrainer { "show_progress" => builder = builder.show_progress(val.extract()?), "special_tokens" => { builder = builder.special_tokens( - val.cast_as::()? + val.downcast::()? .into_iter() .map(|token| { if let Ok(content) = token.extract::() { @@ -489,7 +489,7 @@ impl PyWordPieceTrainer { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (** kwargs))] pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::wordpiece::WordPieceTrainer::builder(); if let Some(kwargs) = kwargs { @@ -501,7 +501,7 @@ impl PyWordPieceTrainer { "show_progress" => builder = builder.show_progress(val.extract()?), "special_tokens" => { builder = builder.special_tokens( - val.cast_as::()? + val.downcast::()? .into_iter() .map(|token| { if let Ok(content) = token.extract::() { @@ -629,7 +629,7 @@ impl PyWordLevelTrainer { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (**kwargs))] pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::wordlevel::WordLevelTrainer::builder(); @@ -648,7 +648,7 @@ impl PyWordLevelTrainer { } "special_tokens" => { builder.special_tokens( - val.cast_as::()? + val.downcast::()? .into_iter() .map(|token| { if let Ok(content) = token.extract::() { @@ -797,7 +797,7 @@ impl PyUnigramTrainer { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (**kwargs))] pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::unigram::UnigramTrainer::builder(); if let Some(kwargs) = kwargs { @@ -821,7 +821,7 @@ impl PyUnigramTrainer { ) } "special_tokens" => builder.special_tokens( - val.cast_as::()? + val.downcast::()? 
.into_iter() .map(|token| { if let Ok(content) = token.extract::() { diff --git a/bindings/python/src/utils/pretokenization.rs b/bindings/python/src/utils/pretokenization.rs index fb692c77..2f8fb00b 100644 --- a/bindings/python/src/utils/pretokenization.rs +++ b/bindings/python/src/utils/pretokenization.rs @@ -223,7 +223,7 @@ impl PyPreTokenizedString { /// /// Returns: /// An Encoding - #[args(type_id = "0", word_idx = "None")] + #[pyo3(signature = (type_id = 0, word_idx = None))] #[pyo3(text_signature = "(self, type_id=0, word_idx=None)")] fn to_encoding(&self, type_id: u32, word_idx: Option) -> PyResult { to_encoding(&self.pretok, type_id, word_idx) @@ -245,10 +245,10 @@ impl PyPreTokenizedString { /// /// Returns /// A list of splits - #[args( - offset_referential = "PyOffsetReferential(OffsetReferential::Original)", - offset_type = "PyOffsetType(OffsetType::Char)" - )] + #[pyo3(signature = ( + offset_referential = PyOffsetReferential(OffsetReferential::Original), + offset_type = PyOffsetType(OffsetType::Char) + ))] #[pyo3(text_signature = "(self, offset_referential=\"original\", offset_type=\"char\")")] fn get_splits( &self, @@ -307,17 +307,17 @@ impl PyPreTokenizedStringRefMut { .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)? } - #[args(type_id = "0", word_idx = "None")] + #[pyo3(signature = (type_id = 0, word_idx = None))] fn to_encoding(&self, type_id: u32, word_idx: Option) -> PyResult { self.inner .map(|pretok| to_encoding(pretok, type_id, word_idx)) .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)? } - #[args( - offset_referential = "PyOffsetReferential(OffsetReferential::Original)", - offset_type = "PyOffsetType(OffsetType::Char)" - )] + #[pyo3(signature = ( + offset_referential = PyOffsetReferential(OffsetReferential::Original), + offset_type = PyOffsetType(OffsetType::Char) + ))] fn get_splits( &self, offset_referential: PyOffsetReferential, diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py index 75011725..2057d763 100644 --- a/bindings/python/tests/test_serialization.py +++ b/bindings/python/tests/test_serialization.py @@ -65,7 +65,7 @@ class TestFullDeserialization(unittest.TestCase): # all_models.append((model_id, filename)) all_models = [("HueyNemud/das22-10-camembert_pretrained", "tokenizer.json")] - for (model_id, filename) in tqdm.tqdm(all_models): + for model_id, filename in tqdm.tqdm(all_models): tokenizer_file = cached_download(hf_hub_url(model_id, filename=filename)) is_ok = check(tokenizer_file)
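
A minimal sketch of the migration pattern applied in every hunk above, condensed onto a hypothetical PyGreeting class (the class, its greet method, and the "Hello" default are illustrative only and do not appear in this patch): keyword defaults move out of the stringly-typed #[args(...)] attribute into #[pyo3(signature = (...))] as plain Rust expressions, and cast_as::<T>() calls are replaced by downcast::<T>(), exactly as done for the PyList and PyString casts in tokenizer.rs and trainers.rs.

use pyo3::prelude::*;
use pyo3::types::{PyDict, PyString};

// Hypothetical example type, not part of the applied diff; it only
// illustrates the two mechanical rewrites used throughout this patch.
#[pyclass]
struct PyGreeting {
    prefix: String,
}

#[pymethods]
impl PyGreeting {
    // pyo3 0.17 style: #[args(prefix = "String::from(\"Hello\")", _kwargs = "**")]
    // pyo3 0.18 style: every parameter is listed in `signature`, and defaults
    // are ordinary Rust expressions instead of quoted strings.
    #[new]
    #[pyo3(signature = (prefix = String::from("Hello"), **_kwargs))]
    fn new(prefix: String, _kwargs: Option<&PyDict>) -> Self {
        PyGreeting { prefix }
    }

    // Regular methods take the same attribute; `self` is not listed.
    #[pyo3(signature = (name, shout = false))]
    fn greet(&self, name: &PyAny, shout: bool) -> PyResult<String> {
        // `PyAny::cast_as::<PyString>()` becomes `downcast::<PyString>()`;
        // the resulting PyDowncastError converts into a PyErr via `?`.
        let name = name.downcast::<PyString>()?;
        let greeting = format!("{} {}", self.prefix, name.to_str()?);
        Ok(if shout { greeting.to_uppercase() } else { greeting })
    }
}

From Python, the defaults declared in the signature attribute behave as before, e.g. PyGreeting().greet("world") and PyGreeting(prefix="Hi").greet("world", shout=True); the change is purely in how the Rust side declares them.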