pyo3: update to 0.19 (#1322)

* Bump pyo3 dependency versions

* Fix deprecation warnings from pyo3

---------

Co-authored-by: Mike Lui <mikelui@meta.com>
Author: Michael Lui
Date: 2023-08-16 12:40:32 -04:00 (committed by GitHub)
Parent: 9a93c50c25
Commit: 540bf2eb01

13 changed files with 68 additions and 89 deletions
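
The change repeated throughout the hunks below is the pyo3 0.19 migration for constructor signatures: the `#[pyo3(text_signature = ...)]` attribute is no longer placed on the `#[pyclass]` struct but on the `#[new]` method, usually merged into its `#[pyo3(signature = ...)]` attribute, and set to `None` for kwargs-only constructors where no useful signature can be shown. A minimal sketch of the pattern follows; the `Example` class and its fields are illustrative only and not part of this commit.

use pyo3::prelude::*;

// Before (pyo3 0.18), a constructor's text_signature was attached to the struct:
//
//     #[pyclass(name = "Example")]
//     #[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")]
//     pub struct Example {}
//
// pyo3 0.19 deprecates that placement: the text_signature moves onto the
// #[new] method, next to the runtime signature (or `text_signature = None`
// for **kwargs-only constructors, as in the trainers below).
#[pyclass(name = "Example")]
pub struct Example {
    prefix: String,
    cleanup: bool,
}

#[pymethods]
impl Example {
    #[new]
    #[pyo3(
        signature = (prefix = String::from("##"), cleanup = true),
        text_signature = "(self, prefix=\"##\", cleanup=True)"
    )]
    fn new(prefix: String, cleanup: bool) -> Self {
        Example { prefix, cleanup }
    }
}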


@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.7.1"
-pyo3 = "0.18.1"
-numpy = "0.18.0"
+pyo3 = "0.19"
+numpy = "0.19.0"
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
 itertools = "0.9"
@@ -26,7 +26,7 @@ path = "../../tokenizers"
 [dev-dependencies]
 tempfile = "3.1"
-pyo3 = { version = "0.18.1", features = ["auto-initialize"] }
+pyo3 = { version = "0.19", features = ["auto-initialize"] }
 [features]
 default = ["pyo3/extension-module"]


@@ -155,12 +155,11 @@ macro_rules! setter {
 /// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
 /// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteLevel")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyByteLevelDec {}
 #[pymethods]
 impl PyByteLevelDec {
     #[new]
-    #[pyo3(signature = (**_kwargs))]
+    #[pyo3(signature = (**_kwargs), text_signature = "(self)")]
     fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) {
         (PyByteLevelDec {}, ByteLevel::default().into())
     }
@@ -171,11 +170,11 @@ impl PyByteLevelDec {
 /// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
 /// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Replace")]
-#[pyo3(text_signature = "(self, pattern, content)")]
 pub struct PyReplaceDec {}
 #[pymethods]
 impl PyReplaceDec {
     #[new]
+    #[pyo3(text_signature = "(self, pattern, content)")]
     fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyDecoder)> {
         Ok((
             PyReplaceDec {},
@@ -194,7 +193,6 @@ impl PyReplaceDec {
 /// Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
 /// and some abbreviated english forms.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "WordPiece")]
-#[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")]
 pub struct PyWordPieceDec {}
 #[pymethods]
 impl PyWordPieceDec {
@@ -219,7 +217,7 @@
     }
     #[new]
-    #[pyo3(signature = (prefix = String::from("##"), cleanup = true))]
+    #[pyo3(signature = (prefix = String::from("##"), cleanup = true), text_signature = "(self, prefix=\"##\", cleanup=True)")]
     fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) {
         (PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into())
     }
@@ -231,12 +229,11 @@ impl PyWordPieceDec {
 /// cannot be decoded you will get � instead for each inconvertable byte token
 ///
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyByteFallbackDec {}
 #[pymethods]
 impl PyByteFallbackDec {
     #[new]
-    #[pyo3(signature = ())]
+    #[pyo3(signature = (), text_signature = "(self)")]
     fn new() -> (Self, PyDecoder) {
         (PyByteFallbackDec {}, ByteFallback::new().into())
     }
@@ -247,12 +244,11 @@ impl PyByteFallbackDec {
 /// This is the last step of decoding, this decoder exists only if
 /// there is need to add other decoders *after* the fusion
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Fuse")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyFuseDec {}
 #[pymethods]
 impl PyFuseDec {
     #[new]
-    #[pyo3(signature = ())]
+    #[pyo3(signature = (), text_signature = "(self)")]
     fn new() -> (Self, PyDecoder) {
         (PyFuseDec {}, Fuse::new().into())
     }
@@ -261,7 +257,6 @@ impl PyFuseDec {
 /// Strip normalizer
 /// Strips n left characters of each token, or n right characters of each token
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Strip")]
-#[pyo3(text_signature = "(self, content, left=0, right=0)")]
 pub struct PyStrip {}
 #[pymethods]
 impl PyStrip {
@@ -296,7 +291,7 @@
     }
     #[new]
-    #[pyo3(signature = (content=' ', left=0, right=0))]
+    #[pyo3(signature = (content=' ', left=0, right=0), text_signature = "(self, content, left=0, right=0)")]
     fn new(content: char, left: usize, right: usize) -> (Self, PyDecoder) {
         (PyStrip {}, Strip::new(content, left, right).into())
     }
@@ -313,7 +308,6 @@ impl PyStrip {
 /// Whether to add a space to the first word if there isn't already one. This
 /// lets us treat `hello` exactly like `say hello`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")]
-#[pyo3(text_signature = "(self, replacement = \"\", add_prefix_space = True)")]
 pub struct PyMetaspaceDec {}
 #[pymethods]
 impl PyMetaspaceDec {
@@ -338,7 +332,7 @@
     }
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true))]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true), text_signature = "(self, replacement = \"\", add_prefix_space = True)")]
     fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) {
         (
             PyMetaspaceDec {},
@@ -354,7 +348,6 @@ impl PyMetaspaceDec {
 /// The suffix that was used to caracterize an end-of-word. This suffix will
 /// be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
-#[pyo3(text_signature = "(self, suffix=\"</w>\")")]
 pub struct PyBPEDecoder {}
 #[pymethods]
 impl PyBPEDecoder {
@@ -369,7 +362,7 @@
     }
     #[new]
-    #[pyo3(signature = (suffix = String::from("</w>")))]
+    #[pyo3(signature = (suffix = String::from("</w>")), text_signature = "(self, suffix=\"</w>\")")]
     fn new(suffix: String) -> (Self, PyDecoder) {
         (PyBPEDecoder {}, BPEDecoder::new(suffix).into())
     }
@@ -386,7 +379,6 @@ impl PyBPEDecoder {
 /// Whether to cleanup some tokenization artifacts.
 /// Mainly spaces before punctuation, and some abbreviated english forms.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "CTC")]
-#[pyo3(text_signature = "(self, pad_token=\"<pad>\", word_delimiter_token=\"|\", cleanup=True)")]
 pub struct PyCTCDecoder {}
 #[pymethods]
 impl PyCTCDecoder {
@@ -425,7 +417,8 @@
         pad_token = String::from("<pad>"),
         word_delimiter_token = String::from("|"),
         cleanup = true
-    ))]
+    ),
+    text_signature = "(self, pad_token=\"<pad>\", word_delimiter_token=\"|\", cleanup=True)")]
     fn new(pad_token: String, word_delimiter_token: String, cleanup: bool) -> (Self, PyDecoder) {
         (
             PyCTCDecoder {},
@@ -440,12 +433,11 @@ impl PyCTCDecoder {
 /// decoders (:obj:`List[Decoder]`)
 /// The decoders that need to be chained
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name="Sequence")]
-#[pyo3(text_signature = "(self, decoders)")]
 pub struct PySequenceDecoder {}
 #[pymethods]
 impl PySequenceDecoder {
     #[new]
-    #[pyo3(signature = (decoders_py))]
+    #[pyo3(signature = (decoders_py), text_signature = "(self, decoders)")]
     fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> {
         let mut decoders: Vec<DecoderWrapper> = Vec::with_capacity(decoders_py.len());
         for decoder_py in decoders_py.iter() {


@@ -23,6 +23,7 @@ impl From<tk::tokenizer::Encoding> for PyEncoding {
 #[pymethods]
 impl PyEncoding {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new() -> Self {
         Self {
             encoding: tk::tokenizer::Encoding::default(),


@@ -89,6 +89,7 @@ where
 #[pymethods]
 impl PyModel {
     #[new]
+    #[pyo3(text_signature = None)]
     fn __new__() -> Self {
         // Instantiate a default empty model. This doesn't really make sense, but we need
         // to be able to instantiate an empty model for pickle capabilities.
@@ -253,9 +254,6 @@ impl PyModel {
 /// byte_fallback (:obj:`bool`, `optional`):
 /// Whether to use spm byte-fallback trick (defaults to False)
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")]
-#[pyo3(
-    text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)"
-)]
 pub struct PyBPE {}
 impl PyBPE {
@@ -400,7 +398,9 @@ impl PyBPE {
     }
     #[new]
-    #[pyo3(signature = (vocab=None, merges=None, **kwargs))]
+    #[pyo3(
+        signature = (vocab=None, merges=None, **kwargs),
+        text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -523,7 +523,6 @@ impl PyBPE {
 /// max_input_chars_per_word (:obj:`int`, `optional`):
 /// The maximum number of characters to authorize in a single word.
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordPiece")]
-#[pyo3(text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
 pub struct PyWordPiece {}
 impl PyWordPiece {
@@ -597,7 +596,7 @@ impl PyWordPiece {
     }
     #[new]
-    #[pyo3(signature = (vocab=None, **kwargs))]
+    #[pyo3(signature = (vocab=None, **kwargs), text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -692,7 +691,6 @@ impl PyWordPiece {
 /// unk_token (:obj:`str`, `optional`):
 /// The unknown token to be used by the model.
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordLevel")]
-#[pyo3(text_signature = "(self, vocab, unk_token)")]
 pub struct PyWordLevel {}
 #[pymethods]
@@ -708,7 +706,7 @@ impl PyWordLevel {
     }
     #[new]
-    #[pyo3(signature = (vocab=None, unk_token = None))]
+    #[pyo3(signature = (vocab=None, unk_token = None), text_signature = "(self, vocab, unk_token)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -807,12 +805,12 @@ impl PyWordLevel {
 /// vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
 /// A list of vocabulary items and their relative score [("am", -0.2442),...]
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")]
-#[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
 pub struct PyUnigram {}
 #[pymethods]
 impl PyUnigram {
     #[new]
+    #[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
     fn new(
         vocab: Option<Vec<(String, f64)>>,
         unk_id: Option<usize>,


@@ -217,9 +217,6 @@ macro_rules! setter {
 /// lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
 /// Whether to lowercase.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")]
-#[pyo3(
-    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"
-)]
 pub struct PyBertNormalizer {}
 #[pymethods]
 impl PyBertNormalizer {
@@ -274,7 +271,8 @@ impl PyBertNormalizer {
         handle_chinese_chars = true,
         strip_accents = None,
         lowercase = true
-    ))]
+    ),
+    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)")]
     fn new(
         clean_text: bool,
         handle_chinese_chars: bool,
@@ -289,11 +287,11 @@ impl PyBertNormalizer {
 /// NFD Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFD")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFD {}
 #[pymethods]
 impl PyNFD {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFD {}, PyNormalizer::new(NFD.into()))
     }
@@ -301,11 +299,11 @@ impl PyNFD {
 /// NFKD Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKD")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFKD {}
 #[pymethods]
 impl PyNFKD {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFKD {}, NFKD.into())
     }
@@ -313,11 +311,11 @@ impl PyNFKD {
 /// NFC Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFC")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFC {}
 #[pymethods]
 impl PyNFC {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFC {}, NFC.into())
     }
@@ -325,11 +323,11 @@ impl PyNFC {
 /// NFKC Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKC")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFKC {}
 #[pymethods]
 impl PyNFKC {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFKC {}, NFKC.into())
     }
@@ -346,6 +344,7 @@ pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> {
         let mut sequence = Vec::with_capacity(normalizers.len());
         for n in normalizers.iter() {
@@ -372,11 +371,11 @@ impl PySequence {
 /// Lowercase Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Lowercase")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyLowercase {}
 #[pymethods]
 impl PyLowercase {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyLowercase {}, Lowercase.into())
     }
@@ -384,7 +383,6 @@ impl PyLowercase {
 /// Strip normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Strip")]
-#[pyo3(text_signature = "(self, left=True, right=True)")]
 pub struct PyStrip {}
 #[pymethods]
 impl PyStrip {
@@ -409,7 +407,7 @@ impl PyStrip {
     }
     #[new]
-    #[pyo3(signature = (left = true, right = true))]
+    #[pyo3(signature = (left = true, right = true), text_signature = "(self, left=True, right=True)")]
     fn new(left: bool, right: bool) -> (Self, PyNormalizer) {
         (PyStrip {}, Strip::new(left, right).into())
     }
@@ -417,7 +415,6 @@ impl PyStrip {
 /// Prepend normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Prepend")]
-#[pyo3(text_signature = "(self, prepend)")]
 pub struct PyPrepend {}
 #[pymethods]
 impl PyPrepend {
@@ -432,7 +429,7 @@ impl PyPrepend {
     }
     #[new]
-    #[pyo3(signature = (prepend="".to_string()))]
+    #[pyo3(signature = (prepend="".to_string()), text_signature = "(self, prepend)")]
     fn new(prepend: String) -> (Self, PyNormalizer) {
         (PyPrepend {}, Prepend::new(prepend).into())
     }
@@ -440,11 +437,11 @@ impl PyPrepend {
 /// StripAccents normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyStripAccents {}
 #[pymethods]
 impl PyStripAccents {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyStripAccents {}, StripAccents.into())
     }
@@ -452,11 +449,11 @@ impl PyStripAccents {
 /// Nmt normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Nmt")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNmt {}
 #[pymethods]
 impl PyNmt {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNmt {}, Nmt.into())
     }
@@ -465,11 +462,11 @@ impl PyNmt {
 /// Precompiled normalizer
 /// Don't use manually it is used for compatiblity for SentencePiece.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
-#[pyo3(text_signature = "(self, precompiled_charsmap)")]
 pub struct PyPrecompiled {}
 #[pymethods]
 impl PyPrecompiled {
     #[new]
+    #[pyo3(text_signature = "(self, precompiled_charsmap)")]
     fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
         let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
         Ok((
@@ -488,11 +485,11 @@ impl PyPrecompiled {
 /// Replace normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Replace")]
-#[pyo3(text_signature = "(self, pattern, content)")]
 pub struct PyReplace {}
 #[pymethods]
 impl PyReplace {
     #[new]
+    #[pyo3(text_signature = "(self, pattern, content)")]
     fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyNormalizer)> {
         Ok((
             PyReplace {},


@@ -235,7 +235,6 @@ macro_rules! setter {
 /// Set this to :obj:`False` to prevent this `pre_tokenizer` from using
 /// the GPT2 specific regexp for spliting on whitespace.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "ByteLevel")]
-#[pyo3(text_signature = "(self, add_prefix_space=True, use_regex=True)")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
@@ -260,7 +259,7 @@ impl PyByteLevel {
     }
     #[new]
-    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs))]
+    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs), text_signature = "(self, add_prefix_space=True, use_regex=True)")]
     fn new(
         add_prefix_space: bool,
         use_regex: bool,
@@ -295,11 +294,11 @@ impl PyByteLevel {
 /// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Whitespace")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyWhitespace {}
 #[pymethods]
 impl PyWhitespace {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyWhitespace {}, Whitespace {}.into())
     }
@@ -307,11 +306,11 @@ impl PyWhitespace {
 /// This pre-tokenizer simply splits on the whitespace. Works like `.split()`
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "WhitespaceSplit")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyWhitespaceSplit {}
 #[pymethods]
 impl PyWhitespaceSplit {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyWhitespaceSplit {}, WhitespaceSplit.into())
     }
@@ -335,12 +334,11 @@ impl PyWhitespaceSplit {
 /// invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
 /// Whether to invert the pattern.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Split")]
-#[pyo3(text_signature = "(self, pattern, behavior, invert=False)")]
 pub struct PySplit {}
 #[pymethods]
 impl PySplit {
     #[new]
-    #[pyo3(signature = (pattern, behavior, invert = false))]
+    #[pyo3(signature = (pattern, behavior, invert = false), text_signature = "(self, pattern, behavior, invert=False)")]
     fn new(
         pattern: PyPattern,
         behavior: PySplitDelimiterBehavior,
@@ -379,6 +377,7 @@ impl PyCharDelimiterSplit {
     }
     #[new]
+    #[pyo3(text_signature = None)]
     pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyCharDelimiterSplit {},
@@ -396,11 +395,11 @@ impl PyCharDelimiterSplit {
 /// This pre-tokenizer splits tokens on spaces, and also on punctuation.
 /// Each occurence of a punctuation character will be treated separately.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyBertPreTokenizer {}
 #[pymethods]
 impl PyBertPreTokenizer {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyBertPreTokenizer {}, BertPreTokenizer.into())
     }
@@ -414,12 +413,11 @@ impl PyBertPreTokenizer {
 /// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
 /// "contiguous"
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Punctuation")]
-#[pyo3(text_signature = "(self, behavior=\"isolated\")")]
 pub struct PyPunctuation {}
 #[pymethods]
 impl PyPunctuation {
     #[new]
-    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)))]
+    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)), text_signature = "(self, behavior=\"isolated\")")]
     fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
         (PyPunctuation {}, Punctuation::new(behavior.into()).into())
     }
@@ -427,11 +425,11 @@ impl PyPunctuation {
 /// This pre-tokenizer composes other pre_tokenizers and applies them in sequence
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Sequence")]
-#[pyo3(text_signature = "(self, pretokenizers)")]
 pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
+    #[pyo3(text_signature = "(self, pretokenizers)")]
     fn new(pre_tokenizers: &PyList) -> PyResult<(Self, PyPreTokenizer)> {
         let mut sequence = Vec::with_capacity(pre_tokenizers.len());
         for n in pre_tokenizers.iter() {
@@ -468,7 +466,6 @@ impl PySequence {
 /// Whether to add a space to the first word if there isn't already one. This
 /// lets us treat `hello` exactly like `say hello`.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")]
-#[pyo3(text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
 pub struct PyMetaspace {}
 #[pymethods]
 impl PyMetaspace {
@@ -493,7 +490,7 @@ impl PyMetaspace {
     }
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs))]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
     fn new(
         replacement: PyChar,
         add_prefix_space: bool,
@@ -518,7 +515,6 @@ impl PyMetaspace {
 ///
 /// "Call 123 please" -> "Call ", "123", " please"
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Digits")]
-#[pyo3(text_signature = "(self, individual_digits=False)")]
 pub struct PyDigits {}
 #[pymethods]
 impl PyDigits {
@@ -533,7 +529,7 @@ impl PyDigits {
     }
     #[new]
-    #[pyo3(signature = (individual_digits = false))]
+    #[pyo3(signature = (individual_digits = false), text_signature = "(self, individual_digits=False)")]
     fn new(individual_digits: bool) -> (Self, PyPreTokenizer) {
         (PyDigits {}, Digits::new(individual_digits).into())
     }
@@ -544,11 +540,11 @@ impl PyDigits {
 /// Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
 /// This mimicks SentencePiece Unigram implementation.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "UnicodeScripts")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyUnicodeScripts {}
 #[pymethods]
 impl PyUnicodeScripts {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyUnicodeScripts {}, UnicodeScripts::new().into())
     }


@@ -154,11 +154,11 @@ impl PyPostProcessor {
 /// cls (:obj:`Tuple[str, int]`):
 /// A tuple with the string representation of the CLS token, and its id
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "BertProcessing")]
-#[pyo3(text_signature = "(self, sep, cls)")]
 pub struct PyBertProcessing {}
 #[pymethods]
 impl PyBertProcessing {
     #[new]
+    #[pyo3(text_signature = "(self, sep, cls)")]
     fn new(sep: (String, u32), cls: (String, u32)) -> (Self, PyPostProcessor) {
         (
             PyBertProcessing {},
@@ -196,12 +196,11 @@ impl PyBertProcessing {
 /// Whether the add_prefix_space option was enabled during pre-tokenization. This
 /// is relevant because it defines the way the offsets are trimmed out.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "RobertaProcessing")]
-#[pyo3(text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")]
 pub struct PyRobertaProcessing {}
 #[pymethods]
 impl PyRobertaProcessing {
     #[new]
-    #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true))]
+    #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true), text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")]
     fn new(
         sep: (String, u32),
         cls: (String, u32),
@@ -231,12 +230,11 @@ impl PyRobertaProcessing {
 /// trim_offsets (:obj:`bool`):
 /// Whether to trim the whitespaces from the produced offsets.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
-#[pyo3(text_signature = "(self, trim_offsets=True)")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[pyo3(signature = (trim_offsets = None, **_kwargs))]
+    #[pyo3(signature = (trim_offsets = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
     fn new(trim_offsets: Option<bool>, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) {
         let mut byte_level = ByteLevel::default();
@@ -383,12 +381,11 @@ impl FromPyObject<'_> for PyTemplate {
 /// The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
 /// the same length.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "TemplateProcessing")]
-#[pyo3(text_signature = "(self, single, pair, special_tokens)")]
 pub struct PyTemplateProcessing {}
 #[pymethods]
 impl PyTemplateProcessing {
     #[new]
-    #[pyo3(signature = (single = None, pair = None, special_tokens = None))]
+    #[pyo3(signature = (single = None, pair = None, special_tokens = None), text_signature = "(self, single, pair, special_tokens)")]
     fn new(
         single: Option<PyTemplate>,
         pair: Option<PyTemplate>,
@@ -422,12 +419,11 @@ impl PyTemplateProcessing {
 /// processors (:obj:`List[PostProcessor]`)
 /// The processors that need to be chained
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "Sequence")]
-#[pyo3(text_signature = "(self, processors)")]
 pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
-    #[pyo3(signature = (processors_py))]
+    #[pyo3(signature = (processors_py), text_signature = "(self, processors)")]
     fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
         let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
         for n in processors_py.iter() {


@@ -20,6 +20,7 @@ impl From<PyToken> for Token {
 #[pymethods]
 impl PyToken {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(id: u32, value: String, offsets: (usize, usize)) -> PyToken {
         Token::new(id, value, offsets).into()
     }


@@ -56,9 +56,6 @@ use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
 /// Yesterday"``.
 ///
 #[pyclass(dict, module = "tokenizers", name = "AddedToken")]
-#[pyo3(
-    text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"
-)]
 pub struct PyAddedToken {
     pub content: String,
     pub is_special_token: bool,
@@ -128,7 +125,7 @@ impl From<tk::AddedToken> for PyAddedToken {
 #[pymethods]
 impl PyAddedToken {
     #[new]
-    #[pyo3(signature = (content=None, **kwargs))]
+    #[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")]
     fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
         let mut token = PyAddedToken::from(content.unwrap_or(""), None);
@@ -441,7 +438,6 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
 /// The core algorithm that this :obj:`Tokenizer` should be using.
 ///
 #[pyclass(dict, module = "tokenizers", name = "Tokenizer")]
-#[pyo3(text_signature = "(self, model)")]
 #[derive(Clone)]
 pub struct PyTokenizer {
     tokenizer: Tokenizer,
@@ -460,6 +456,7 @@ impl PyTokenizer {
 #[pymethods]
 impl PyTokenizer {
     #[new]
+    #[pyo3(text_signature = "(self, model)")]
     fn __new__(model: PyRef<PyModel>) -> Self {
         PyTokenizer::from_model(model.clone())
     }


@@ -299,7 +299,7 @@ impl PyBpeTrainer {
     }
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(signature = (**kwargs), text_signature = None)]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -383,9 +383,6 @@ impl PyBpeTrainer {
 /// end_of_word_suffix (:obj:`str`, `optional`):
 /// A suffix to be used for every subword that is a end-of-word.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
-#[pyo3(
-    text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
-)]
 pub struct PyWordPieceTrainer {}
 #[pymethods]
 impl PyWordPieceTrainer {
@@ -506,7 +503,10 @@ impl PyWordPieceTrainer {
     }
     #[new]
-    #[pyo3(signature = (** kwargs))]
+    #[pyo3(
+        signature = (** kwargs),
+        text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
+    )]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -646,7 +646,7 @@ impl PyWordLevelTrainer {
     }
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(signature = (**kwargs), text_signature = None)]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
@@ -731,9 +731,6 @@ impl PyWordLevelTrainer {
 /// The number of iterations of the EM algorithm to perform before
 /// pruning the vocabulary.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
-#[pyo3(
-    text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
-)]
 pub struct PyUnigramTrainer {}
 #[pymethods]
 impl PyUnigramTrainer {
@@ -814,7 +811,10 @@ impl PyUnigramTrainer {
     }
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(
+        signature = (**kwargs),
+        text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
+    )]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
         if let Some(kwargs) = kwargs {


@@ -200,6 +200,7 @@ pub struct PyNormalizedString {
 #[pymethods]
 impl PyNormalizedString {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(s: &str) -> Self {
         NormalizedString::from(s).into()
     }


@@ -148,7 +148,6 @@ fn to_encoding(
 /// sequence: str:
 /// The string sequence used to initialize this PreTokenizedString
 #[pyclass(module = "tokenizers", name = "PreTokenizedString")]
-#[pyo3(text_signature = "(self, sequence)")]
 pub struct PyPreTokenizedString {
     pub(crate) pretok: tk::PreTokenizedString,
 }
@@ -168,6 +167,7 @@ impl From<PyPreTokenizedString> for PreTokenizedString {
 #[pymethods]
 impl PyPreTokenizedString {
     #[new]
+    #[pyo3(text_signature = "(self, sequence)")]
     fn new(s: &str) -> Self {
         PreTokenizedString::from(s).into()
     }


@@ -4,7 +4,6 @@ use pyo3::prelude::*;
 /// Instantiate a new Regex with the given pattern
 #[pyclass(module = "tokenizers", name = "Regex")]
-#[pyo3(text_signature = "(self, pattern)")]
 pub struct PyRegex {
     pub inner: Regex,
     pub pattern: String,
@@ -13,6 +12,7 @@ pub struct PyRegex {
 #[pymethods]
 impl PyRegex {
     #[new]
+    #[pyo3(text_signature = "(self, pattern)")]
     fn new(s: &str) -> PyResult<Self> {
         Ok(Self {
             inner: Regex::new(s)