diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index fcd15d28..c69a1252 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
serde_json = "1.0"
libc = "0.2"
env_logger = "0.7.1"
-pyo3 = "0.18.1"
-numpy = "0.18.0"
+pyo3 = "0.19"
+numpy = "0.19.0"
ndarray = "0.13"
onig = { version = "6.0", default-features = false }
itertools = "0.9"
@@ -26,7 +26,7 @@ path = "../../tokenizers"
[dev-dependencies]
tempfile = "3.1"
-pyo3 = { version = "0.18.1", features = ["auto-initialize"] }
+pyo3 = { version = "0.19", features = ["auto-initialize"] }
[features]
default = ["pyo3/extension-module"]
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index f6e0388c..1ba054f5 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -155,12 +155,11 @@ macro_rules! setter {
/// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
/// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteLevel")]
-#[pyo3(text_signature = "(self)")]
pub struct PyByteLevelDec {}
#[pymethods]
impl PyByteLevelDec {
    #[new]
-    #[pyo3(signature = (**_kwargs))]
+    #[pyo3(signature = (**_kwargs), text_signature = "(self)")]
    fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) {
        (PyByteLevelDec {}, ByteLevel::default().into())
    }
@@ -171,11 +170,11 @@ impl PyByteLevelDec {
/// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
/// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Replace")]
-#[pyo3(text_signature = "(self, pattern, content)")]
pub struct PyReplaceDec {}
#[pymethods]
impl PyReplaceDec {
    #[new]
+    #[pyo3(text_signature = "(self, pattern, content)")]
    fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyDecoder)> {
        Ok((
            PyReplaceDec {},
@@ -194,7 +193,6 @@ impl PyReplaceDec {
/// Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
/// and some abbreviated english forms.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "WordPiece")] -#[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")] pub struct PyWordPieceDec {} #[pymethods] impl PyWordPieceDec { @@ -219,7 +217,7 @@ impl PyWordPieceDec { } #[new] - #[pyo3(signature = (prefix = String::from("##"), cleanup = true))] + #[pyo3(signature = (prefix = String::from("##"), cleanup = true), text_signature = "(self, prefix=\"##\", cleanup=True)")] fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) { (PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into()) } @@ -231,12 +229,11 @@ impl PyWordPieceDec { /// cannot be decoded you will get � instead for each inconvertable byte token /// #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")] -#[pyo3(text_signature = "(self)")] pub struct PyByteFallbackDec {} #[pymethods] impl PyByteFallbackDec { #[new] - #[pyo3(signature = ())] + #[pyo3(signature = (), text_signature = "(self)")] fn new() -> (Self, PyDecoder) { (PyByteFallbackDec {}, ByteFallback::new().into()) } @@ -247,12 +244,11 @@ impl PyByteFallbackDec { /// This is the last step of decoding, this decoder exists only if /// there is need to add other decoders *after* the fusion #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Fuse")] -#[pyo3(text_signature = "(self)")] pub struct PyFuseDec {} #[pymethods] impl PyFuseDec { #[new] - #[pyo3(signature = ())] + #[pyo3(signature = (), text_signature = "(self)")] fn new() -> (Self, PyDecoder) { (PyFuseDec {}, Fuse::new().into()) } @@ -261,7 +257,6 @@ impl PyFuseDec { /// Strip normalizer /// Strips n left characters of each token, or n right characters of each token #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Strip")] -#[pyo3(text_signature = "(self, content, left=0, right=0)")] pub struct PyStrip {} #[pymethods] impl PyStrip { @@ -296,7 +291,7 @@ impl PyStrip { } #[new] - #[pyo3(signature = (content=' ', left=0, right=0))] + #[pyo3(signature = (content=' ', left=0, right=0), text_signature = "(self, content, left=0, right=0)")] fn new(content: char, left: usize, right: usize) -> (Self, PyDecoder) { (PyStrip {}, Strip::new(content, left, right).into()) } @@ -313,7 +308,6 @@ impl PyStrip { /// Whether to add a space to the first word if there isn't already one. This /// lets us treat `hello` exactly like `say hello`. #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")] -#[pyo3(text_signature = "(self, replacement = \"▁\", add_prefix_space = True)")] pub struct PyMetaspaceDec {} #[pymethods] impl PyMetaspaceDec { @@ -338,7 +332,7 @@ impl PyMetaspaceDec { } #[new] - #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true))] + #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true), text_signature = "(self, replacement = \"▁\", add_prefix_space = True)")] fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) { ( PyMetaspaceDec {}, @@ -354,7 +348,6 @@ impl PyMetaspaceDec { /// The suffix that was used to caracterize an end-of-word. 
/// be replaced by whitespaces during the decoding
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
-#[pyo3(text_signature = "(self, suffix=\"\")")]
pub struct PyBPEDecoder {}
#[pymethods]
impl PyBPEDecoder {
@@ -369,7 +362,7 @@ impl PyBPEDecoder {
    }
    #[new]
-    #[pyo3(signature = (suffix = String::from("")))]
+    #[pyo3(signature = (suffix = String::from("")), text_signature = "(self, suffix=\"\")")]
    fn new(suffix: String) -> (Self, PyDecoder) {
        (PyBPEDecoder {}, BPEDecoder::new(suffix).into())
    }
@@ -386,7 +379,6 @@ impl PyBPEDecoder {
/// Whether to cleanup some tokenization artifacts.
/// Mainly spaces before punctuation, and some abbreviated english forms.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "CTC")]
-#[pyo3(text_signature = "(self, pad_token=\"\", word_delimiter_token=\"|\", cleanup=True)")]
pub struct PyCTCDecoder {}
#[pymethods]
impl PyCTCDecoder {
@@ -425,7 +417,8 @@ impl PyCTCDecoder {
        pad_token = String::from(""),
        word_delimiter_token = String::from("|"),
        cleanup = true
-    ))]
+    ),
+    text_signature = "(self, pad_token=\"\", word_delimiter_token=\"|\", cleanup=True)")]
    fn new(pad_token: String, word_delimiter_token: String, cleanup: bool) -> (Self, PyDecoder) {
        (
            PyCTCDecoder {},
@@ -440,12 +433,11 @@ impl PyCTCDecoder {
/// decoders (:obj:`List[Decoder]`)
/// The decoders that need to be chained
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name="Sequence")]
-#[pyo3(text_signature = "(self, decoders)")]
pub struct PySequenceDecoder {}
#[pymethods]
impl PySequenceDecoder {
    #[new]
-    #[pyo3(signature = (decoders_py))]
+    #[pyo3(signature = (decoders_py), text_signature = "(self, decoders)")]
    fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> {
        let mut decoders: Vec = Vec::with_capacity(decoders_py.len());
        for decoder_py in decoders_py.iter() {
diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs
index 4cbd65b8..1ee3aa06 100644
--- a/bindings/python/src/encoding.rs
+++ b/bindings/python/src/encoding.rs
@@ -23,6 +23,7 @@ impl From for PyEncoding {
#[pymethods]
impl PyEncoding {
    #[new]
+    #[pyo3(text_signature = None)]
    fn new() -> Self {
        Self {
            encoding: tk::tokenizer::Encoding::default(),
diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs
index 093d9c4c..0c7bafe4 100644
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@@ -89,6 +89,7 @@ where
#[pymethods]
impl PyModel {
    #[new]
+    #[pyo3(text_signature = None)]
    fn __new__() -> Self {
        // Instantiate a default empty model. This doesn't really make sense, but we need
        // to be able to instantiate an empty model for pickle capabilities.
@@ -253,9 +254,6 @@ impl PyModel {
/// byte_fallback (:obj:`bool`, `optional`):
/// Whether to use spm byte-fallback trick (defaults to False)
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")]
-#[pyo3(
-    text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)"
-)]
pub struct PyBPE {}
impl PyBPE {
@@ -400,7 +398,9 @@ impl PyBPE {
    }
    #[new]
-    #[pyo3(signature = (vocab=None, merges=None, **kwargs))]
+    #[pyo3(
+        signature = (vocab=None, merges=None, **kwargs),
+        text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)")]
    fn new(
        py: Python<'_>,
        vocab: Option,
@@ -523,7 +523,6 @@ impl PyBPE {
/// max_input_chars_per_word (:obj:`int`, `optional`):
/// The maximum number of characters to authorize in a single word.
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordPiece")]
-#[pyo3(text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
pub struct PyWordPiece {}
impl PyWordPiece {
@@ -597,7 +596,7 @@ impl PyWordPiece {
    }
    #[new]
-    #[pyo3(signature = (vocab=None, **kwargs))]
+    #[pyo3(signature = (vocab=None, **kwargs), text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
    fn new(
        py: Python<'_>,
        vocab: Option,
@@ -692,7 +691,6 @@ impl PyWordPiece {
/// unk_token (:obj:`str`, `optional`):
/// The unknown token to be used by the model.
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordLevel")]
-#[pyo3(text_signature = "(self, vocab, unk_token)")]
pub struct PyWordLevel {}
#[pymethods]
@@ -708,7 +706,7 @@ impl PyWordLevel {
    }
    #[new]
-    #[pyo3(signature = (vocab=None, unk_token = None))]
+    #[pyo3(signature = (vocab=None, unk_token = None), text_signature = "(self, vocab, unk_token)")]
    fn new(
        py: Python<'_>,
        vocab: Option,
@@ -807,12 +805,12 @@ impl PyWordLevel {
/// vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
/// A list of vocabulary items and their relative score [("am", -0.2442),...]
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")]
-#[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
pub struct PyUnigram {}
#[pymethods]
impl PyUnigram {
    #[new]
+    #[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
    fn new(
        vocab: Option>,
        unk_id: Option,
diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index 442a01c1..954ee5aa 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -217,9 +217,6 @@ macro_rules! setter {
/// lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
/// Whether to lowercase.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")]
-#[pyo3(
-    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"
-)]
pub struct PyBertNormalizer {}
#[pymethods]
impl PyBertNormalizer {
@@ -274,7 +271,8 @@ impl PyBertNormalizer {
        handle_chinese_chars = true,
        strip_accents = None,
        lowercase = true
-    ))]
+    ),
+    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)")]
    fn new(
        clean_text: bool,
        handle_chinese_chars: bool,
@@ -289,11 +287,11 @@ impl PyBertNormalizer {
/// NFD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFD")]
-#[pyo3(text_signature = "(self)")]
pub struct PyNFD {}
#[pymethods]
impl PyNFD {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyNormalizer) {
        (PyNFD {}, PyNormalizer::new(NFD.into()))
    }
@@ -301,11 +299,11 @@ impl PyNFD {
/// NFKD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKD")]
-#[pyo3(text_signature = "(self)")]
pub struct PyNFKD {}
#[pymethods]
impl PyNFKD {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyNormalizer) {
        (PyNFKD {}, NFKD.into())
    }
@@ -313,11 +311,11 @@ impl PyNFKD {
/// NFC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFC")]
-#[pyo3(text_signature = "(self)")]
pub struct PyNFC {}
#[pymethods]
impl PyNFC {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyNormalizer) {
        (PyNFC {}, NFC.into())
    }
@@ -325,11 +323,11 @@ impl PyNFC {
/// NFKC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKC")]
-#[pyo3(text_signature = "(self)")]
pub struct PyNFKC {}
#[pymethods]
impl PyNFKC {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyNormalizer) {
        (PyNFKC {}, NFKC.into())
    }
@@ -346,6 +344,7 @@ pub struct PySequence {}
#[pymethods]
impl PySequence {
    #[new]
+    #[pyo3(text_signature = None)]
    fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> {
        let mut sequence = Vec::with_capacity(normalizers.len());
        for n in normalizers.iter() {
@@ -372,11 +371,11 @@ impl PySequence {
/// Lowercase Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Lowercase")]
-#[pyo3(text_signature = "(self)")]
pub struct PyLowercase {}
#[pymethods]
impl PyLowercase {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyNormalizer) {
        (PyLowercase {}, Lowercase.into())
    }
@@ -384,7 +383,6 @@ impl PyLowercase {
/// Strip normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Strip")]
-#[pyo3(text_signature = "(self, left=True, right=True)")]
pub struct PyStrip {}
#[pymethods]
impl PyStrip {
@@ -409,7 +407,7 @@ impl PyStrip {
    }
    #[new]
-    #[pyo3(signature = (left = true, right = true))]
+    #[pyo3(signature = (left = true, right = true), text_signature = "(self, left=True, right=True)")]
    fn new(left: bool, right: bool) -> (Self, PyNormalizer) {
        (PyStrip {}, Strip::new(left, right).into())
    }
@@ -417,7 +415,6 @@ impl PyStrip {
/// Prepend normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Prepend")]
-#[pyo3(text_signature = "(self, prepend)")]
pub struct PyPrepend {}
#[pymethods]
impl PyPrepend {
@@ -432,7 +429,7 @@ impl PyPrepend {
    }
    #[new]
-    #[pyo3(signature = (prepend="▁".to_string()))]
+    #[pyo3(signature = (prepend="▁".to_string()), text_signature = "(self, prepend)")]
    fn new(prepend: String) -> (Self, PyNormalizer) {
        (PyPrepend {}, Prepend::new(prepend).into())
    }
@@ -440,11 +437,11 @@ impl PyPrepend {
/// StripAccents normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")]
-#[pyo3(text_signature = "(self)")]
pub struct PyStripAccents {}
#[pymethods]
impl PyStripAccents {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyNormalizer) {
        (PyStripAccents {}, StripAccents.into())
    }
@@ -452,11 +449,11 @@ impl PyStripAccents {
/// Nmt normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Nmt")]
-#[pyo3(text_signature = "(self)")]
pub struct PyNmt {}
#[pymethods]
impl PyNmt {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyNormalizer) {
        (PyNmt {}, Nmt.into())
    }
@@ -465,11 +462,11 @@ impl PyNmt {
/// Precompiled normalizer
/// Don't use manually it is used for compatiblity for SentencePiece.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
-#[pyo3(text_signature = "(self, precompiled_charsmap)")]
pub struct PyPrecompiled {}
#[pymethods]
impl PyPrecompiled {
    #[new]
+    #[pyo3(text_signature = "(self, precompiled_charsmap)")]
    fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
        let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
        Ok((
@@ -488,11 +485,11 @@ impl PyPrecompiled {
/// Replace normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Replace")]
-#[pyo3(text_signature = "(self, pattern, content)")]
pub struct PyReplace {}
#[pymethods]
impl PyReplace {
    #[new]
+    #[pyo3(text_signature = "(self, pattern, content)")]
    fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyNormalizer)> {
        Ok((
            PyReplace {},
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 98f36a95..699d0bbd 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -235,7 +235,6 @@ macro_rules! setter {
/// Set this to :obj:`False` to prevent this `pre_tokenizer` from using
/// the GPT2 specific regexp for spliting on whitespace.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "ByteLevel")]
-#[pyo3(text_signature = "(self, add_prefix_space=True, use_regex=True)")]
pub struct PyByteLevel {}
#[pymethods]
impl PyByteLevel {
@@ -260,7 +259,7 @@ impl PyByteLevel {
    }
    #[new]
-    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs))]
+    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs), text_signature = "(self, add_prefix_space=True, use_regex=True)")]
    fn new(
        add_prefix_space: bool,
        use_regex: bool,
@@ -295,11 +294,11 @@ impl PyByteLevel {
/// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Whitespace")]
-#[pyo3(text_signature = "(self)")]
pub struct PyWhitespace {}
#[pymethods]
impl PyWhitespace {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyPreTokenizer) {
        (PyWhitespace {}, Whitespace {}.into())
    }
@@ -307,11 +306,11 @@ impl PyWhitespace {
/// This pre-tokenizer simply splits on the whitespace. Works like `.split()`
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "WhitespaceSplit")]
-#[pyo3(text_signature = "(self)")]
pub struct PyWhitespaceSplit {}
#[pymethods]
impl PyWhitespaceSplit {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyPreTokenizer) {
        (PyWhitespaceSplit {}, WhitespaceSplit.into())
    }
@@ -335,12 +334,11 @@ impl PyWhitespaceSplit {
/// invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
/// Whether to invert the pattern.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Split")]
-#[pyo3(text_signature = "(self, pattern, behavior, invert=False)")]
pub struct PySplit {}
#[pymethods]
impl PySplit {
    #[new]
-    #[pyo3(signature = (pattern, behavior, invert = false))]
+    #[pyo3(signature = (pattern, behavior, invert = false), text_signature = "(self, pattern, behavior, invert=False)")]
    fn new(
        pattern: PyPattern,
        behavior: PySplitDelimiterBehavior,
@@ -379,6 +377,7 @@ impl PyCharDelimiterSplit {
    }
    #[new]
+    #[pyo3(text_signature = None)]
    pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
        Ok((
            PyCharDelimiterSplit {},
@@ -396,11 +395,11 @@ impl PyCharDelimiterSplit {
/// This pre-tokenizer splits tokens on spaces, and also on punctuation.
/// Each occurence of a punctuation character will be treated separately.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
-#[pyo3(text_signature = "(self)")]
pub struct PyBertPreTokenizer {}
#[pymethods]
impl PyBertPreTokenizer {
    #[new]
+    #[pyo3(text_signature = "(self)")]
    fn new() -> (Self, PyPreTokenizer) {
        (PyBertPreTokenizer {}, BertPreTokenizer.into())
    }
@@ -414,12 +413,11 @@ impl PyBertPreTokenizer {
/// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
/// "contiguous"
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Punctuation")]
-#[pyo3(text_signature = "(self, behavior=\"isolated\")")]
pub struct PyPunctuation {}
#[pymethods]
impl PyPunctuation {
    #[new]
-    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)))]
+    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)), text_signature = "(self, behavior=\"isolated\")")]
    fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
        (PyPunctuation {}, Punctuation::new(behavior.into()).into())
    }
@@ -427,11 +425,11 @@ impl PyPunctuation {
/// This pre-tokenizer composes other pre_tokenizers and applies them in sequence
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Sequence")]
-#[pyo3(text_signature = "(self, pretokenizers)")]
pub struct PySequence {}
#[pymethods]
impl PySequence {
    #[new]
+    #[pyo3(text_signature = "(self, pretokenizers)")]
    fn new(pre_tokenizers: &PyList) -> PyResult<(Self, PyPreTokenizer)> {
        let mut sequence = Vec::with_capacity(pre_tokenizers.len());
        for n in pre_tokenizers.iter() {
@@ -468,7 +466,6 @@ impl PySequence {
/// Whether to add a space to the first word if there isn't already one. This
/// lets us treat `hello` exactly like `say hello`.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")] -#[pyo3(text_signature = "(self, replacement=\"_\", add_prefix_space=True)")] pub struct PyMetaspace {} #[pymethods] impl PyMetaspace { @@ -493,7 +490,7 @@ impl PyMetaspace { } #[new] - #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs))] + #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")] fn new( replacement: PyChar, add_prefix_space: bool, @@ -518,7 +515,6 @@ impl PyMetaspace { /// /// "Call 123 please" -> "Call ", "123", " please" #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Digits")] -#[pyo3(text_signature = "(self, individual_digits=False)")] pub struct PyDigits {} #[pymethods] impl PyDigits { @@ -533,7 +529,7 @@ impl PyDigits { } #[new] - #[pyo3(signature = (individual_digits = false))] + #[pyo3(signature = (individual_digits = false), text_signature = "(self, individual_digits=False)")] fn new(individual_digits: bool) -> (Self, PyPreTokenizer) { (PyDigits {}, Digits::new(individual_digits).into()) } @@ -544,11 +540,11 @@ impl PyDigits { /// Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too. /// This mimicks SentencePiece Unigram implementation. #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "UnicodeScripts")] -#[pyo3(text_signature = "(self)")] pub struct PyUnicodeScripts {} #[pymethods] impl PyUnicodeScripts { #[new] + #[pyo3(text_signature = "(self)")] fn new() -> (Self, PyPreTokenizer) { (PyUnicodeScripts {}, UnicodeScripts::new().into()) } diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index 3a2cbdde..3600deec 100644 --- a/bindings/python/src/processors.rs +++ b/bindings/python/src/processors.rs @@ -154,11 +154,11 @@ impl PyPostProcessor { /// cls (:obj:`Tuple[str, int]`): /// A tuple with the string representation of the CLS token, and its id #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "BertProcessing")] -#[pyo3(text_signature = "(self, sep, cls)")] pub struct PyBertProcessing {} #[pymethods] impl PyBertProcessing { #[new] + #[pyo3(text_signature = "(self, sep, cls)")] fn new(sep: (String, u32), cls: (String, u32)) -> (Self, PyPostProcessor) { ( PyBertProcessing {}, @@ -196,12 +196,11 @@ impl PyBertProcessing { /// Whether the add_prefix_space option was enabled during pre-tokenization. This /// is relevant because it defines the way the offsets are trimmed out. #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "RobertaProcessing")] -#[pyo3(text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")] pub struct PyRobertaProcessing {} #[pymethods] impl PyRobertaProcessing { #[new] - #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true))] + #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true), text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")] fn new( sep: (String, u32), cls: (String, u32), @@ -231,12 +230,11 @@ impl PyRobertaProcessing { /// trim_offsets (:obj:`bool`): /// Whether to trim the whitespaces from the produced offsets. 
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")] -#[pyo3(text_signature = "(self, trim_offsets=True)")] pub struct PyByteLevel {} #[pymethods] impl PyByteLevel { #[new] - #[pyo3(signature = (trim_offsets = None, **_kwargs))] + #[pyo3(signature = (trim_offsets = None, **_kwargs), text_signature = "(self, trim_offsets=True)")] fn new(trim_offsets: Option, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) { let mut byte_level = ByteLevel::default(); @@ -383,12 +381,11 @@ impl FromPyObject<'_> for PyTemplate { /// The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have /// the same length. #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "TemplateProcessing")] -#[pyo3(text_signature = "(self, single, pair, special_tokens)")] pub struct PyTemplateProcessing {} #[pymethods] impl PyTemplateProcessing { #[new] - #[pyo3(signature = (single = None, pair = None, special_tokens = None))] + #[pyo3(signature = (single = None, pair = None, special_tokens = None), text_signature = "(self, single, pair, special_tokens)")] fn new( single: Option, pair: Option, @@ -422,12 +419,11 @@ impl PyTemplateProcessing { /// processors (:obj:`List[PostProcessor]`) /// The processors that need to be chained #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "Sequence")] -#[pyo3(text_signature = "(self, processors)")] pub struct PySequence {} #[pymethods] impl PySequence { #[new] - #[pyo3(signature = (processors_py))] + #[pyo3(signature = (processors_py), text_signature = "(self, processors)")] fn new(processors_py: &PyList) -> (Self, PyPostProcessor) { let mut processors: Vec = Vec::with_capacity(processors_py.len()); for n in processors_py.iter() { diff --git a/bindings/python/src/token.rs b/bindings/python/src/token.rs index f1db9974..86e92602 100644 --- a/bindings/python/src/token.rs +++ b/bindings/python/src/token.rs @@ -20,6 +20,7 @@ impl From for Token { #[pymethods] impl PyToken { #[new] + #[pyo3(text_signature = None)] fn new(id: u32, value: String, offsets: (usize, usize)) -> PyToken { Token::new(id, value, offsets).into() } diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index f66f5d2d..832b993c 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -56,9 +56,6 @@ use crate::utils::{MaybeSizedIterator, PyBufferedIterator}; /// Yesterday"``. 
///
#[pyclass(dict, module = "tokenizers", name = "AddedToken")]
-#[pyo3(
-    text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"
-)]
pub struct PyAddedToken {
    pub content: String,
    pub is_special_token: bool,
@@ -128,7 +125,7 @@ impl From for PyAddedToken {
#[pymethods]
impl PyAddedToken {
    #[new]
-    #[pyo3(signature = (content=None, **kwargs))]
+    #[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")]
    fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult {
        let mut token = PyAddedToken::from(content.unwrap_or(""), None);
@@ -441,7 +438,6 @@ type Tokenizer = TokenizerImpl) -> Self {
        PyTokenizer::from_model(model.clone())
    }
diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs
index 3a22f3b3..98d58165 100644
--- a/bindings/python/src/trainers.rs
+++ b/bindings/python/src/trainers.rs
@@ -299,7 +299,7 @@ impl PyBpeTrainer {
    }
    #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(signature = (**kwargs), text_signature = None)]
    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
        let mut builder = tk::models::bpe::BpeTrainer::builder();
        if let Some(kwargs) = kwargs {
@@ -383,9 +383,6 @@ impl PyBpeTrainer {
/// end_of_word_suffix (:obj:`str`, `optional`):
/// A suffix to be used for every subword that is a end-of-word.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
-#[pyo3(
-    text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
-)]
pub struct PyWordPieceTrainer {}
#[pymethods]
impl PyWordPieceTrainer {
@@ -506,7 +503,10 @@ impl PyWordPieceTrainer {
    }
    #[new]
-    #[pyo3(signature = (** kwargs))]
+    #[pyo3(
+        signature = (** kwargs),
+        text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
+    )]
    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
        let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
        if let Some(kwargs) = kwargs {
@@ -646,7 +646,7 @@ impl PyWordLevelTrainer {
    }
    #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(signature = (**kwargs), text_signature = None)]
    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
        let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
@@ -731,9 +731,6 @@ impl PyWordLevelTrainer {
/// The number of iterations of the EM algorithm to perform before
/// pruning the vocabulary.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
-#[pyo3(
-    text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
-)]
pub struct PyUnigramTrainer {}
#[pymethods]
impl PyUnigramTrainer {
@@ -814,7 +811,10 @@ impl PyUnigramTrainer {
    }
    #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(
+        signature = (**kwargs),
+        text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
+    )]
    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
        let mut builder = tk::models::unigram::UnigramTrainer::builder();
        if let Some(kwargs) = kwargs {
diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs
index af696c63..11a06856 100644
--- a/bindings/python/src/utils/normalization.rs
+++ b/bindings/python/src/utils/normalization.rs
@@ -200,6 +200,7 @@ pub struct PyNormalizedString {
#[pymethods]
impl PyNormalizedString {
    #[new]
+    #[pyo3(text_signature = None)]
    fn new(s: &str) -> Self {
        NormalizedString::from(s).into()
    }
diff --git a/bindings/python/src/utils/pretokenization.rs b/bindings/python/src/utils/pretokenization.rs
index 2f8fb00b..a93560ab 100644
--- a/bindings/python/src/utils/pretokenization.rs
+++ b/bindings/python/src/utils/pretokenization.rs
@@ -148,7 +148,6 @@ fn to_encoding(
/// sequence: str:
/// The string sequence used to initialize this PreTokenizedString
#[pyclass(module = "tokenizers", name = "PreTokenizedString")]
-#[pyo3(text_signature = "(self, sequence)")]
pub struct PyPreTokenizedString {
    pub(crate) pretok: tk::PreTokenizedString,
}
@@ -168,6 +167,7 @@ impl From for PreTokenizedString {
#[pymethods]
impl PyPreTokenizedString {
    #[new]
+    #[pyo3(text_signature = "(self, sequence)")]
    fn new(s: &str) -> Self {
        PreTokenizedString::from(s).into()
    }
diff --git a/bindings/python/src/utils/regex.rs b/bindings/python/src/utils/regex.rs
index 9e0d4240..82893ca7 100644
--- a/bindings/python/src/utils/regex.rs
+++ b/bindings/python/src/utils/regex.rs
@@ -4,7 +4,6 @@ use pyo3::prelude::*;
/// Instantiate a new Regex with the given pattern
#[pyclass(module = "tokenizers", name = "Regex")]
-#[pyo3(text_signature = "(self, pattern)")]
pub struct PyRegex {
    pub inner: Regex,
    pub pattern: String,
@@ -13,6 +12,7 @@ pub struct PyRegex {
#[pymethods]
impl PyRegex {
    #[new]
+    #[pyo3(text_signature = "(self, pattern)")]
    fn new(s: &str) -> PyResult {
        Ok(Self {
            inner: Regex::new(s)
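Note on the recurring pattern: aside from the Cargo.toml version bumps, every hunk above does the same thing for the pyo3 0.19 upgrade, which deprecates `#[pyo3(text_signature = ...)]` on `#[pyclass]` structs. The attribute either moves onto the `#[new]` constructor (next to `signature`) or is explicitly silenced with `text_signature = None`. A minimal sketch of the before/after shape, using a hypothetical `Dummy` class that is not part of this patch:

// Hypothetical example only; not taken from the patch above.
use pyo3::prelude::*;

#[pyclass(module = "example", name = "Dummy")]
// pyo3 0.18 style, now deprecated:
// #[pyo3(text_signature = "(self, flag=True)")]
pub struct Dummy {}

#[pymethods]
impl Dummy {
    #[new]
    // pyo3 0.19 style: `signature` and `text_signature` sit together on `#[new]`.
    #[pyo3(signature = (flag = true), text_signature = "(self, flag=True)")]
    fn new(flag: bool) -> Self {
        let _ = flag; // default handling elided in this sketch
        Dummy {}
    }
}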