From 5c18ec5ff5088a1d9eef77086321f94052cf7af0 Mon Sep 17 00:00:00 2001 From: mert-kurttutan Date: Wed, 8 Mar 2023 11:27:47 +0100 Subject: [PATCH] pyo3 v0.18 migration (#1173) * pyo v0.18 migration * Fix formatting issues of black --- bindings/python/Cargo.toml | 6 +- bindings/python/examples/custom_components.py | 2 +- .../python/examples/train_with_datasets.py | 1 + .../implementations/bert_wordpiece.py | 1 - .../py_src/tokenizers/models/__init__.pyi | 55 +++++++++++++++++++ bindings/python/src/decoders.rs | 21 +++---- bindings/python/src/encoding.rs | 15 +++-- bindings/python/src/models.rs | 13 +++-- bindings/python/src/normalizers.rs | 14 ++--- bindings/python/src/pre_tokenizers.rs | 10 ++-- bindings/python/src/processors.rs | 10 ++-- bindings/python/src/tokenizer.rs | 34 ++++++------ bindings/python/src/trainers.rs | 16 +++--- bindings/python/src/utils/pretokenization.rs | 20 +++---- bindings/python/tests/test_serialization.py | 2 +- 15 files changed, 138 insertions(+), 82 deletions(-) diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 86b1b036..5f7c6c55 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]} serde_json = "1.0" libc = "0.2" env_logger = "0.7.1" -pyo3 = "0.17.2" -numpy = "0.17.2" +pyo3 = "0.18.1" +numpy = "0.18.0" ndarray = "0.13" onig = { version = "6.0", default-features = false } itertools = "0.9" @@ -26,7 +26,7 @@ path = "../../tokenizers" [dev-dependencies] tempfile = "3.1" -pyo3 = { version = "0.17.2", features = ["auto-initialize"] } +pyo3 = { version = "0.18.1", features = ["auto-initialize"] } [features] default = ["pyo3/extension-module"] diff --git a/bindings/python/examples/custom_components.py b/bindings/python/examples/custom_components.py index 0570ec2b..cdb97309 100644 --- a/bindings/python/examples/custom_components.py +++ b/bindings/python/examples/custom_components.py @@ -24,7 +24,7 @@ class JiebaPreTokenizer: # Just an odd example... 
splits = [] last = 0 - for (i, char) in enumerate(str(normalized_string)): + for i, char in enumerate(str(normalized_string)): if char.isnumeric() and int(char) % 2 == 1: splits.append(normalized_string[last:i]) last = i diff --git a/bindings/python/examples/train_with_datasets.py b/bindings/python/examples/train_with_datasets.py index e169b502..7f95ccd2 100644 --- a/bindings/python/examples/train_with_datasets.py +++ b/bindings/python/examples/train_with_datasets.py @@ -11,6 +11,7 @@ bpe_tokenizer.normalizer = normalizers.Lowercase() # Initialize a dataset dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1") + # Build an iterator over this dataset def batch_iterator(): batch_length = 1000 diff --git a/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py b/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py index c7fe6762..1f34e3ca 100644 --- a/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py +++ b/bindings/python/py_src/tokenizers/implementations/bert_wordpiece.py @@ -26,7 +26,6 @@ class BertWordPieceTokenizer(BaseTokenizer): lowercase: bool = True, wordpieces_prefix: str = "##", ): - if vocab is not None: tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token))) else: diff --git a/bindings/python/py_src/tokenizers/models/__init__.pyi b/bindings/python/py_src/tokenizers/models/__init__.pyi index a471ba81..d75c7c7e 100644 --- a/bindings/python/py_src/tokenizers/models/__init__.pyi +++ b/bindings/python/py_src/tokenizers/models/__init__.pyi @@ -9,6 +9,17 @@ class Model: This class cannot be constructed directly. Please use one of the concrete models. """ + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID @@ -134,6 +145,17 @@ class BPE(Model): :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files """ pass + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID @@ -222,6 +244,17 @@ class Unigram(Model): def __init__(self, vocab): pass + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID @@ -316,6 +349,17 @@ class WordLevel(Model): :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file """ pass + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. 
+ + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID @@ -428,6 +472,17 @@ class WordPiece(Model): :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file """ pass + def get_trainer(self): + """ + Get the associated :class:`~tokenizers.trainers.Trainer` + + Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this + :class:`~tokenizers.models.Model`. + + Returns: + :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + """ + pass def id_to_token(self, id): """ Get the token associated to an ID diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index b6f8c031..5c3e4487 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -149,7 +149,8 @@ pub struct PyByteLevelDec {} #[pymethods] impl PyByteLevelDec { #[new] - fn new() -> (Self, PyDecoder) { + #[pyo3(signature = (**_kwargs))] + fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) { (PyByteLevelDec {}, ByteLevel::default().into()) } } @@ -189,7 +190,7 @@ impl PyWordPieceDec { } #[new] - #[args(prefix = "String::from(\"##\")", cleanup = "true")] + #[pyo3(signature = (prefix = String::from("##"), cleanup = true))] fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) { (PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into()) } @@ -231,7 +232,7 @@ impl PyMetaspaceDec { } #[new] - #[args(replacement = "PyChar('▁')", add_prefix_space = "true")] + #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true))] fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) { ( PyMetaspaceDec {}, @@ -262,7 +263,7 @@ impl PyBPEDecoder { } #[new] - #[args(suffix = "String::from(\"\")")] + #[pyo3(signature = (suffix = String::from("")))] fn new(suffix: String) -> (Self, PyDecoder) { (PyBPEDecoder {}, BPEDecoder::new(suffix).into()) } @@ -314,11 +315,11 @@ impl PyCTCDecoder { } #[new] - #[args( - pad_token = "String::from(\"\")", - word_delimiter_token = "String::from(\"|\")", - cleanup = "true" - )] + #[pyo3(signature = ( + pad_token = String::from(""), + word_delimiter_token = String::from("|"), + cleanup = true + ))] fn new(pad_token: String, word_delimiter_token: String, cleanup: bool) -> (Self, PyDecoder) { ( PyCTCDecoder {}, @@ -338,7 +339,7 @@ pub struct PySequenceDecoder {} #[pymethods] impl PySequenceDecoder { #[new] - #[args(decoders)] + #[pyo3(signature = (decoders_py))] fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> { let mut decoders: Vec = Vec::with_capacity(decoders_py.len()); for decoder_py in decoders_py.iter() { diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs index 1b5c0b85..4cbd65b8 100644 --- a/bindings/python/src/encoding.rs +++ b/bindings/python/src/encoding.rs @@ -78,7 +78,7 @@ impl PyEncoding { /// Returns: /// :class:`~tokenizers.Encoding`: The resulting Encoding #[staticmethod] - #[args(growing_offsets = true)] + #[pyo3(signature = (encodings, growing_offsets = true))] #[pyo3(text_signature = "(encodings, growing_offsets=True)")] fn merge(encodings: Vec>, growing_offsets: bool) -> PyEncoding { tk::tokenizer::Encoding::merge( @@ -263,7 +263,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` - #[args(sequence_index = 0)] + #[pyo3(signature = (word_index, sequence_index = 0))] #[pyo3(text_signature = "(self, word_index, sequence_index=0)")] fn word_to_tokens(&self, 
word_index: u32, sequence_index: usize) -> Option<(usize, usize)> { self.encoding.word_to_tokens(word_index, sequence_index) @@ -279,7 +279,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` - #[args(sequence_index = 0)] + #[pyo3(signature = (word_index, sequence_index = 0))] #[pyo3(text_signature = "(self, word_index, sequence_index=0)")] fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option { self.encoding.word_to_chars(word_index, sequence_index) @@ -347,7 +347,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`int`: The index of the token that contains this char in the encoded sequence - #[args(sequence_index = 0)] + #[pyo3(signature = (char_pos, sequence_index = 0))] #[pyo3(text_signature = "(self, char_pos, sequence_index=0)")] fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option { self.encoding.char_to_token(char_pos, sequence_index) @@ -363,7 +363,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`int`: The index of the word that contains this char in the input sequence - #[args(sequence_index = 0)] + #[pyo3(signature = (char_pos, sequence_index = 0))] #[pyo3(text_signature = "(self, char_pos, sequence_index=0)")] fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option { self.encoding.char_to_word(char_pos, sequence_index) @@ -386,7 +386,7 @@ impl PyEncoding { /// /// pad_token (:obj:`str`, defaults to `[PAD]`): /// The pad token to use - #[args(kwargs = "**")] + #[pyo3(signature = (length, **kwargs))] #[pyo3( text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')" )] @@ -439,8 +439,7 @@ impl PyEncoding { /// /// direction (:obj:`str`, defaults to :obj:`right`): /// Truncate direction - #[args(stride = "0")] - #[args(direction = "\"right\"")] + #[pyo3(signature = (max_length, stride = 0, direction = "right"))] #[pyo3(text_signature = "(self, max_length, stride=0, direction='right')")] fn truncate(&mut self, max_length: usize, stride: usize, direction: &str) -> PyResult<()> { let tdir = match direction { diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index c1848ed7..22a518dd 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -215,6 +215,7 @@ impl PyModel { /// /// Returns: /// :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model + #[pyo3(text_signature = "(self)")] fn get_trainer(&self, py: Python<'_>) -> PyResult { PyTrainer::from(self.model.read().unwrap().get_trainer()).get_as_subtype(py) } @@ -385,7 +386,7 @@ impl PyBPE { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (vocab=None, merges=None, **kwargs))] fn new( py: Python<'_>, vocab: Option, @@ -472,7 +473,7 @@ impl PyBPE { /// Returns: /// :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files #[classmethod] - #[args(kwargs = "**")] + #[pyo3(signature = (vocab, merges, **kwargs))] #[pyo3(text_signature = "(cls, vocab, merge, **kwargs)")] fn from_file( _cls: &PyType, @@ -582,7 +583,7 @@ impl PyWordPiece { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (vocab=None, **kwargs))] fn new( py: Python<'_>, vocab: Option, @@ -648,7 +649,7 @@ impl PyWordPiece { /// Returns: /// :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file #[classmethod] - #[args(kwargs = "**")] + #[pyo3(signature = (vocab, **kwargs))] #[pyo3(text_signature = "(vocab, **kwargs)")] fn from_file( _cls: &PyType, @@ -693,7 +694,7 @@ impl PyWordLevel { } 
#[new] - #[args(unk_token = "None")] + #[pyo3(signature = (vocab=None, unk_token = None))] fn new( py: Python<'_>, vocab: Option, @@ -768,7 +769,7 @@ impl PyWordLevel { /// Returns: /// :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file #[classmethod] - #[args(unk_token = "None")] + #[pyo3(signature = (vocab, unk_token = None))] #[pyo3(text_signature = "(vocab, unk_token)")] fn from_file( _cls: &PyType, diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs index 956f865b..d825482d 100644 --- a/bindings/python/src/normalizers.rs +++ b/bindings/python/src/normalizers.rs @@ -267,12 +267,12 @@ impl PyBertNormalizer { } #[new] - #[args( - clean_text = "true", - handle_chinese_chars = "true", - strip_accents = "None", - lowercase = "true" - )] + #[pyo3(signature = ( + clean_text = true, + handle_chinese_chars = true, + strip_accents = None, + lowercase = true + ))] fn new( clean_text: bool, handle_chinese_chars: bool, @@ -407,7 +407,7 @@ impl PyStrip { } #[new] - #[args(left = "true", right = "true")] + #[pyo3(signature = (left = true, right = true))] fn new(left: bool, right: bool) -> (Self, PyNormalizer) { (PyStrip {}, Strip::new(left, right).into()) } diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 71f05d7e..18af23d5 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -260,7 +260,7 @@ impl PyByteLevel { } #[new] - #[args(add_prefix_space = "true", use_regex = "true", _kwargs = "**")] + #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs))] fn new( add_prefix_space: bool, use_regex: bool, @@ -340,7 +340,7 @@ pub struct PySplit {} #[pymethods] impl PySplit { #[new] - #[args(invert = false)] + #[pyo3(signature = (pattern, behavior, invert = false))] fn new( pattern: PyPattern, behavior: PySplitDelimiterBehavior, @@ -419,7 +419,7 @@ pub struct PyPunctuation {} #[pymethods] impl PyPunctuation { #[new] - #[args(behavior = "PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)")] + #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)))] fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) { (PyPunctuation {}, Punctuation::new(behavior.into()).into()) } @@ -493,7 +493,7 @@ impl PyMetaspace { } #[new] - #[args(replacement = "PyChar('▁')", add_prefix_space = "true", _kwargs = "**")] + #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs))] fn new( replacement: PyChar, add_prefix_space: bool, @@ -533,7 +533,7 @@ impl PyDigits { } #[new] - #[args(individual_digits = false)] + #[pyo3(signature = (individual_digits = false))] fn new(individual_digits: bool) -> (Self, PyPreTokenizer) { (PyDigits {}, Digits::new(individual_digits).into()) } diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index 442fa034..3a2cbdde 100644 --- a/bindings/python/src/processors.rs +++ b/bindings/python/src/processors.rs @@ -123,7 +123,7 @@ impl PyPostProcessor { /// /// Return: /// :class:`~tokenizers.Encoding`: The final encoding - #[args(pair = "None", add_special_tokens = "true")] + #[pyo3(signature = (encoding, pair = None, add_special_tokens = true))] #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")] fn process( &self, @@ -201,7 +201,7 @@ pub struct PyRobertaProcessing {} #[pymethods] impl PyRobertaProcessing { #[new] - #[args(trim_offsets = true, add_prefix_space = true)] + #[pyo3(signature = (sep, 
cls, trim_offsets = true, add_prefix_space = true))] fn new( sep: (String, u32), cls: (String, u32), @@ -236,7 +236,7 @@ pub struct PyByteLevel {} #[pymethods] impl PyByteLevel { #[new] - #[args(trim_offsets = "None", _kwargs = "**")] + #[pyo3(signature = (trim_offsets = None, **_kwargs))] fn new(trim_offsets: Option, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) { let mut byte_level = ByteLevel::default(); @@ -388,7 +388,7 @@ pub struct PyTemplateProcessing {} #[pymethods] impl PyTemplateProcessing { #[new] - #[args(single = "None", pair = "None", special_tokens = "None")] + #[pyo3(signature = (single = None, pair = None, special_tokens = None))] fn new( single: Option, pair: Option, @@ -427,7 +427,7 @@ pub struct PySequence {} #[pymethods] impl PySequence { #[new] - #[args(processors)] + #[pyo3(signature = (processors_py))] fn new(processors_py: &PyList) -> (Self, PyPostProcessor) { let mut processors: Vec = Vec::with_capacity(processors_py.len()); for n in processors_py.iter() { diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index 3a5a8e60..95a954a2 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -128,7 +128,7 @@ impl From for PyAddedToken { #[pymethods] impl PyAddedToken { #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (content=None, **kwargs))] fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult { let mut token = PyAddedToken::from(content.unwrap_or(""), None); @@ -308,7 +308,7 @@ impl FromPyObject<'_> for PyArrayUnicode { ); let py = ob.py(); let obj = PyObject::from_owned_ptr(py, unicode); - let s = obj.cast_as::(py)?; + let s = obj.downcast::(py)?; Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned()) }) .collect::>>()?; @@ -332,7 +332,7 @@ impl FromPyObject<'_> for PyArrayStr { .as_array() .iter() .map(|obj| { - let s = obj.cast_as::(ob.py())?; + let s = obj.downcast::(ob.py())?; Ok(s.to_string_lossy().into_owned()) }) .collect::>>()?; @@ -562,7 +562,7 @@ impl PyTokenizer { /// Returns: /// :class:`~tokenizers.Tokenizer`: The new tokenizer #[staticmethod] - #[args(revision = "String::from(\"main\")", auth_token = "None")] + #[pyo3(signature = (identifier, revision = String::from("main"), auth_token = None))] #[pyo3(text_signature = "(identifier, revision=\"main\", auth_token=None)")] fn from_pretrained( identifier: &str, @@ -591,7 +591,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`str`: A string representing the serialized Tokenizer - #[args(pretty = false)] + #[pyo3(signature = (pretty = false))] #[pyo3(text_signature = "(self, pretty=False)")] fn to_str(&self, pretty: bool) -> PyResult { ToPyResult(self.tokenizer.to_string(pretty)).into() @@ -605,7 +605,7 @@ impl PyTokenizer { /// /// pretty (:obj:`bool`, defaults to :obj:`True`): /// Whether the JSON file should be pretty formatted. 
- #[args(pretty = true)] + #[pyo3(signature = (path, pretty = true))] #[pyo3(text_signature = "(self, path, pretty=True)")] fn save(&self, path: &str, pretty: bool) -> PyResult<()> { ToPyResult(self.tokenizer.save(path, pretty)).into() @@ -629,7 +629,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`Dict[str, int]`: The vocabulary - #[args(with_added_tokens = true)] + #[pyo3(signature = (with_added_tokens = true))] #[pyo3(text_signature = "(self, with_added_tokens=True)")] fn get_vocab(&self, with_added_tokens: bool) -> HashMap { self.tokenizer.get_vocab(with_added_tokens) @@ -643,7 +643,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`int`: The size of the vocabulary - #[args(with_added_tokens = true)] + #[pyo3(signature = (with_added_tokens = true))] #[pyo3(text_signature = "(self, with_added_tokens=True)")] fn get_vocab_size(&self, with_added_tokens: bool) -> usize { self.tokenizer.get_vocab_size(with_added_tokens) @@ -665,7 +665,7 @@ impl PyTokenizer { /// /// direction (:obj:`str`, defaults to :obj:`right`): /// Truncate direction - #[args(kwargs = "**")] + #[pyo3(signature = (max_length, **kwargs))] #[pyo3( text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')" )] @@ -767,7 +767,7 @@ impl PyTokenizer { /// length (:obj:`int`, `optional`): /// If specified, the length at which to pad. If not specified we pad using the size of /// the longest sequence in a batch. - #[args(kwargs = "**")] + #[pyo3(signature = (**kwargs))] #[pyo3( text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)" )] @@ -896,7 +896,7 @@ impl PyTokenizer { /// Returns: /// :class:`~tokenizers.Encoding`: The encoded result /// - #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")] + #[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true))] #[pyo3( text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)" )] @@ -963,7 +963,7 @@ impl PyTokenizer { /// Returns: /// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch /// - #[args(is_pretokenized = "false", add_special_tokens = "true")] + #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))] #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")] fn encode_batch( &self, @@ -1006,7 +1006,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`str`: The decoded string - #[args(skip_special_tokens = true)] + #[pyo3(signature = (ids, skip_special_tokens = true))] #[pyo3(text_signature = "(self, ids, skip_special_tokens=True)")] fn decode(&self, ids: Vec, skip_special_tokens: bool) -> PyResult { ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into() @@ -1023,7 +1023,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`List[str]`: A list of decoded strings - #[args(skip_special_tokens = true)] + #[pyo3(signature = (sequences, skip_special_tokens = true))] #[pyo3(text_signature = "(self, sequences, skip_special_tokens=True)")] fn decode_batch( &self, @@ -1144,7 +1144,7 @@ impl PyTokenizer { /// /// trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`): /// An optional trainer that should be used to train our Model - #[args(trainer = "None")] + #[pyo3(signature = (files, trainer = None))] #[pyo3(text_signature = "(self, files, trainer = None)")] fn train(&mut self, files: Vec, trainer: Option<&mut PyTrainer>) -> PyResult<()> { let mut trainer = @@ -1180,7 +1180,7 @@ impl PyTokenizer { /// 
length (:obj:`int`, `optional`): /// The total number of sequences in the iterator. This is used to /// provide meaningful progress tracking - #[args(trainer = "None", length = "None")] + #[pyo3(signature = (iterator, trainer = None, length = None))] #[pyo3(text_signature = "(self, iterator, trainer=None, length=None)")] fn train_from_iterator( &mut self, @@ -1246,7 +1246,7 @@ impl PyTokenizer { /// /// Returns: /// :class:`~tokenizers.Encoding`: The final post-processed encoding - #[args(pair = "None", add_special_tokens = true)] + #[pyo3(signature = (encoding, pair = None, add_special_tokens = true))] #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")] fn post_process( &self, diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs index 9892c3f8..b72720f4 100644 --- a/bindings/python/src/trainers.rs +++ b/bindings/python/src/trainers.rs @@ -283,7 +283,7 @@ impl PyBpeTrainer { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (**kwargs))] pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::bpe::BpeTrainer::builder(); if let Some(kwargs) = kwargs { @@ -295,7 +295,7 @@ impl PyBpeTrainer { "show_progress" => builder = builder.show_progress(val.extract()?), "special_tokens" => { builder = builder.special_tokens( - val.cast_as::()? + val.downcast::()? .into_iter() .map(|token| { if let Ok(content) = token.extract::() { @@ -489,7 +489,7 @@ impl PyWordPieceTrainer { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (** kwargs))] pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::wordpiece::WordPieceTrainer::builder(); if let Some(kwargs) = kwargs { @@ -501,7 +501,7 @@ impl PyWordPieceTrainer { "show_progress" => builder = builder.show_progress(val.extract()?), "special_tokens" => { builder = builder.special_tokens( - val.cast_as::()? + val.downcast::()? .into_iter() .map(|token| { if let Ok(content) = token.extract::() { @@ -629,7 +629,7 @@ impl PyWordLevelTrainer { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (**kwargs))] pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::wordlevel::WordLevelTrainer::builder(); @@ -648,7 +648,7 @@ impl PyWordLevelTrainer { } "special_tokens" => { builder.special_tokens( - val.cast_as::()? + val.downcast::()? .into_iter() .map(|token| { if let Ok(content) = token.extract::() { @@ -797,7 +797,7 @@ impl PyUnigramTrainer { } #[new] - #[args(kwargs = "**")] + #[pyo3(signature = (**kwargs))] pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> { let mut builder = tk::models::unigram::UnigramTrainer::builder(); if let Some(kwargs) = kwargs { @@ -821,7 +821,7 @@ impl PyUnigramTrainer { ) } "special_tokens" => builder.special_tokens( - val.cast_as::()? + val.downcast::()? 
.into_iter() .map(|token| { if let Ok(content) = token.extract::() { diff --git a/bindings/python/src/utils/pretokenization.rs b/bindings/python/src/utils/pretokenization.rs index fb692c77..2f8fb00b 100644 --- a/bindings/python/src/utils/pretokenization.rs +++ b/bindings/python/src/utils/pretokenization.rs @@ -223,7 +223,7 @@ impl PyPreTokenizedString { /// /// Returns: /// An Encoding - #[args(type_id = "0", word_idx = "None")] + #[pyo3(signature = (type_id = 0, word_idx = None))] #[pyo3(text_signature = "(self, type_id=0, word_idx=None)")] fn to_encoding(&self, type_id: u32, word_idx: Option) -> PyResult { to_encoding(&self.pretok, type_id, word_idx) @@ -245,10 +245,10 @@ impl PyPreTokenizedString { /// /// Returns /// A list of splits - #[args( - offset_referential = "PyOffsetReferential(OffsetReferential::Original)", - offset_type = "PyOffsetType(OffsetType::Char)" - )] + #[pyo3(signature = ( + offset_referential = PyOffsetReferential(OffsetReferential::Original), + offset_type = PyOffsetType(OffsetType::Char) + ))] #[pyo3(text_signature = "(self, offset_referential=\"original\", offset_type=\"char\")")] fn get_splits( &self, @@ -307,17 +307,17 @@ impl PyPreTokenizedStringRefMut { .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)? } - #[args(type_id = "0", word_idx = "None")] + #[pyo3(signature = (type_id = 0, word_idx = None))] fn to_encoding(&self, type_id: u32, word_idx: Option) -> PyResult { self.inner .map(|pretok| to_encoding(pretok, type_id, word_idx)) .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)? } - #[args( - offset_referential = "PyOffsetReferential(OffsetReferential::Original)", - offset_type = "PyOffsetType(OffsetType::Char)" - )] + #[pyo3(signature = ( + offset_referential = PyOffsetReferential(OffsetReferential::Original), + offset_type = PyOffsetType(OffsetType::Char) + ))] fn get_splits( &self, offset_referential: PyOffsetReferential, diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py index 75011725..2057d763 100644 --- a/bindings/python/tests/test_serialization.py +++ b/bindings/python/tests/test_serialization.py @@ -65,7 +65,7 @@ class TestFullDeserialization(unittest.TestCase): # all_models.append((model_id, filename)) all_models = [("HueyNemud/das22-10-camembert_pretrained", "tokenizer.json")] - for (model_id, filename) in tqdm.tqdm(all_models): + for model_id, filename in tqdm.tqdm(all_models): tokenizer_file = cached_download(hf_hub_url(model_id, filename=filename)) is_ok = check(tokenizer_file)
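
A minimal sketch of the migration pattern applied in every hunk above, condensed onto a hypothetical PyGreeting class (the class, its greet method, and the "Hello" default are illustrative only and do not appear in this patch): keyword defaults move out of the stringly-typed #[args(...)] attribute into #[pyo3(signature = (...))] as plain Rust expressions, and cast_as::<T>() calls are replaced by downcast::<T>(), exactly as done for the PyList and PyString casts in tokenizer.rs and trainers.rs.

use pyo3::prelude::*;
use pyo3::types::{PyDict, PyString};

// Hypothetical example type, not part of the applied diff; it only
// illustrates the two mechanical rewrites used throughout this patch.
#[pyclass]
struct PyGreeting {
    prefix: String,
}

#[pymethods]
impl PyGreeting {
    // pyo3 0.17 style: #[args(prefix = "String::from(\"Hello\")", _kwargs = "**")]
    // pyo3 0.18 style: every parameter is listed in `signature`, and defaults
    // are ordinary Rust expressions instead of quoted strings.
    #[new]
    #[pyo3(signature = (prefix = String::from("Hello"), **_kwargs))]
    fn new(prefix: String, _kwargs: Option<&PyDict>) -> Self {
        PyGreeting { prefix }
    }

    // Regular methods take the same attribute; `self` is not listed.
    #[pyo3(signature = (name, shout = false))]
    fn greet(&self, name: &PyAny, shout: bool) -> PyResult<String> {
        // `PyAny::cast_as::<PyString>()` becomes `downcast::<PyString>()`;
        // the resulting PyDowncastError converts into a PyErr via `?`.
        let name = name.downcast::<PyString>()?;
        let greeting = format!("{} {}", self.prefix, name.to_str()?);
        Ok(if shout { greeting.to_uppercase() } else { greeting })
    }
}

From Python, the defaults declared in the signature attribute behave as before, e.g. PyGreeting().greet("world") and PyGreeting(prefix="Hi").greet("world", shout=True); the change is purely in how the Rust side declares them.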