pyo3: update to 0.19 (#1322)

* Bump pyo3 dependency versions

* Fix deprecation warnings from pyo3

---------

Co-authored-by: Mike Lui <mikelui@meta.com>
Author: Michael Lui
Date: 2023-08-16 12:40:32 -04:00
Committed by: GitHub
Parent: 9a93c50c25
Commit: 540bf2eb01
13 changed files with 68 additions and 89 deletions
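For context: pyo3 0.19 deprecates the struct-level #[pyo3(text_signature = ...)] attribute on #[pyclass] types and starts auto-generating __text_signature__ from the Rust signature. That is why every hunk below either moves the string onto the #[new] constructor (merged into the existing signature = (...) attribute where one exists) or opts out with text_signature = None. The following is a minimal sketch of that pattern, not code from this commit; the Example and KwargsOnly classes and the module name are hypothetical, and it assumes pyo3 = { version = "0.19", features = ["extension-module"] } in Cargo.toml.

// Minimal sketch of the pyo3 0.19 pattern applied throughout this commit
// (hypothetical classes, not part of the tokenizers bindings).
use pyo3::prelude::*;
use pyo3::types::PyDict;

// pyo3 0.18 style, now deprecated:
//   #[pyclass(name = "Example")]
//   #[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")]
//   pub struct Example { ... }

#[pyclass(name = "Example")]
pub struct Example {
    #[pyo3(get)]
    prefix: String,
    #[pyo3(get)]
    cleanup: bool,
}

#[pymethods]
impl Example {
    // pyo3 0.19 style: text_signature lives on the constructor, next to signature.
    #[new]
    #[pyo3(
        signature = (prefix = String::from("##"), cleanup = true),
        text_signature = "(self, prefix=\"##\", cleanup=True)"
    )]
    fn new(prefix: String, cleanup: bool) -> Self {
        Example { prefix, cleanup }
    }
}

#[pyclass(name = "KwargsOnly")]
pub struct KwargsOnly {}

#[pymethods]
impl KwargsOnly {
    // Constructors that only take **kwargs opt out of the auto-generated
    // signature, as the trainer hunks below do.
    #[new]
    #[pyo3(signature = (**kwargs), text_signature = None)]
    fn new(kwargs: Option<&PyDict>) -> Self {
        let _ = kwargs; // a real constructor would parse the options here
        KwargsOnly {}
    }
}

#[pymodule]
fn example_module(_py: Python<'_>, m: &PyModule) -> PyResult<()> {
    m.add_class::<Example>()?;
    m.add_class::<KwargsOnly>()?;
    Ok(())
}

With the module built (e.g. via maturin), help(Example) in Python should then show the declared signature via __text_signature__, while KwargsOnly keeps __text_signature__ set to None.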

@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
serde_json = "1.0"
libc = "0.2"
env_logger = "0.7.1"
pyo3 = "0.18.1"
numpy = "0.18.0"
pyo3 = "0.19"
numpy = "0.19.0"
ndarray = "0.13"
onig = { version = "6.0", default-features = false }
itertools = "0.9"
@ -26,7 +26,7 @@ path = "../../tokenizers"
[dev-dependencies]
tempfile = "3.1"
pyo3 = { version = "0.18.1", features = ["auto-initialize"] }
pyo3 = { version = "0.19", features = ["auto-initialize"] }
[features]
default = ["pyo3/extension-module"]

@ -155,12 +155,11 @@ macro_rules! setter {
/// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
/// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteLevel")]
#[pyo3(text_signature = "(self)")]
pub struct PyByteLevelDec {}
#[pymethods]
impl PyByteLevelDec {
#[new]
#[pyo3(signature = (**_kwargs))]
#[pyo3(signature = (**_kwargs), text_signature = "(self)")]
fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) {
(PyByteLevelDec {}, ByteLevel::default().into())
}
@ -171,11 +170,11 @@ impl PyByteLevelDec {
/// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
/// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Replace")]
#[pyo3(text_signature = "(self, pattern, content)")]
pub struct PyReplaceDec {}
#[pymethods]
impl PyReplaceDec {
#[new]
#[pyo3(text_signature = "(self, pattern, content)")]
fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyDecoder)> {
Ok((
PyReplaceDec {},
@ -194,7 +193,6 @@ impl PyReplaceDec {
/// Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
/// and some abbreviated english forms.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "WordPiece")]
#[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")]
pub struct PyWordPieceDec {}
#[pymethods]
impl PyWordPieceDec {
@ -219,7 +217,7 @@ impl PyWordPieceDec {
}
#[new]
#[pyo3(signature = (prefix = String::from("##"), cleanup = true))]
#[pyo3(signature = (prefix = String::from("##"), cleanup = true), text_signature = "(self, prefix=\"##\", cleanup=True)")]
fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) {
(PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into())
}
@ -231,12 +229,11 @@ impl PyWordPieceDec {
/// cannot be decoded you will get <20> instead for each inconvertable byte token
///
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
#[pyo3(text_signature = "(self)")]
pub struct PyByteFallbackDec {}
#[pymethods]
impl PyByteFallbackDec {
#[new]
#[pyo3(signature = ())]
#[pyo3(signature = (), text_signature = "(self)")]
fn new() -> (Self, PyDecoder) {
(PyByteFallbackDec {}, ByteFallback::new().into())
}
@ -247,12 +244,11 @@ impl PyByteFallbackDec {
/// This is the last step of decoding, this decoder exists only if
/// there is need to add other decoders *after* the fusion
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Fuse")]
#[pyo3(text_signature = "(self)")]
pub struct PyFuseDec {}
#[pymethods]
impl PyFuseDec {
#[new]
#[pyo3(signature = ())]
#[pyo3(signature = (), text_signature = "(self)")]
fn new() -> (Self, PyDecoder) {
(PyFuseDec {}, Fuse::new().into())
}
@ -261,7 +257,6 @@ impl PyFuseDec {
/// Strip normalizer
/// Strips n left characters of each token, or n right characters of each token
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Strip")]
#[pyo3(text_signature = "(self, content, left=0, right=0)")]
pub struct PyStrip {}
#[pymethods]
impl PyStrip {
@ -296,7 +291,7 @@ impl PyStrip {
}
#[new]
#[pyo3(signature = (content=' ', left=0, right=0))]
#[pyo3(signature = (content=' ', left=0, right=0), text_signature = "(self, content, left=0, right=0)")]
fn new(content: char, left: usize, right: usize) -> (Self, PyDecoder) {
(PyStrip {}, Strip::new(content, left, right).into())
}
@ -313,7 +308,6 @@ impl PyStrip {
/// Whether to add a space to the first word if there isn't already one. This
/// lets us treat `hello` exactly like `say hello`.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")]
#[pyo3(text_signature = "(self, replacement = \"\", add_prefix_space = True)")]
pub struct PyMetaspaceDec {}
#[pymethods]
impl PyMetaspaceDec {
@ -338,7 +332,7 @@ impl PyMetaspaceDec {
}
#[new]
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true))]
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true), text_signature = "(self, replacement = \"\", add_prefix_space = True)")]
fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) {
(
PyMetaspaceDec {},
@ -354,7 +348,6 @@ impl PyMetaspaceDec {
/// The suffix that was used to caracterize an end-of-word. This suffix will
/// be replaced by whitespaces during the decoding
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
#[pyo3(text_signature = "(self, suffix=\"</w>\")")]
pub struct PyBPEDecoder {}
#[pymethods]
impl PyBPEDecoder {
@ -369,7 +362,7 @@ impl PyBPEDecoder {
}
#[new]
#[pyo3(signature = (suffix = String::from("</w>")))]
#[pyo3(signature = (suffix = String::from("</w>")), text_signature = "(self, suffix=\"</w>\")")]
fn new(suffix: String) -> (Self, PyDecoder) {
(PyBPEDecoder {}, BPEDecoder::new(suffix).into())
}
@ -386,7 +379,6 @@ impl PyBPEDecoder {
/// Whether to cleanup some tokenization artifacts.
/// Mainly spaces before punctuation, and some abbreviated english forms.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "CTC")]
#[pyo3(text_signature = "(self, pad_token=\"<pad>\", word_delimiter_token=\"|\", cleanup=True)")]
pub struct PyCTCDecoder {}
#[pymethods]
impl PyCTCDecoder {
@ -425,7 +417,8 @@ impl PyCTCDecoder {
pad_token = String::from("<pad>"),
word_delimiter_token = String::from("|"),
cleanup = true
))]
),
text_signature = "(self, pad_token=\"<pad>\", word_delimiter_token=\"|\", cleanup=True)")]
fn new(pad_token: String, word_delimiter_token: String, cleanup: bool) -> (Self, PyDecoder) {
(
PyCTCDecoder {},
@ -440,12 +433,11 @@ impl PyCTCDecoder {
/// decoders (:obj:`List[Decoder]`)
/// The decoders that need to be chained
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name="Sequence")]
#[pyo3(text_signature = "(self, decoders)")]
pub struct PySequenceDecoder {}
#[pymethods]
impl PySequenceDecoder {
#[new]
#[pyo3(signature = (decoders_py))]
#[pyo3(signature = (decoders_py), text_signature = "(self, decoders)")]
fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> {
let mut decoders: Vec<DecoderWrapper> = Vec::with_capacity(decoders_py.len());
for decoder_py in decoders_py.iter() {

@ -23,6 +23,7 @@ impl From<tk::tokenizer::Encoding> for PyEncoding {
#[pymethods]
impl PyEncoding {
#[new]
#[pyo3(text_signature = None)]
fn new() -> Self {
Self {
encoding: tk::tokenizer::Encoding::default(),

@ -89,6 +89,7 @@ where
#[pymethods]
impl PyModel {
#[new]
#[pyo3(text_signature = None)]
fn __new__() -> Self {
// Instantiate a default empty model. This doesn't really make sense, but we need
// to be able to instantiate an empty model for pickle capabilities.
@ -253,9 +254,6 @@ impl PyModel {
/// byte_fallback (:obj:`bool`, `optional`):
/// Whether to use spm byte-fallback trick (defaults to False)
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")]
#[pyo3(
text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)"
)]
pub struct PyBPE {}
impl PyBPE {
@ -400,7 +398,9 @@ impl PyBPE {
}
#[new]
#[pyo3(signature = (vocab=None, merges=None, **kwargs))]
#[pyo3(
signature = (vocab=None, merges=None, **kwargs),
text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)")]
fn new(
py: Python<'_>,
vocab: Option<PyVocab>,
@ -523,7 +523,6 @@ impl PyBPE {
/// max_input_chars_per_word (:obj:`int`, `optional`):
/// The maximum number of characters to authorize in a single word.
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordPiece")]
#[pyo3(text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
pub struct PyWordPiece {}
impl PyWordPiece {
@ -597,7 +596,7 @@ impl PyWordPiece {
}
#[new]
#[pyo3(signature = (vocab=None, **kwargs))]
#[pyo3(signature = (vocab=None, **kwargs), text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
fn new(
py: Python<'_>,
vocab: Option<PyVocab>,
@ -692,7 +691,6 @@ impl PyWordPiece {
/// unk_token (:obj:`str`, `optional`):
/// The unknown token to be used by the model.
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordLevel")]
#[pyo3(text_signature = "(self, vocab, unk_token)")]
pub struct PyWordLevel {}
#[pymethods]
@ -708,7 +706,7 @@ impl PyWordLevel {
}
#[new]
#[pyo3(signature = (vocab=None, unk_token = None))]
#[pyo3(signature = (vocab=None, unk_token = None), text_signature = "(self, vocab, unk_token)")]
fn new(
py: Python<'_>,
vocab: Option<PyVocab>,
@ -807,12 +805,12 @@ impl PyWordLevel {
/// vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
/// A list of vocabulary items and their relative score [("am", -0.2442),...]
#[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")]
#[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
pub struct PyUnigram {}
#[pymethods]
impl PyUnigram {
#[new]
#[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
fn new(
vocab: Option<Vec<(String, f64)>>,
unk_id: Option<usize>,

@ -217,9 +217,6 @@ macro_rules! setter {
/// lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
/// Whether to lowercase.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")]
#[pyo3(
text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"
)]
pub struct PyBertNormalizer {}
#[pymethods]
impl PyBertNormalizer {
@ -274,7 +271,8 @@ impl PyBertNormalizer {
handle_chinese_chars = true,
strip_accents = None,
lowercase = true
))]
),
text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)")]
fn new(
clean_text: bool,
handle_chinese_chars: bool,
@ -289,11 +287,11 @@ impl PyBertNormalizer {
/// NFD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFD")]
#[pyo3(text_signature = "(self)")]
pub struct PyNFD {}
#[pymethods]
impl PyNFD {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyNormalizer) {
(PyNFD {}, PyNormalizer::new(NFD.into()))
}
@ -301,11 +299,11 @@ impl PyNFD {
/// NFKD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKD")]
#[pyo3(text_signature = "(self)")]
pub struct PyNFKD {}
#[pymethods]
impl PyNFKD {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyNormalizer) {
(PyNFKD {}, NFKD.into())
}
@ -313,11 +311,11 @@ impl PyNFKD {
/// NFC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFC")]
#[pyo3(text_signature = "(self)")]
pub struct PyNFC {}
#[pymethods]
impl PyNFC {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyNormalizer) {
(PyNFC {}, NFC.into())
}
@ -325,11 +323,11 @@ impl PyNFC {
/// NFKC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKC")]
#[pyo3(text_signature = "(self)")]
pub struct PyNFKC {}
#[pymethods]
impl PyNFKC {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyNormalizer) {
(PyNFKC {}, NFKC.into())
}
@ -346,6 +344,7 @@ pub struct PySequence {}
#[pymethods]
impl PySequence {
#[new]
#[pyo3(text_signature = None)]
fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> {
let mut sequence = Vec::with_capacity(normalizers.len());
for n in normalizers.iter() {
@ -372,11 +371,11 @@ impl PySequence {
/// Lowercase Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Lowercase")]
#[pyo3(text_signature = "(self)")]
pub struct PyLowercase {}
#[pymethods]
impl PyLowercase {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyNormalizer) {
(PyLowercase {}, Lowercase.into())
}
@ -384,7 +383,6 @@ impl PyLowercase {
/// Strip normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Strip")]
#[pyo3(text_signature = "(self, left=True, right=True)")]
pub struct PyStrip {}
#[pymethods]
impl PyStrip {
@ -409,7 +407,7 @@ impl PyStrip {
}
#[new]
#[pyo3(signature = (left = true, right = true))]
#[pyo3(signature = (left = true, right = true), text_signature = "(self, left=True, right=True)")]
fn new(left: bool, right: bool) -> (Self, PyNormalizer) {
(PyStrip {}, Strip::new(left, right).into())
}
@ -417,7 +415,6 @@ impl PyStrip {
/// Prepend normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Prepend")]
#[pyo3(text_signature = "(self, prepend)")]
pub struct PyPrepend {}
#[pymethods]
impl PyPrepend {
@ -432,7 +429,7 @@ impl PyPrepend {
}
#[new]
#[pyo3(signature = (prepend="".to_string()))]
#[pyo3(signature = (prepend="".to_string()), text_signature = "(self, prepend)")]
fn new(prepend: String) -> (Self, PyNormalizer) {
(PyPrepend {}, Prepend::new(prepend).into())
}
@ -440,11 +437,11 @@ impl PyPrepend {
/// StripAccents normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")]
#[pyo3(text_signature = "(self)")]
pub struct PyStripAccents {}
#[pymethods]
impl PyStripAccents {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyNormalizer) {
(PyStripAccents {}, StripAccents.into())
}
@ -452,11 +449,11 @@ impl PyStripAccents {
/// Nmt normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Nmt")]
#[pyo3(text_signature = "(self)")]
pub struct PyNmt {}
#[pymethods]
impl PyNmt {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyNormalizer) {
(PyNmt {}, Nmt.into())
}
@ -465,11 +462,11 @@ impl PyNmt {
/// Precompiled normalizer
/// Don't use manually it is used for compatiblity for SentencePiece.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
#[pyo3(text_signature = "(self, precompiled_charsmap)")]
pub struct PyPrecompiled {}
#[pymethods]
impl PyPrecompiled {
#[new]
#[pyo3(text_signature = "(self, precompiled_charsmap)")]
fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
Ok((
@ -488,11 +485,11 @@ impl PyPrecompiled {
/// Replace normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Replace")]
#[pyo3(text_signature = "(self, pattern, content)")]
pub struct PyReplace {}
#[pymethods]
impl PyReplace {
#[new]
#[pyo3(text_signature = "(self, pattern, content)")]
fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyNormalizer)> {
Ok((
PyReplace {},

@ -235,7 +235,6 @@ macro_rules! setter {
/// Set this to :obj:`False` to prevent this `pre_tokenizer` from using
/// the GPT2 specific regexp for spliting on whitespace.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "ByteLevel")]
#[pyo3(text_signature = "(self, add_prefix_space=True, use_regex=True)")]
pub struct PyByteLevel {}
#[pymethods]
impl PyByteLevel {
@ -260,7 +259,7 @@ impl PyByteLevel {
}
#[new]
#[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs))]
#[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs), text_signature = "(self, add_prefix_space=True, use_regex=True)")]
fn new(
add_prefix_space: bool,
use_regex: bool,
@ -295,11 +294,11 @@ impl PyByteLevel {
/// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Whitespace")]
#[pyo3(text_signature = "(self)")]
pub struct PyWhitespace {}
#[pymethods]
impl PyWhitespace {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyPreTokenizer) {
(PyWhitespace {}, Whitespace {}.into())
}
@ -307,11 +306,11 @@ impl PyWhitespace {
/// This pre-tokenizer simply splits on the whitespace. Works like `.split()`
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "WhitespaceSplit")]
#[pyo3(text_signature = "(self)")]
pub struct PyWhitespaceSplit {}
#[pymethods]
impl PyWhitespaceSplit {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyPreTokenizer) {
(PyWhitespaceSplit {}, WhitespaceSplit.into())
}
@ -335,12 +334,11 @@ impl PyWhitespaceSplit {
/// invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
/// Whether to invert the pattern.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Split")]
#[pyo3(text_signature = "(self, pattern, behavior, invert=False)")]
pub struct PySplit {}
#[pymethods]
impl PySplit {
#[new]
#[pyo3(signature = (pattern, behavior, invert = false))]
#[pyo3(signature = (pattern, behavior, invert = false), text_signature = "(self, pattern, behavior, invert=False)")]
fn new(
pattern: PyPattern,
behavior: PySplitDelimiterBehavior,
@ -379,6 +377,7 @@ impl PyCharDelimiterSplit {
}
#[new]
#[pyo3(text_signature = None)]
pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
Ok((
PyCharDelimiterSplit {},
@ -396,11 +395,11 @@ impl PyCharDelimiterSplit {
/// This pre-tokenizer splits tokens on spaces, and also on punctuation.
/// Each occurence of a punctuation character will be treated separately.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
#[pyo3(text_signature = "(self)")]
pub struct PyBertPreTokenizer {}
#[pymethods]
impl PyBertPreTokenizer {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyPreTokenizer) {
(PyBertPreTokenizer {}, BertPreTokenizer.into())
}
@ -414,12 +413,11 @@ impl PyBertPreTokenizer {
/// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
/// "contiguous"
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Punctuation")]
#[pyo3(text_signature = "(self, behavior=\"isolated\")")]
pub struct PyPunctuation {}
#[pymethods]
impl PyPunctuation {
#[new]
#[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)))]
#[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)), text_signature = "(self, behavior=\"isolated\")")]
fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
(PyPunctuation {}, Punctuation::new(behavior.into()).into())
}
@ -427,11 +425,11 @@ impl PyPunctuation {
/// This pre-tokenizer composes other pre_tokenizers and applies them in sequence
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Sequence")]
#[pyo3(text_signature = "(self, pretokenizers)")]
pub struct PySequence {}
#[pymethods]
impl PySequence {
#[new]
#[pyo3(text_signature = "(self, pretokenizers)")]
fn new(pre_tokenizers: &PyList) -> PyResult<(Self, PyPreTokenizer)> {
let mut sequence = Vec::with_capacity(pre_tokenizers.len());
for n in pre_tokenizers.iter() {
@ -468,7 +466,6 @@ impl PySequence {
/// Whether to add a space to the first word if there isn't already one. This
/// lets us treat `hello` exactly like `say hello`.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")]
#[pyo3(text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
pub struct PyMetaspace {}
#[pymethods]
impl PyMetaspace {
@ -493,7 +490,7 @@ impl PyMetaspace {
}
#[new]
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs))]
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
fn new(
replacement: PyChar,
add_prefix_space: bool,
@ -518,7 +515,6 @@ impl PyMetaspace {
///
/// "Call 123 please" -> "Call ", "123", " please"
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Digits")]
#[pyo3(text_signature = "(self, individual_digits=False)")]
pub struct PyDigits {}
#[pymethods]
impl PyDigits {
@ -533,7 +529,7 @@ impl PyDigits {
}
#[new]
#[pyo3(signature = (individual_digits = false))]
#[pyo3(signature = (individual_digits = false), text_signature = "(self, individual_digits=False)")]
fn new(individual_digits: bool) -> (Self, PyPreTokenizer) {
(PyDigits {}, Digits::new(individual_digits).into())
}
@ -544,11 +540,11 @@ impl PyDigits {
/// Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
/// This mimicks SentencePiece Unigram implementation.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "UnicodeScripts")]
#[pyo3(text_signature = "(self)")]
pub struct PyUnicodeScripts {}
#[pymethods]
impl PyUnicodeScripts {
#[new]
#[pyo3(text_signature = "(self)")]
fn new() -> (Self, PyPreTokenizer) {
(PyUnicodeScripts {}, UnicodeScripts::new().into())
}

@ -154,11 +154,11 @@ impl PyPostProcessor {
/// cls (:obj:`Tuple[str, int]`):
/// A tuple with the string representation of the CLS token, and its id
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "BertProcessing")]
#[pyo3(text_signature = "(self, sep, cls)")]
pub struct PyBertProcessing {}
#[pymethods]
impl PyBertProcessing {
#[new]
#[pyo3(text_signature = "(self, sep, cls)")]
fn new(sep: (String, u32), cls: (String, u32)) -> (Self, PyPostProcessor) {
(
PyBertProcessing {},
@ -196,12 +196,11 @@ impl PyBertProcessing {
/// Whether the add_prefix_space option was enabled during pre-tokenization. This
/// is relevant because it defines the way the offsets are trimmed out.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "RobertaProcessing")]
#[pyo3(text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")]
pub struct PyRobertaProcessing {}
#[pymethods]
impl PyRobertaProcessing {
#[new]
#[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true))]
#[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true), text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")]
fn new(
sep: (String, u32),
cls: (String, u32),
@ -231,12 +230,11 @@ impl PyRobertaProcessing {
/// trim_offsets (:obj:`bool`):
/// Whether to trim the whitespaces from the produced offsets.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
#[pyo3(text_signature = "(self, trim_offsets=True)")]
pub struct PyByteLevel {}
#[pymethods]
impl PyByteLevel {
#[new]
#[pyo3(signature = (trim_offsets = None, **_kwargs))]
#[pyo3(signature = (trim_offsets = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
fn new(trim_offsets: Option<bool>, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) {
let mut byte_level = ByteLevel::default();
@ -383,12 +381,11 @@ impl FromPyObject<'_> for PyTemplate {
/// The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
/// the same length.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "TemplateProcessing")]
#[pyo3(text_signature = "(self, single, pair, special_tokens)")]
pub struct PyTemplateProcessing {}
#[pymethods]
impl PyTemplateProcessing {
#[new]
#[pyo3(signature = (single = None, pair = None, special_tokens = None))]
#[pyo3(signature = (single = None, pair = None, special_tokens = None), text_signature = "(self, single, pair, special_tokens)")]
fn new(
single: Option<PyTemplate>,
pair: Option<PyTemplate>,
@ -422,12 +419,11 @@ impl PyTemplateProcessing {
/// processors (:obj:`List[PostProcessor]`)
/// The processors that need to be chained
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "Sequence")]
#[pyo3(text_signature = "(self, processors)")]
pub struct PySequence {}
#[pymethods]
impl PySequence {
#[new]
#[pyo3(signature = (processors_py))]
#[pyo3(signature = (processors_py), text_signature = "(self, processors)")]
fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
for n in processors_py.iter() {

@ -20,6 +20,7 @@ impl From<PyToken> for Token {
#[pymethods]
impl PyToken {
#[new]
#[pyo3(text_signature = None)]
fn new(id: u32, value: String, offsets: (usize, usize)) -> PyToken {
Token::new(id, value, offsets).into()
}

@ -56,9 +56,6 @@ use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
/// Yesterday"``.
///
#[pyclass(dict, module = "tokenizers", name = "AddedToken")]
#[pyo3(
text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"
)]
pub struct PyAddedToken {
pub content: String,
pub is_special_token: bool,
@ -128,7 +125,7 @@ impl From<tk::AddedToken> for PyAddedToken {
#[pymethods]
impl PyAddedToken {
#[new]
#[pyo3(signature = (content=None, **kwargs))]
#[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")]
fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
let mut token = PyAddedToken::from(content.unwrap_or(""), None);
@ -441,7 +438,6 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
/// The core algorithm that this :obj:`Tokenizer` should be using.
///
#[pyclass(dict, module = "tokenizers", name = "Tokenizer")]
#[pyo3(text_signature = "(self, model)")]
#[derive(Clone)]
pub struct PyTokenizer {
tokenizer: Tokenizer,
@ -460,6 +456,7 @@ impl PyTokenizer {
#[pymethods]
impl PyTokenizer {
#[new]
#[pyo3(text_signature = "(self, model)")]
fn __new__(model: PyRef<PyModel>) -> Self {
PyTokenizer::from_model(model.clone())
}

@ -299,7 +299,7 @@ impl PyBpeTrainer {
}
#[new]
#[pyo3(signature = (**kwargs))]
#[pyo3(signature = (**kwargs), text_signature = None)]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
let mut builder = tk::models::bpe::BpeTrainer::builder();
if let Some(kwargs) = kwargs {
@ -383,9 +383,6 @@ impl PyBpeTrainer {
/// end_of_word_suffix (:obj:`str`, `optional`):
/// A suffix to be used for every subword that is a end-of-word.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
#[pyo3(
text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
)]
pub struct PyWordPieceTrainer {}
#[pymethods]
impl PyWordPieceTrainer {
@ -506,7 +503,10 @@ impl PyWordPieceTrainer {
}
#[new]
#[pyo3(signature = (** kwargs))]
#[pyo3(
signature = (** kwargs),
text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
)]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
if let Some(kwargs) = kwargs {
@ -646,7 +646,7 @@ impl PyWordLevelTrainer {
}
#[new]
#[pyo3(signature = (**kwargs))]
#[pyo3(signature = (**kwargs), text_signature = None)]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
@ -731,9 +731,6 @@ impl PyWordLevelTrainer {
/// The number of iterations of the EM algorithm to perform before
/// pruning the vocabulary.
#[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
#[pyo3(
text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
)]
pub struct PyUnigramTrainer {}
#[pymethods]
impl PyUnigramTrainer {
@ -814,7 +811,10 @@ impl PyUnigramTrainer {
}
#[new]
#[pyo3(signature = (**kwargs))]
#[pyo3(
signature = (**kwargs),
text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
)]
pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
let mut builder = tk::models::unigram::UnigramTrainer::builder();
if let Some(kwargs) = kwargs {

@ -200,6 +200,7 @@ pub struct PyNormalizedString {
#[pymethods]
impl PyNormalizedString {
#[new]
#[pyo3(text_signature = None)]
fn new(s: &str) -> Self {
NormalizedString::from(s).into()
}

@ -148,7 +148,6 @@ fn to_encoding(
/// sequence: str:
/// The string sequence used to initialize this PreTokenizedString
#[pyclass(module = "tokenizers", name = "PreTokenizedString")]
#[pyo3(text_signature = "(self, sequence)")]
pub struct PyPreTokenizedString {
pub(crate) pretok: tk::PreTokenizedString,
}
@ -168,6 +167,7 @@ impl From<PyPreTokenizedString> for PreTokenizedString {
#[pymethods]
impl PyPreTokenizedString {
#[new]
#[pyo3(text_signature = "(self, sequence)")]
fn new(s: &str) -> Self {
PreTokenizedString::from(s).into()
}

@ -4,7 +4,6 @@ use pyo3::prelude::*;
/// Instantiate a new Regex with the given pattern
#[pyclass(module = "tokenizers", name = "Regex")]
#[pyo3(text_signature = "(self, pattern)")]
pub struct PyRegex {
pub inner: Regex,
pub pattern: String,
@ -13,6 +12,7 @@ pub struct PyRegex {
#[pymethods]
impl PyRegex {
#[new]
#[pyo3(text_signature = "(self, pattern)")]
fn new(s: &str) -> PyResult<Self> {
Ok(Self {
inner: Regex::new(s)