pyo3: update to 0.19 (#1322)

* Bump pyo3 dependency versions

* Fix deprecation warnings from pyo3

---------

Co-authored-by: Mike Lui <mikelui@meta.com>
Author: Michael Lui
Date: 2023-08-16 12:40:32 -04:00 (committed by GitHub)
Parent: 9a93c50c25
Commit: 540bf2eb01

13 changed files with 68 additions and 89 deletions
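
The change repeated throughout the hunks below is the pyo3 0.19 migration for constructor signatures: the `#[pyo3(text_signature = ...)]` attribute is no longer placed on the `#[pyclass]` struct but on the `#[new]` method, usually merged into its `#[pyo3(signature = ...)]` attribute, and set to `None` for kwargs-only constructors where no useful signature can be shown. A minimal sketch of the pattern follows; the `Example` class and its fields are illustrative only and not part of this commit.

use pyo3::prelude::*;

// Before (pyo3 0.18), a constructor's text_signature was attached to the struct:
//
//     #[pyclass(name = "Example")]
//     #[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")]
//     pub struct Example {}
//
// pyo3 0.19 deprecates that placement: the text_signature moves onto the
// #[new] method, next to the runtime signature (or `text_signature = None`
// for **kwargs-only constructors, as in the trainers below).
#[pyclass(name = "Example")]
pub struct Example {
    prefix: String,
    cleanup: bool,
}

#[pymethods]
impl Example {
    #[new]
    #[pyo3(
        signature = (prefix = String::from("##"), cleanup = true),
        text_signature = "(self, prefix=\"##\", cleanup=True)"
    )]
    fn new(prefix: String, cleanup: bool) -> Self {
        Example { prefix, cleanup }
    }
}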


@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.7.1"
-pyo3 = "0.18.1"
-numpy = "0.18.0"
+pyo3 = "0.19"
+numpy = "0.19.0"
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
 itertools = "0.9"
@@ -26,7 +26,7 @@ path = "../../tokenizers"
 [dev-dependencies]
 tempfile = "3.1"
-pyo3 = { version = "0.18.1", features = ["auto-initialize"] }
+pyo3 = { version = "0.19", features = ["auto-initialize"] }
 [features]
 default = ["pyo3/extension-module"]


@@ -155,12 +155,11 @@ macro_rules! setter {
 /// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
 /// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteLevel")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyByteLevelDec {}
 #[pymethods]
 impl PyByteLevelDec {
     #[new]
-    #[pyo3(signature = (**_kwargs))]
+    #[pyo3(signature = (**_kwargs), text_signature = "(self)")]
     fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) {
         (PyByteLevelDec {}, ByteLevel::default().into())
     }
@@ -171,11 +170,11 @@ impl PyByteLevelDec {
 /// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
 /// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Replace")]
-#[pyo3(text_signature = "(self, pattern, content)")]
 pub struct PyReplaceDec {}
 #[pymethods]
 impl PyReplaceDec {
     #[new]
+    #[pyo3(text_signature = "(self, pattern, content)")]
     fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyDecoder)> {
         Ok((
             PyReplaceDec {},
@@ -194,7 +193,6 @@ impl PyReplaceDec {
 /// Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
 /// and some abbreviated english forms.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "WordPiece")]
-#[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")]
 pub struct PyWordPieceDec {}
 #[pymethods]
 impl PyWordPieceDec {
@@ -219,7 +217,7 @@
     }
     #[new]
-    #[pyo3(signature = (prefix = String::from("##"), cleanup = true))]
+    #[pyo3(signature = (prefix = String::from("##"), cleanup = true), text_signature = "(self, prefix=\"##\", cleanup=True)")]
     fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) {
         (PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into())
     }
@@ -231,12 +229,11 @@ impl PyWordPieceDec {
 /// cannot be decoded you will get � instead for each inconvertable byte token
 ///
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyByteFallbackDec {}
 #[pymethods]
 impl PyByteFallbackDec {
     #[new]
-    #[pyo3(signature = ())]
+    #[pyo3(signature = (), text_signature = "(self)")]
     fn new() -> (Self, PyDecoder) {
         (PyByteFallbackDec {}, ByteFallback::new().into())
     }
@@ -247,12 +244,11 @@ impl PyByteFallbackDec {
 /// This is the last step of decoding, this decoder exists only if
 /// there is need to add other decoders *after* the fusion
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Fuse")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyFuseDec {}
 #[pymethods]
 impl PyFuseDec {
     #[new]
-    #[pyo3(signature = ())]
+    #[pyo3(signature = (), text_signature = "(self)")]
     fn new() -> (Self, PyDecoder) {
         (PyFuseDec {}, Fuse::new().into())
     }
@@ -261,7 +257,6 @@ impl PyFuseDec {
 /// Strip normalizer
 /// Strips n left characters of each token, or n right characters of each token
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Strip")]
-#[pyo3(text_signature = "(self, content, left=0, right=0)")]
 pub struct PyStrip {}
 #[pymethods]
 impl PyStrip {
@@ -296,7 +291,7 @@
     }
     #[new]
-    #[pyo3(signature = (content=' ', left=0, right=0))]
+    #[pyo3(signature = (content=' ', left=0, right=0), text_signature = "(self, content, left=0, right=0)")]
     fn new(content: char, left: usize, right: usize) -> (Self, PyDecoder) {
         (PyStrip {}, Strip::new(content, left, right).into())
     }
@@ -313,7 +308,6 @@ impl PyStrip {
 /// Whether to add a space to the first word if there isn't already one. This
 /// lets us treat `hello` exactly like `say hello`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")]
-#[pyo3(text_signature = "(self, replacement = \"\", add_prefix_space = True)")]
 pub struct PyMetaspaceDec {}
 #[pymethods]
 impl PyMetaspaceDec {
@@ -338,7 +332,7 @@
     }
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true))]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true), text_signature = "(self, replacement = \"\", add_prefix_space = True)")]
     fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) {
         (
             PyMetaspaceDec {},
@@ -354,7 +348,6 @@ impl PyMetaspaceDec {
 /// The suffix that was used to caracterize an end-of-word. This suffix will
 /// be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
-#[pyo3(text_signature = "(self, suffix=\"</w>\")")]
 pub struct PyBPEDecoder {}
 #[pymethods]
 impl PyBPEDecoder {
@@ -369,7 +362,7 @@
     }
     #[new]
-    #[pyo3(signature = (suffix = String::from("</w>")))]
+    #[pyo3(signature = (suffix = String::from("</w>")), text_signature = "(self, suffix=\"</w>\")")]
     fn new(suffix: String) -> (Self, PyDecoder) {
         (PyBPEDecoder {}, BPEDecoder::new(suffix).into())
     }
@@ -386,7 +379,6 @@ impl PyBPEDecoder {
 /// Whether to cleanup some tokenization artifacts.
 /// Mainly spaces before punctuation, and some abbreviated english forms.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "CTC")]
-#[pyo3(text_signature = "(self, pad_token=\"<pad>\", word_delimiter_token=\"|\", cleanup=True)")]
 pub struct PyCTCDecoder {}
 #[pymethods]
 impl PyCTCDecoder {
@@ -425,7 +417,8 @@
         pad_token = String::from("<pad>"),
         word_delimiter_token = String::from("|"),
         cleanup = true
-    ))]
+    ),
+    text_signature = "(self, pad_token=\"<pad>\", word_delimiter_token=\"|\", cleanup=True)")]
     fn new(pad_token: String, word_delimiter_token: String, cleanup: bool) -> (Self, PyDecoder) {
         (
             PyCTCDecoder {},
@@ -440,12 +433,11 @@ impl PyCTCDecoder {
 /// decoders (:obj:`List[Decoder]`)
 /// The decoders that need to be chained
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name="Sequence")]
-#[pyo3(text_signature = "(self, decoders)")]
 pub struct PySequenceDecoder {}
 #[pymethods]
 impl PySequenceDecoder {
     #[new]
-    #[pyo3(signature = (decoders_py))]
+    #[pyo3(signature = (decoders_py), text_signature = "(self, decoders)")]
     fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> {
         let mut decoders: Vec<DecoderWrapper> = Vec::with_capacity(decoders_py.len());
         for decoder_py in decoders_py.iter() {


@@ -23,6 +23,7 @@ impl From<tk::tokenizer::Encoding> for PyEncoding {
 #[pymethods]
 impl PyEncoding {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new() -> Self {
         Self {
             encoding: tk::tokenizer::Encoding::default(),


@@ -89,6 +89,7 @@ where
 #[pymethods]
 impl PyModel {
     #[new]
+    #[pyo3(text_signature = None)]
     fn __new__() -> Self {
         // Instantiate a default empty model. This doesn't really make sense, but we need
         // to be able to instantiate an empty model for pickle capabilities.
@@ -253,9 +254,6 @@ impl PyModel {
 /// byte_fallback (:obj:`bool`, `optional`):
 /// Whether to use spm byte-fallback trick (defaults to False)
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")]
-#[pyo3(
-    text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)"
-)]
 pub struct PyBPE {}
 impl PyBPE {
@@ -400,7 +398,9 @@ impl PyBPE {
     }
     #[new]
-    #[pyo3(signature = (vocab=None, merges=None, **kwargs))]
+    #[pyo3(
+        signature = (vocab=None, merges=None, **kwargs),
+        text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -523,7 +523,6 @@ impl PyBPE {
 /// max_input_chars_per_word (:obj:`int`, `optional`):
 /// The maximum number of characters to authorize in a single word.
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordPiece")]
-#[pyo3(text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
 pub struct PyWordPiece {}
 impl PyWordPiece {
@@ -597,7 +596,7 @@ impl PyWordPiece {
     }
     #[new]
-    #[pyo3(signature = (vocab=None, **kwargs))]
+    #[pyo3(signature = (vocab=None, **kwargs), text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -692,7 +691,6 @@ impl PyWordPiece {
 /// unk_token (:obj:`str`, `optional`):
 /// The unknown token to be used by the model.
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordLevel")]
-#[pyo3(text_signature = "(self, vocab, unk_token)")]
 pub struct PyWordLevel {}
 #[pymethods]
@@ -708,7 +706,7 @@ impl PyWordLevel {
     }
     #[new]
-    #[pyo3(signature = (vocab=None, unk_token = None))]
+    #[pyo3(signature = (vocab=None, unk_token = None), text_signature = "(self, vocab, unk_token)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -807,12 +805,12 @@ impl PyWordLevel {
 /// vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
 /// A list of vocabulary items and their relative score [("am", -0.2442),...]
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")]
-#[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
 pub struct PyUnigram {}
 #[pymethods]
 impl PyUnigram {
     #[new]
+    #[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
     fn new(
         vocab: Option<Vec<(String, f64)>>,
         unk_id: Option<usize>,


@@ -217,9 +217,6 @@ macro_rules! setter {
 /// lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
 /// Whether to lowercase.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")]
-#[pyo3(
-    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"
-)]
 pub struct PyBertNormalizer {}
 #[pymethods]
 impl PyBertNormalizer {
@@ -274,7 +271,8 @@ impl PyBertNormalizer {
         handle_chinese_chars = true,
         strip_accents = None,
         lowercase = true
-    ))]
+    ),
+    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)")]
     fn new(
         clean_text: bool,
         handle_chinese_chars: bool,
@@ -289,11 +287,11 @@ impl PyBertNormalizer {
 /// NFD Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFD")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFD {}
 #[pymethods]
 impl PyNFD {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFD {}, PyNormalizer::new(NFD.into()))
     }
@@ -301,11 +299,11 @@ impl PyNFD {
 /// NFKD Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKD")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFKD {}
 #[pymethods]
 impl PyNFKD {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFKD {}, NFKD.into())
     }
@@ -313,11 +311,11 @@ impl PyNFKD {
 /// NFC Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFC")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFC {}
 #[pymethods]
 impl PyNFC {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFC {}, NFC.into())
     }
@@ -325,11 +323,11 @@ impl PyNFC {
 /// NFKC Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKC")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFKC {}
 #[pymethods]
 impl PyNFKC {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFKC {}, NFKC.into())
     }
@@ -346,6 +344,7 @@ pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> {
         let mut sequence = Vec::with_capacity(normalizers.len());
         for n in normalizers.iter() {
@@ -372,11 +371,11 @@ impl PySequence {
 /// Lowercase Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Lowercase")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyLowercase {}
 #[pymethods]
 impl PyLowercase {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyLowercase {}, Lowercase.into())
     }
@@ -384,7 +383,6 @@ impl PyLowercase {
 /// Strip normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Strip")]
-#[pyo3(text_signature = "(self, left=True, right=True)")]
 pub struct PyStrip {}
 #[pymethods]
 impl PyStrip {
@@ -409,7 +407,7 @@ impl PyStrip {
     }
     #[new]
-    #[pyo3(signature = (left = true, right = true))]
+    #[pyo3(signature = (left = true, right = true), text_signature = "(self, left=True, right=True)")]
     fn new(left: bool, right: bool) -> (Self, PyNormalizer) {
         (PyStrip {}, Strip::new(left, right).into())
     }
@@ -417,7 +415,6 @@ impl PyStrip {
 /// Prepend normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Prepend")]
-#[pyo3(text_signature = "(self, prepend)")]
 pub struct PyPrepend {}
 #[pymethods]
 impl PyPrepend {
@@ -432,7 +429,7 @@ impl PyPrepend {
     }
     #[new]
-    #[pyo3(signature = (prepend="".to_string()))]
+    #[pyo3(signature = (prepend="".to_string()), text_signature = "(self, prepend)")]
     fn new(prepend: String) -> (Self, PyNormalizer) {
         (PyPrepend {}, Prepend::new(prepend).into())
     }
@@ -440,11 +437,11 @@ impl PyPrepend {
 /// StripAccents normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyStripAccents {}
 #[pymethods]
 impl PyStripAccents {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyStripAccents {}, StripAccents.into())
     }
@@ -452,11 +449,11 @@ impl PyStripAccents {
 /// Nmt normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Nmt")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNmt {}
 #[pymethods]
 impl PyNmt {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNmt {}, Nmt.into())
     }
@@ -465,11 +462,11 @@ impl PyNmt {
 /// Precompiled normalizer
 /// Don't use manually it is used for compatiblity for SentencePiece.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
-#[pyo3(text_signature = "(self, precompiled_charsmap)")]
 pub struct PyPrecompiled {}
 #[pymethods]
 impl PyPrecompiled {
     #[new]
+    #[pyo3(text_signature = "(self, precompiled_charsmap)")]
     fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
         let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
         Ok((
@@ -488,11 +485,11 @@ impl PyPrecompiled {
 /// Replace normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Replace")]
-#[pyo3(text_signature = "(self, pattern, content)")]
 pub struct PyReplace {}
 #[pymethods]
 impl PyReplace {
     #[new]
+    #[pyo3(text_signature = "(self, pattern, content)")]
     fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyNormalizer)> {
         Ok((
             PyReplace {},


@@ -235,7 +235,6 @@ macro_rules! setter {
 /// Set this to :obj:`False` to prevent this `pre_tokenizer` from using
 /// the GPT2 specific regexp for spliting on whitespace.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "ByteLevel")]
-#[pyo3(text_signature = "(self, add_prefix_space=True, use_regex=True)")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
@@ -260,7 +259,7 @@ impl PyByteLevel {
     }
     #[new]
-    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs))]
+    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs), text_signature = "(self, add_prefix_space=True, use_regex=True)")]
     fn new(
         add_prefix_space: bool,
         use_regex: bool,
@@ -295,11 +294,11 @@ impl PyByteLevel {
 /// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Whitespace")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyWhitespace {}
 #[pymethods]
 impl PyWhitespace {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyWhitespace {}, Whitespace {}.into())
     }
@@ -307,11 +306,11 @@ impl PyWhitespace {
 /// This pre-tokenizer simply splits on the whitespace. Works like `.split()`
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "WhitespaceSplit")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyWhitespaceSplit {}
 #[pymethods]
 impl PyWhitespaceSplit {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyWhitespaceSplit {}, WhitespaceSplit.into())
     }
@@ -335,12 +334,11 @@ impl PyWhitespaceSplit {
 /// invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
 /// Whether to invert the pattern.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Split")]
-#[pyo3(text_signature = "(self, pattern, behavior, invert=False)")]
 pub struct PySplit {}
 #[pymethods]
 impl PySplit {
     #[new]
-    #[pyo3(signature = (pattern, behavior, invert = false))]
+    #[pyo3(signature = (pattern, behavior, invert = false), text_signature = "(self, pattern, behavior, invert=False)")]
     fn new(
         pattern: PyPattern,
         behavior: PySplitDelimiterBehavior,
@@ -379,6 +377,7 @@ impl PyCharDelimiterSplit {
     }
     #[new]
+    #[pyo3(text_signature = None)]
     pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyCharDelimiterSplit {},
@@ -396,11 +395,11 @@ impl PyCharDelimiterSplit {
 /// This pre-tokenizer splits tokens on spaces, and also on punctuation.
 /// Each occurence of a punctuation character will be treated separately.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyBertPreTokenizer {}
 #[pymethods]
 impl PyBertPreTokenizer {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyBertPreTokenizer {}, BertPreTokenizer.into())
     }
@@ -414,12 +413,11 @@ impl PyBertPreTokenizer {
 /// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
 /// "contiguous"
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Punctuation")]
-#[pyo3(text_signature = "(self, behavior=\"isolated\")")]
 pub struct PyPunctuation {}
 #[pymethods]
 impl PyPunctuation {
     #[new]
-    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)))]
+    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)), text_signature = "(self, behavior=\"isolated\")")]
     fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
         (PyPunctuation {}, Punctuation::new(behavior.into()).into())
     }
@@ -427,11 +425,11 @@ impl PyPunctuation {
 /// This pre-tokenizer composes other pre_tokenizers and applies them in sequence
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Sequence")]
-#[pyo3(text_signature = "(self, pretokenizers)")]
 pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
+    #[pyo3(text_signature = "(self, pretokenizers)")]
     fn new(pre_tokenizers: &PyList) -> PyResult<(Self, PyPreTokenizer)> {
         let mut sequence = Vec::with_capacity(pre_tokenizers.len());
         for n in pre_tokenizers.iter() {
@@ -468,7 +466,6 @@ impl PySequence {
 /// Whether to add a space to the first word if there isn't already one. This
 /// lets us treat `hello` exactly like `say hello`.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")]
-#[pyo3(text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
 pub struct PyMetaspace {}
 #[pymethods]
 impl PyMetaspace {
@@ -493,7 +490,7 @@ impl PyMetaspace {
     }
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs))]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
     fn new(
         replacement: PyChar,
         add_prefix_space: bool,
@@ -518,7 +515,6 @@ impl PyMetaspace {
 ///
 /// "Call 123 please" -> "Call ", "123", " please"
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Digits")]
-#[pyo3(text_signature = "(self, individual_digits=False)")]
 pub struct PyDigits {}
 #[pymethods]
 impl PyDigits {
@@ -533,7 +529,7 @@ impl PyDigits {
     }
     #[new]
-    #[pyo3(signature = (individual_digits = false))]
+    #[pyo3(signature = (individual_digits = false), text_signature = "(self, individual_digits=False)")]
     fn new(individual_digits: bool) -> (Self, PyPreTokenizer) {
         (PyDigits {}, Digits::new(individual_digits).into())
     }
@@ -544,11 +540,11 @@ impl PyDigits {
 /// Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
 /// This mimicks SentencePiece Unigram implementation.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "UnicodeScripts")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyUnicodeScripts {}
 #[pymethods]
 impl PyUnicodeScripts {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyUnicodeScripts {}, UnicodeScripts::new().into())
     }


@@ -154,11 +154,11 @@ impl PyPostProcessor {
 /// cls (:obj:`Tuple[str, int]`):
 /// A tuple with the string representation of the CLS token, and its id
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "BertProcessing")]
-#[pyo3(text_signature = "(self, sep, cls)")]
 pub struct PyBertProcessing {}
 #[pymethods]
 impl PyBertProcessing {
     #[new]
+    #[pyo3(text_signature = "(self, sep, cls)")]
     fn new(sep: (String, u32), cls: (String, u32)) -> (Self, PyPostProcessor) {
         (
             PyBertProcessing {},
@@ -196,12 +196,11 @@ impl PyBertProcessing {
 /// Whether the add_prefix_space option was enabled during pre-tokenization. This
 /// is relevant because it defines the way the offsets are trimmed out.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "RobertaProcessing")]
-#[pyo3(text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")]
 pub struct PyRobertaProcessing {}
 #[pymethods]
 impl PyRobertaProcessing {
     #[new]
-    #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true))]
+    #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true), text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")]
     fn new(
         sep: (String, u32),
         cls: (String, u32),
@@ -231,12 +230,11 @@ impl PyRobertaProcessing {
 /// trim_offsets (:obj:`bool`):
 /// Whether to trim the whitespaces from the produced offsets.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
-#[pyo3(text_signature = "(self, trim_offsets=True)")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[pyo3(signature = (trim_offsets = None, **_kwargs))]
+    #[pyo3(signature = (trim_offsets = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
     fn new(trim_offsets: Option<bool>, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) {
         let mut byte_level = ByteLevel::default();
@@ -383,12 +381,11 @@ impl FromPyObject<'_> for PyTemplate {
 /// The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
 /// the same length.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "TemplateProcessing")]
-#[pyo3(text_signature = "(self, single, pair, special_tokens)")]
 pub struct PyTemplateProcessing {}
 #[pymethods]
 impl PyTemplateProcessing {
     #[new]
-    #[pyo3(signature = (single = None, pair = None, special_tokens = None))]
+    #[pyo3(signature = (single = None, pair = None, special_tokens = None), text_signature = "(self, single, pair, special_tokens)")]
     fn new(
         single: Option<PyTemplate>,
         pair: Option<PyTemplate>,
@@ -422,12 +419,11 @@ impl PyTemplateProcessing {
 /// processors (:obj:`List[PostProcessor]`)
 /// The processors that need to be chained
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "Sequence")]
-#[pyo3(text_signature = "(self, processors)")]
 pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
-    #[pyo3(signature = (processors_py))]
+    #[pyo3(signature = (processors_py), text_signature = "(self, processors)")]
     fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
         let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
         for n in processors_py.iter() {


@@ -20,6 +20,7 @@ impl From<PyToken> for Token {
 #[pymethods]
 impl PyToken {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(id: u32, value: String, offsets: (usize, usize)) -> PyToken {
         Token::new(id, value, offsets).into()
     }


@@ -56,9 +56,6 @@ use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
 /// Yesterday"``.
 ///
 #[pyclass(dict, module = "tokenizers", name = "AddedToken")]
-#[pyo3(
-    text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"
-)]
 pub struct PyAddedToken {
     pub content: String,
     pub is_special_token: bool,
@@ -128,7 +125,7 @@ impl From<tk::AddedToken> for PyAddedToken {
 #[pymethods]
 impl PyAddedToken {
     #[new]
-    #[pyo3(signature = (content=None, **kwargs))]
+    #[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")]
     fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
         let mut token = PyAddedToken::from(content.unwrap_or(""), None);
@@ -441,7 +438,6 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
 /// The core algorithm that this :obj:`Tokenizer` should be using.
 ///
 #[pyclass(dict, module = "tokenizers", name = "Tokenizer")]
-#[pyo3(text_signature = "(self, model)")]
 #[derive(Clone)]
 pub struct PyTokenizer {
     tokenizer: Tokenizer,
@@ -460,6 +456,7 @@ impl PyTokenizer {
 #[pymethods]
 impl PyTokenizer {
     #[new]
+    #[pyo3(text_signature = "(self, model)")]
     fn __new__(model: PyRef<PyModel>) -> Self {
         PyTokenizer::from_model(model.clone())
     }


@@ -299,7 +299,7 @@ impl PyBpeTrainer {
     }
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(signature = (**kwargs), text_signature = None)]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -383,9 +383,6 @@ impl PyBpeTrainer {
 /// end_of_word_suffix (:obj:`str`, `optional`):
 /// A suffix to be used for every subword that is a end-of-word.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
-#[pyo3(
-    text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
-)]
 pub struct PyWordPieceTrainer {}
 #[pymethods]
 impl PyWordPieceTrainer {
@@ -506,7 +503,10 @@ impl PyWordPieceTrainer {
     }
     #[new]
-    #[pyo3(signature = (** kwargs))]
+    #[pyo3(
+        signature = (** kwargs),
+        text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
+    )]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -646,7 +646,7 @@ impl PyWordLevelTrainer {
     }
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(signature = (**kwargs), text_signature = None)]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
@@ -731,9 +731,6 @@ impl PyWordLevelTrainer {
 /// The number of iterations of the EM algorithm to perform before
 /// pruning the vocabulary.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
-#[pyo3(
-    text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
-)]
 pub struct PyUnigramTrainer {}
 #[pymethods]
 impl PyUnigramTrainer {
@@ -814,7 +811,10 @@ impl PyUnigramTrainer {
     }
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(
+        signature = (**kwargs),
+        text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
+    )]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
         if let Some(kwargs) = kwargs {


@@ -200,6 +200,7 @@ pub struct PyNormalizedString {
 #[pymethods]
 impl PyNormalizedString {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(s: &str) -> Self {
         NormalizedString::from(s).into()
     }


@@ -148,7 +148,6 @@ fn to_encoding(
 /// sequence: str:
 /// The string sequence used to initialize this PreTokenizedString
 #[pyclass(module = "tokenizers", name = "PreTokenizedString")]
-#[pyo3(text_signature = "(self, sequence)")]
 pub struct PyPreTokenizedString {
     pub(crate) pretok: tk::PreTokenizedString,
 }
@@ -168,6 +167,7 @@ impl From<PyPreTokenizedString> for PreTokenizedString {
 #[pymethods]
 impl PyPreTokenizedString {
     #[new]
+    #[pyo3(text_signature = "(self, sequence)")]
     fn new(s: &str) -> Self {
         PreTokenizedString::from(s).into()
     }


@@ -4,7 +4,6 @@ use pyo3::prelude::*;
 /// Instantiate a new Regex with the given pattern
 #[pyclass(module = "tokenizers", name = "Regex")]
-#[pyo3(text_signature = "(self, pattern)")]
 pub struct PyRegex {
     pub inner: Regex,
     pub pattern: String,
@@ -13,6 +12,7 @@ pub struct PyRegex {
 #[pymethods]
 impl PyRegex {
     #[new]
+    #[pyo3(text_signature = "(self, pattern)")]
     fn new(s: &str) -> PyResult<Self> {
         Ok(Self {
             inner: Regex::new(s)