pyo3: update to 0.19 (#1322)
* Bump pyo3 dependency versions
* Fix deprecation warnings from pyo3

Co-authored-by: Mike Lui <mikelui@meta.com>
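The recurring fix in the diff below is the same in every file: the standalone `#[pyo3(text_signature = ...)]` attribute that used to sit on the `#[pyclass]` struct (the form pyo3 0.19 warns about, per the commit message) is dropped, and the text signature is instead declared on the `#[new]` constructor, merged into the same `#[pyo3(...)]` attribute as `signature`. A minimal sketch of the resulting shape under pyo3 0.19, using a made-up `PyExampleDec` class rather than anything from this crate:

use pyo3::prelude::*;
use pyo3::types::PyDict;

// Hypothetical decoder-style class, for illustration only.
#[pyclass(module = "tokenizers.decoders", name = "Example")]
pub struct PyExampleDec {}

#[pymethods]
impl PyExampleDec {
    #[new]
    // pyo3 0.19 style: `signature` and `text_signature` live in one attribute
    // on the constructor, instead of a separate attribute on the struct.
    #[pyo3(signature = (**_kwargs), text_signature = "(self)")]
    fn new(_kwargs: Option<&PyDict>) -> Self {
        PyExampleDec {}
    }
}

Constructors that should not advertise a Python signature get an explicit `#[pyo3(text_signature = None)]` instead, as shown in the sketch after the diff.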
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.7.1"
-pyo3 = "0.18.1"
-numpy = "0.18.0"
+pyo3 = "0.19"
+numpy = "0.19.0"
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
 itertools = "0.9"
@@ -26,7 +26,7 @@ path = "../../tokenizers"
 
 [dev-dependencies]
 tempfile = "3.1"
-pyo3 = { version = "0.18.1", features = ["auto-initialize"] }
+pyo3 = { version = "0.19", features = ["auto-initialize"] }
 
 [features]
 default = ["pyo3/extension-module"]
@@ -155,12 +155,11 @@ macro_rules! setter {
 /// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel`
 /// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteLevel")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyByteLevelDec {}
 #[pymethods]
 impl PyByteLevelDec {
     #[new]
-    #[pyo3(signature = (**_kwargs))]
+    #[pyo3(signature = (**_kwargs), text_signature = "(self)")]
     fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) {
         (PyByteLevelDec {}, ByteLevel::default().into())
     }
@@ -171,11 +170,11 @@ impl PyByteLevelDec {
 /// This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace`
 /// :class:`~tokenizers.pre_tokenizers.PreTokenizer`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Replace")]
-#[pyo3(text_signature = "(self, pattern, content)")]
 pub struct PyReplaceDec {}
 #[pymethods]
 impl PyReplaceDec {
     #[new]
+    #[pyo3(text_signature = "(self, pattern, content)")]
     fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyDecoder)> {
         Ok((
             PyReplaceDec {},
@@ -194,7 +193,6 @@ impl PyReplaceDec {
 /// Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
 /// and some abbreviated english forms.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "WordPiece")]
-#[pyo3(text_signature = "(self, prefix=\"##\", cleanup=True)")]
 pub struct PyWordPieceDec {}
 #[pymethods]
 impl PyWordPieceDec {
@@ -219,7 +217,7 @@ impl PyWordPieceDec {
     }
 
     #[new]
-    #[pyo3(signature = (prefix = String::from("##"), cleanup = true))]
+    #[pyo3(signature = (prefix = String::from("##"), cleanup = true), text_signature = "(self, prefix=\"##\", cleanup=True)")]
     fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) {
         (PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into())
     }
@@ -231,12 +229,11 @@ impl PyWordPieceDec {
 /// cannot be decoded you will get � instead for each inconvertable byte token
 ///
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyByteFallbackDec {}
 #[pymethods]
 impl PyByteFallbackDec {
     #[new]
-    #[pyo3(signature = ())]
+    #[pyo3(signature = (), text_signature = "(self)")]
     fn new() -> (Self, PyDecoder) {
         (PyByteFallbackDec {}, ByteFallback::new().into())
     }
@@ -247,12 +244,11 @@ impl PyByteFallbackDec {
 /// This is the last step of decoding, this decoder exists only if
 /// there is need to add other decoders *after* the fusion
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Fuse")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyFuseDec {}
 #[pymethods]
 impl PyFuseDec {
     #[new]
-    #[pyo3(signature = ())]
+    #[pyo3(signature = (), text_signature = "(self)")]
     fn new() -> (Self, PyDecoder) {
         (PyFuseDec {}, Fuse::new().into())
     }
@@ -261,7 +257,6 @@ impl PyFuseDec {
 /// Strip normalizer
 /// Strips n left characters of each token, or n right characters of each token
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Strip")]
-#[pyo3(text_signature = "(self, content, left=0, right=0)")]
 pub struct PyStrip {}
 #[pymethods]
 impl PyStrip {
@@ -296,7 +291,7 @@ impl PyStrip {
     }
 
     #[new]
-    #[pyo3(signature = (content=' ', left=0, right=0))]
+    #[pyo3(signature = (content=' ', left=0, right=0), text_signature = "(self, content, left=0, right=0)")]
     fn new(content: char, left: usize, right: usize) -> (Self, PyDecoder) {
         (PyStrip {}, Strip::new(content, left, right).into())
     }
@@ -313,7 +308,6 @@ impl PyStrip {
 /// Whether to add a space to the first word if there isn't already one. This
 /// lets us treat `hello` exactly like `say hello`.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Metaspace")]
-#[pyo3(text_signature = "(self, replacement = \"▁\", add_prefix_space = True)")]
 pub struct PyMetaspaceDec {}
 #[pymethods]
 impl PyMetaspaceDec {
@@ -338,7 +332,7 @@ impl PyMetaspaceDec {
     }
 
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true))]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true), text_signature = "(self, replacement = \"▁\", add_prefix_space = True)")]
     fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) {
         (
             PyMetaspaceDec {},
@@ -354,7 +348,6 @@ impl PyMetaspaceDec {
 /// The suffix that was used to caracterize an end-of-word. This suffix will
 /// be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
-#[pyo3(text_signature = "(self, suffix=\"</w>\")")]
 pub struct PyBPEDecoder {}
 #[pymethods]
 impl PyBPEDecoder {
@@ -369,7 +362,7 @@ impl PyBPEDecoder {
     }
 
     #[new]
-    #[pyo3(signature = (suffix = String::from("</w>")))]
+    #[pyo3(signature = (suffix = String::from("</w>")), text_signature = "(self, suffix=\"</w>\")")]
     fn new(suffix: String) -> (Self, PyDecoder) {
         (PyBPEDecoder {}, BPEDecoder::new(suffix).into())
     }
@@ -386,7 +379,6 @@ impl PyBPEDecoder {
 /// Whether to cleanup some tokenization artifacts.
 /// Mainly spaces before punctuation, and some abbreviated english forms.
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "CTC")]
-#[pyo3(text_signature = "(self, pad_token=\"<pad>\", word_delimiter_token=\"|\", cleanup=True)")]
 pub struct PyCTCDecoder {}
 #[pymethods]
 impl PyCTCDecoder {
@@ -425,7 +417,8 @@ impl PyCTCDecoder {
         pad_token = String::from("<pad>"),
         word_delimiter_token = String::from("|"),
         cleanup = true
-    ))]
+    ),
+    text_signature = "(self, pad_token=\"<pad>\", word_delimiter_token=\"|\", cleanup=True)")]
     fn new(pad_token: String, word_delimiter_token: String, cleanup: bool) -> (Self, PyDecoder) {
         (
             PyCTCDecoder {},
@@ -440,12 +433,11 @@ impl PyCTCDecoder {
 /// decoders (:obj:`List[Decoder]`)
 /// The decoders that need to be chained
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name="Sequence")]
-#[pyo3(text_signature = "(self, decoders)")]
 pub struct PySequenceDecoder {}
 #[pymethods]
 impl PySequenceDecoder {
     #[new]
-    #[pyo3(signature = (decoders_py))]
+    #[pyo3(signature = (decoders_py), text_signature = "(self, decoders)")]
     fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> {
         let mut decoders: Vec<DecoderWrapper> = Vec::with_capacity(decoders_py.len());
         for decoder_py in decoders_py.iter() {
@@ -23,6 +23,7 @@ impl From<tk::tokenizer::Encoding> for PyEncoding {
 #[pymethods]
 impl PyEncoding {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new() -> Self {
         Self {
             encoding: tk::tokenizer::Encoding::default(),
@@ -89,6 +89,7 @@ where
 #[pymethods]
 impl PyModel {
     #[new]
+    #[pyo3(text_signature = None)]
     fn __new__() -> Self {
         // Instantiate a default empty model. This doesn't really make sense, but we need
         // to be able to instantiate an empty model for pickle capabilities.
@@ -253,9 +254,6 @@ impl PyModel {
 /// byte_fallback (:obj:`bool`, `optional`):
 /// Whether to use spm byte-fallback trick (defaults to False)
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "BPE")]
-#[pyo3(
-    text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)"
-)]
 pub struct PyBPE {}
 
 impl PyBPE {
@@ -400,7 +398,9 @@ impl PyBPE {
     }
 
     #[new]
-    #[pyo3(signature = (vocab=None, merges=None, **kwargs))]
+    #[pyo3(
+        signature = (vocab=None, merges=None, **kwargs),
+        text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None, byte_fallback=False)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -523,7 +523,6 @@ impl PyBPE {
 /// max_input_chars_per_word (:obj:`int`, `optional`):
 /// The maximum number of characters to authorize in a single word.
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordPiece")]
-#[pyo3(text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
 pub struct PyWordPiece {}
 
 impl PyWordPiece {
@@ -597,7 +596,7 @@ impl PyWordPiece {
     }
 
     #[new]
-    #[pyo3(signature = (vocab=None, **kwargs))]
+    #[pyo3(signature = (vocab=None, **kwargs), text_signature = "(self, vocab, unk_token, max_input_chars_per_word)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -692,7 +691,6 @@ impl PyWordPiece {
 /// unk_token (:obj:`str`, `optional`):
 /// The unknown token to be used by the model.
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "WordLevel")]
-#[pyo3(text_signature = "(self, vocab, unk_token)")]
 pub struct PyWordLevel {}
 
 #[pymethods]
@@ -708,7 +706,7 @@ impl PyWordLevel {
     }
 
     #[new]
-    #[pyo3(signature = (vocab=None, unk_token = None))]
+    #[pyo3(signature = (vocab=None, unk_token = None), text_signature = "(self, vocab, unk_token)")]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -807,12 +805,12 @@ impl PyWordLevel {
 /// vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`):
 /// A list of vocabulary items and their relative score [("am", -0.2442),...]
 #[pyclass(extends=PyModel, module = "tokenizers.models", name = "Unigram")]
-#[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
 pub struct PyUnigram {}
 
 #[pymethods]
 impl PyUnigram {
     #[new]
+    #[pyo3(text_signature = "(self, vocab, unk_id, byte_fallback)")]
     fn new(
         vocab: Option<Vec<(String, f64)>>,
         unk_id: Option<usize>,
@@ -217,9 +217,6 @@ macro_rules! setter {
 /// lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`):
 /// Whether to lowercase.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "BertNormalizer")]
-#[pyo3(
-    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"
-)]
 pub struct PyBertNormalizer {}
 #[pymethods]
 impl PyBertNormalizer {
@@ -274,7 +271,8 @@ impl PyBertNormalizer {
         handle_chinese_chars = true,
         strip_accents = None,
         lowercase = true
-    ))]
+    ),
+    text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)")]
     fn new(
         clean_text: bool,
         handle_chinese_chars: bool,
@@ -289,11 +287,11 @@ impl PyBertNormalizer {
 
 /// NFD Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFD")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFD {}
 #[pymethods]
 impl PyNFD {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFD {}, PyNormalizer::new(NFD.into()))
     }
@@ -301,11 +299,11 @@ impl PyNFD {
 
 /// NFKD Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKD")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFKD {}
 #[pymethods]
 impl PyNFKD {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFKD {}, NFKD.into())
     }
@@ -313,11 +311,11 @@ impl PyNFKD {
 
 /// NFC Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFC")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFC {}
 #[pymethods]
 impl PyNFC {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFC {}, NFC.into())
     }
@@ -325,11 +323,11 @@ impl PyNFC {
 
 /// NFKC Unicode Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "NFKC")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNFKC {}
 #[pymethods]
 impl PyNFKC {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNFKC {}, NFKC.into())
     }
@@ -346,6 +344,7 @@ pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> {
         let mut sequence = Vec::with_capacity(normalizers.len());
         for n in normalizers.iter() {
@@ -372,11 +371,11 @@ impl PySequence {
 
 /// Lowercase Normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Lowercase")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyLowercase {}
 #[pymethods]
 impl PyLowercase {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyLowercase {}, Lowercase.into())
     }
@@ -384,7 +383,6 @@ impl PyLowercase {
 
 /// Strip normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Strip")]
-#[pyo3(text_signature = "(self, left=True, right=True)")]
 pub struct PyStrip {}
 #[pymethods]
 impl PyStrip {
@@ -409,7 +407,7 @@ impl PyStrip {
     }
 
     #[new]
-    #[pyo3(signature = (left = true, right = true))]
+    #[pyo3(signature = (left = true, right = true), text_signature = "(self, left=True, right=True)")]
     fn new(left: bool, right: bool) -> (Self, PyNormalizer) {
         (PyStrip {}, Strip::new(left, right).into())
     }
@@ -417,7 +415,6 @@ impl PyStrip {
 
 /// Prepend normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Prepend")]
-#[pyo3(text_signature = "(self, prepend)")]
 pub struct PyPrepend {}
 #[pymethods]
 impl PyPrepend {
@@ -432,7 +429,7 @@ impl PyPrepend {
     }
 
     #[new]
-    #[pyo3(signature = (prepend="▁".to_string()))]
+    #[pyo3(signature = (prepend="▁".to_string()), text_signature = "(self, prepend)")]
     fn new(prepend: String) -> (Self, PyNormalizer) {
         (PyPrepend {}, Prepend::new(prepend).into())
     }
@@ -440,11 +437,11 @@ impl PyPrepend {
 
 /// StripAccents normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "StripAccents")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyStripAccents {}
 #[pymethods]
 impl PyStripAccents {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyStripAccents {}, StripAccents.into())
     }
@@ -452,11 +449,11 @@ impl PyStripAccents {
 
 /// Nmt normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Nmt")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyNmt {}
 #[pymethods]
 impl PyNmt {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyNormalizer) {
         (PyNmt {}, Nmt.into())
     }
@@ -465,11 +462,11 @@ impl PyNmt {
 /// Precompiled normalizer
 /// Don't use manually it is used for compatiblity for SentencePiece.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
-#[pyo3(text_signature = "(self, precompiled_charsmap)")]
 pub struct PyPrecompiled {}
 #[pymethods]
 impl PyPrecompiled {
     #[new]
+    #[pyo3(text_signature = "(self, precompiled_charsmap)")]
     fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
         let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
         Ok((
@@ -488,11 +485,11 @@ impl PyPrecompiled {
 
 /// Replace normalizer
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Replace")]
-#[pyo3(text_signature = "(self, pattern, content)")]
 pub struct PyReplace {}
 #[pymethods]
 impl PyReplace {
     #[new]
+    #[pyo3(text_signature = "(self, pattern, content)")]
     fn new(pattern: PyPattern, content: String) -> PyResult<(Self, PyNormalizer)> {
         Ok((
             PyReplace {},
@@ -235,7 +235,6 @@ macro_rules! setter {
 /// Set this to :obj:`False` to prevent this `pre_tokenizer` from using
 /// the GPT2 specific regexp for spliting on whitespace.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "ByteLevel")]
-#[pyo3(text_signature = "(self, add_prefix_space=True, use_regex=True)")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
@@ -260,7 +259,7 @@ impl PyByteLevel {
     }
 
     #[new]
-    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs))]
+    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs), text_signature = "(self, add_prefix_space=True, use_regex=True)")]
     fn new(
         add_prefix_space: bool,
         use_regex: bool,
@@ -295,11 +294,11 @@ impl PyByteLevel {
 
 /// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Whitespace")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyWhitespace {}
 #[pymethods]
 impl PyWhitespace {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyWhitespace {}, Whitespace {}.into())
     }
@@ -307,11 +306,11 @@ impl PyWhitespace {
 
 /// This pre-tokenizer simply splits on the whitespace. Works like `.split()`
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "WhitespaceSplit")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyWhitespaceSplit {}
 #[pymethods]
 impl PyWhitespaceSplit {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyWhitespaceSplit {}, WhitespaceSplit.into())
     }
@@ -335,12 +334,11 @@ impl PyWhitespaceSplit {
 /// invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
 /// Whether to invert the pattern.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Split")]
-#[pyo3(text_signature = "(self, pattern, behavior, invert=False)")]
 pub struct PySplit {}
 #[pymethods]
 impl PySplit {
     #[new]
-    #[pyo3(signature = (pattern, behavior, invert = false))]
+    #[pyo3(signature = (pattern, behavior, invert = false), text_signature = "(self, pattern, behavior, invert=False)")]
     fn new(
         pattern: PyPattern,
         behavior: PySplitDelimiterBehavior,
@@ -379,6 +377,7 @@ impl PyCharDelimiterSplit {
     }
 
     #[new]
+    #[pyo3(text_signature = None)]
     pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyCharDelimiterSplit {},
@@ -396,11 +395,11 @@ impl PyCharDelimiterSplit {
 /// This pre-tokenizer splits tokens on spaces, and also on punctuation.
 /// Each occurence of a punctuation character will be treated separately.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyBertPreTokenizer {}
 #[pymethods]
 impl PyBertPreTokenizer {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyBertPreTokenizer {}, BertPreTokenizer.into())
     }
@@ -414,12 +413,11 @@ impl PyBertPreTokenizer {
 /// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
 /// "contiguous"
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Punctuation")]
-#[pyo3(text_signature = "(self, behavior=\"isolated\")")]
 pub struct PyPunctuation {}
 #[pymethods]
 impl PyPunctuation {
     #[new]
-    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)))]
+    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)), text_signature = "(self, behavior=\"isolated\")")]
     fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
         (PyPunctuation {}, Punctuation::new(behavior.into()).into())
     }
@@ -427,11 +425,11 @@ impl PyPunctuation {
 
 /// This pre-tokenizer composes other pre_tokenizers and applies them in sequence
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Sequence")]
-#[pyo3(text_signature = "(self, pretokenizers)")]
 pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
+    #[pyo3(text_signature = "(self, pretokenizers)")]
     fn new(pre_tokenizers: &PyList) -> PyResult<(Self, PyPreTokenizer)> {
         let mut sequence = Vec::with_capacity(pre_tokenizers.len());
         for n in pre_tokenizers.iter() {
@@ -468,7 +466,6 @@ impl PySequence {
 /// Whether to add a space to the first word if there isn't already one. This
 /// lets us treat `hello` exactly like `say hello`.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Metaspace")]
-#[pyo3(text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
 pub struct PyMetaspace {}
 #[pymethods]
 impl PyMetaspace {
@@ -493,7 +490,7 @@ impl PyMetaspace {
     }
 
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs))]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
     fn new(
         replacement: PyChar,
        add_prefix_space: bool,
@@ -518,7 +515,6 @@ impl PyMetaspace {
 ///
 /// "Call 123 please" -> "Call ", "123", " please"
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "Digits")]
-#[pyo3(text_signature = "(self, individual_digits=False)")]
 pub struct PyDigits {}
 #[pymethods]
 impl PyDigits {
@@ -533,7 +529,7 @@ impl PyDigits {
     }
 
     #[new]
-    #[pyo3(signature = (individual_digits = false))]
+    #[pyo3(signature = (individual_digits = false), text_signature = "(self, individual_digits=False)")]
     fn new(individual_digits: bool) -> (Self, PyPreTokenizer) {
         (PyDigits {}, Digits::new(individual_digits).into())
     }
@@ -544,11 +540,11 @@ impl PyDigits {
 /// Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
 /// This mimicks SentencePiece Unigram implementation.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "UnicodeScripts")]
-#[pyo3(text_signature = "(self)")]
 pub struct PyUnicodeScripts {}
 #[pymethods]
 impl PyUnicodeScripts {
     #[new]
+    #[pyo3(text_signature = "(self)")]
     fn new() -> (Self, PyPreTokenizer) {
         (PyUnicodeScripts {}, UnicodeScripts::new().into())
     }
@@ -154,11 +154,11 @@ impl PyPostProcessor {
 /// cls (:obj:`Tuple[str, int]`):
 /// A tuple with the string representation of the CLS token, and its id
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "BertProcessing")]
-#[pyo3(text_signature = "(self, sep, cls)")]
 pub struct PyBertProcessing {}
 #[pymethods]
 impl PyBertProcessing {
     #[new]
+    #[pyo3(text_signature = "(self, sep, cls)")]
     fn new(sep: (String, u32), cls: (String, u32)) -> (Self, PyPostProcessor) {
         (
             PyBertProcessing {},
@@ -196,12 +196,11 @@ impl PyBertProcessing {
 /// Whether the add_prefix_space option was enabled during pre-tokenization. This
 /// is relevant because it defines the way the offsets are trimmed out.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "RobertaProcessing")]
-#[pyo3(text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")]
 pub struct PyRobertaProcessing {}
 #[pymethods]
 impl PyRobertaProcessing {
     #[new]
-    #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true))]
+    #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true), text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)")]
     fn new(
         sep: (String, u32),
         cls: (String, u32),
@@ -231,12 +230,11 @@ impl PyRobertaProcessing {
 /// trim_offsets (:obj:`bool`):
 /// Whether to trim the whitespaces from the produced offsets.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "ByteLevel")]
-#[pyo3(text_signature = "(self, trim_offsets=True)")]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[pyo3(signature = (trim_offsets = None, **_kwargs))]
+    #[pyo3(signature = (trim_offsets = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
     fn new(trim_offsets: Option<bool>, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) {
         let mut byte_level = ByteLevel::default();
 
@@ -383,12 +381,11 @@ impl FromPyObject<'_> for PyTemplate {
 /// The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have
 /// the same length.
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "TemplateProcessing")]
-#[pyo3(text_signature = "(self, single, pair, special_tokens)")]
 pub struct PyTemplateProcessing {}
 #[pymethods]
 impl PyTemplateProcessing {
     #[new]
-    #[pyo3(signature = (single = None, pair = None, special_tokens = None))]
+    #[pyo3(signature = (single = None, pair = None, special_tokens = None), text_signature = "(self, single, pair, special_tokens)")]
     fn new(
         single: Option<PyTemplate>,
         pair: Option<PyTemplate>,
@@ -422,12 +419,11 @@ impl PyTemplateProcessing {
 /// processors (:obj:`List[PostProcessor]`)
 /// The processors that need to be chained
 #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name = "Sequence")]
-#[pyo3(text_signature = "(self, processors)")]
 pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
-    #[pyo3(signature = (processors_py))]
+    #[pyo3(signature = (processors_py), text_signature = "(self, processors)")]
     fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
         let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
         for n in processors_py.iter() {
@@ -20,6 +20,7 @@ impl From<PyToken> for Token {
 #[pymethods]
 impl PyToken {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(id: u32, value: String, offsets: (usize, usize)) -> PyToken {
         Token::new(id, value, offsets).into()
     }
@@ -56,9 +56,6 @@ use crate::utils::{MaybeSizedIterator, PyBufferedIterator};
 /// Yesterday"``.
 ///
 #[pyclass(dict, module = "tokenizers", name = "AddedToken")]
-#[pyo3(
-    text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"
-)]
 pub struct PyAddedToken {
     pub content: String,
     pub is_special_token: bool,
@@ -128,7 +125,7 @@ impl From<tk::AddedToken> for PyAddedToken {
 #[pymethods]
 impl PyAddedToken {
     #[new]
-    #[pyo3(signature = (content=None, **kwargs))]
+    #[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)")]
     fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
         let mut token = PyAddedToken::from(content.unwrap_or(""), None);
 
@@ -441,7 +438,6 @@ type Tokenizer = TokenizerImpl<PyModel, PyNormalizer, PyPreTokenizer, PyPostProc
 /// The core algorithm that this :obj:`Tokenizer` should be using.
 ///
 #[pyclass(dict, module = "tokenizers", name = "Tokenizer")]
-#[pyo3(text_signature = "(self, model)")]
 #[derive(Clone)]
 pub struct PyTokenizer {
     tokenizer: Tokenizer,
@@ -460,6 +456,7 @@ impl PyTokenizer {
 #[pymethods]
 impl PyTokenizer {
     #[new]
+    #[pyo3(text_signature = "(self, model)")]
     fn __new__(model: PyRef<PyModel>) -> Self {
         PyTokenizer::from_model(model.clone())
     }
@@ -299,7 +299,7 @@ impl PyBpeTrainer {
     }
 
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(signature = (**kwargs), text_signature = None)]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -383,9 +383,6 @@ impl PyBpeTrainer {
 /// end_of_word_suffix (:obj:`str`, `optional`):
 /// A suffix to be used for every subword that is a end-of-word.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "WordPieceTrainer")]
-#[pyo3(
-    text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
-)]
 pub struct PyWordPieceTrainer {}
 #[pymethods]
 impl PyWordPieceTrainer {
@@ -506,7 +503,10 @@ impl PyWordPieceTrainer {
     }
 
     #[new]
-    #[pyo3(signature = (** kwargs))]
+    #[pyo3(
+        signature = (** kwargs),
+        text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
+    )]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -646,7 +646,7 @@ impl PyWordLevelTrainer {
     }
 
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(signature = (**kwargs), text_signature = None)]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();
 
@@ -731,9 +731,6 @@ impl PyWordLevelTrainer {
 /// The number of iterations of the EM algorithm to perform before
 /// pruning the vocabulary.
 #[pyclass(extends=PyTrainer, module = "tokenizers.trainers", name = "UnigramTrainer")]
-#[pyo3(
-    text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
-)]
 pub struct PyUnigramTrainer {}
 #[pymethods]
 impl PyUnigramTrainer {
@@ -814,7 +811,10 @@ impl PyUnigramTrainer {
     }
 
     #[new]
-    #[pyo3(signature = (**kwargs))]
+    #[pyo3(
+        signature = (**kwargs),
+        text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
+    )]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -200,6 +200,7 @@ pub struct PyNormalizedString {
 #[pymethods]
 impl PyNormalizedString {
     #[new]
+    #[pyo3(text_signature = None)]
     fn new(s: &str) -> Self {
         NormalizedString::from(s).into()
     }
@@ -148,7 +148,6 @@ fn to_encoding(
 /// sequence: str:
 /// The string sequence used to initialize this PreTokenizedString
 #[pyclass(module = "tokenizers", name = "PreTokenizedString")]
-#[pyo3(text_signature = "(self, sequence)")]
 pub struct PyPreTokenizedString {
     pub(crate) pretok: tk::PreTokenizedString,
 }
@@ -168,6 +167,7 @@ impl From<PyPreTokenizedString> for PreTokenizedString {
 #[pymethods]
 impl PyPreTokenizedString {
     #[new]
+    #[pyo3(text_signature = "(self, sequence)")]
     fn new(s: &str) -> Self {
         PreTokenizedString::from(s).into()
     }
@@ -4,7 +4,6 @@ use pyo3::prelude::*;
 
 /// Instantiate a new Regex with the given pattern
 #[pyclass(module = "tokenizers", name = "Regex")]
-#[pyo3(text_signature = "(self, pattern)")]
 pub struct PyRegex {
     pub inner: Regex,
     pub pattern: String,
@@ -13,6 +12,7 @@ pub struct PyRegex {
 #[pymethods]
 impl PyRegex {
     #[new]
+    #[pyo3(text_signature = "(self, pattern)")]
     fn new(s: &str) -> PyResult<Self> {
         Ok(Self {
             inner: Regex::new(s)
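Several `#[new]` methods above (PyEncoding, PyModel, PyToken, PyNormalizedString, and others) simply gain `#[pyo3(text_signature = None)]`, stating explicitly that no `__text_signature__` should be generated for them. A minimal sketch of that variant, again with a hypothetical type name rather than one from this crate:

use pyo3::prelude::*;

// Hypothetical class whose constructor exposes no Python-visible signature.
#[pyclass(module = "tokenizers", name = "Opaque")]
pub struct PyOpaque {}

#[pymethods]
impl PyOpaque {
    #[new]
    // Explicitly declare that this constructor has no text signature; this
    // mirrors the `text_signature = None` lines added in the diff above.
    #[pyo3(text_signature = None)]
    fn new() -> Self {
        PyOpaque {}
    }
}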