mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
pyo3 v0.18 migration (#1173)
* pyo3 v0.18 migration
* Fix formatting issues flagged by black
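Most of the changes below are mechanical: pyo3 0.18 replaces the old #[args(...)] attribute, whose default values were written as strings, with #[pyo3(signature = (...))], whose defaults are ordinary Rust expressions, and the deprecated cast_as::<T>() calls become downcast::<T>(). A minimal sketch of the new attribute form, using a hypothetical PyThing class rather than any type from this repository:

use pyo3::prelude::*;
use pyo3::types::PyDict;

// A hypothetical #[pyclass] used only to illustrate the pyo3 0.18 syntax.
#[pyclass]
struct PyThing {
    prefix: String,
    cleanup: bool,
}

#[pymethods]
impl PyThing {
    // pyo3 0.17: #[args(prefix = "String::from(\"##\")", cleanup = "true", kwargs = "**")]
    // pyo3 0.18: default values are plain Rust expressions inside signature.
    #[new]
    #[pyo3(signature = (prefix = String::from("##"), cleanup = true, **kwargs))]
    fn new(prefix: String, cleanup: bool, kwargs: Option<&PyDict>) -> Self {
        // Extra keyword arguments still arrive as an optional PyDict.
        let _ = kwargs;
        PyThing { prefix, cleanup }
    }
}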
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.7.1"
-pyo3 = "0.17.2"
-numpy = "0.17.2"
+pyo3 = "0.18.1"
+numpy = "0.18.0"
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
 itertools = "0.9"
@@ -26,7 +26,7 @@ path = "../../tokenizers"

 [dev-dependencies]
 tempfile = "3.1"
-pyo3 = { version = "0.17.2", features = ["auto-initialize"] }
+pyo3 = { version = "0.18.1", features = ["auto-initialize"] }

 [features]
 default = ["pyo3/extension-module"]
@@ -24,7 +24,7 @@ class JiebaPreTokenizer:
         # Just an odd example...
         splits = []
         last = 0
-        for (i, char) in enumerate(str(normalized_string)):
+        for i, char in enumerate(str(normalized_string)):
             if char.isnumeric() and int(char) % 2 == 1:
                 splits.append(normalized_string[last:i])
                 last = i
@@ -11,6 +11,7 @@ bpe_tokenizer.normalizer = normalizers.Lowercase()
 # Initialize a dataset
 dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1")

+
 # Build an iterator over this dataset
 def batch_iterator():
     batch_length = 1000
@@ -26,7 +26,6 @@ class BertWordPieceTokenizer(BaseTokenizer):
         lowercase: bool = True,
         wordpieces_prefix: str = "##",
     ):
-
         if vocab is not None:
             tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
         else:
@@ -9,6 +9,17 @@ class Model:
     This class cannot be constructed directly. Please use one of the concrete models.
     """

+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
     def id_to_token(self, id):
         """
         Get the token associated to an ID
@@ -134,6 +145,17 @@ class BPE(Model):
             :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
         """
         pass
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
     def id_to_token(self, id):
         """
         Get the token associated to an ID
@@ -222,6 +244,17 @@ class Unigram(Model):

     def __init__(self, vocab):
         pass
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
     def id_to_token(self, id):
         """
         Get the token associated to an ID
@@ -316,6 +349,17 @@ class WordLevel(Model):
             :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
         """
         pass
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
     def id_to_token(self, id):
         """
         Get the token associated to an ID
@@ -428,6 +472,17 @@ class WordPiece(Model):
             :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
         """
         pass
+    def get_trainer(self):
+        """
+        Get the associated :class:`~tokenizers.trainers.Trainer`
+
+        Retrieve the :class:`~tokenizers.trainers.Trainer` associated to this
+        :class:`~tokenizers.models.Model`.
+
+        Returns:
+            :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+        """
+        pass
     def id_to_token(self, id):
         """
         Get the token associated to an ID
@@ -149,7 +149,8 @@ pub struct PyByteLevelDec {}
 #[pymethods]
 impl PyByteLevelDec {
     #[new]
-    fn new() -> (Self, PyDecoder) {
+    #[pyo3(signature = (**_kwargs))]
+    fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) {
         (PyByteLevelDec {}, ByteLevel::default().into())
     }
 }
@@ -189,7 +190,7 @@ impl PyWordPieceDec {
     }

     #[new]
-    #[args(prefix = "String::from(\"##\")", cleanup = "true")]
+    #[pyo3(signature = (prefix = String::from("##"), cleanup = true))]
     fn new(prefix: String, cleanup: bool) -> (Self, PyDecoder) {
         (PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into())
     }
@@ -231,7 +232,7 @@ impl PyMetaspaceDec {
     }

     #[new]
-    #[args(replacement = "PyChar('▁')", add_prefix_space = "true")]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true))]
     fn new(replacement: PyChar, add_prefix_space: bool) -> (Self, PyDecoder) {
         (
             PyMetaspaceDec {},
@@ -262,7 +263,7 @@ impl PyBPEDecoder {
     }

     #[new]
-    #[args(suffix = "String::from(\"</w>\")")]
+    #[pyo3(signature = (suffix = String::from("</w>")))]
     fn new(suffix: String) -> (Self, PyDecoder) {
         (PyBPEDecoder {}, BPEDecoder::new(suffix).into())
     }
@@ -314,11 +315,11 @@ impl PyCTCDecoder {
     }

     #[new]
-    #[args(
-        pad_token = "String::from(\"<pad>\")",
-        word_delimiter_token = "String::from(\"|\")",
-        cleanup = "true"
-    )]
+    #[pyo3(signature = (
+        pad_token = String::from("<pad>"),
+        word_delimiter_token = String::from("|"),
+        cleanup = true
+    ))]
     fn new(pad_token: String, word_delimiter_token: String, cleanup: bool) -> (Self, PyDecoder) {
         (
             PyCTCDecoder {},
@@ -338,7 +339,7 @@ pub struct PySequenceDecoder {}
 #[pymethods]
 impl PySequenceDecoder {
     #[new]
-    #[args(decoders)]
+    #[pyo3(signature = (decoders_py))]
     fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> {
         let mut decoders: Vec<DecoderWrapper> = Vec::with_capacity(decoders_py.len());
         for decoder_py in decoders_py.iter() {
@@ -78,7 +78,7 @@ impl PyEncoding {
     /// Returns:
     ///     :class:`~tokenizers.Encoding`: The resulting Encoding
     #[staticmethod]
-    #[args(growing_offsets = true)]
+    #[pyo3(signature = (encodings, growing_offsets = true))]
     #[pyo3(text_signature = "(encodings, growing_offsets=True)")]
     fn merge(encodings: Vec<PyRef<PyEncoding>>, growing_offsets: bool) -> PyEncoding {
         tk::tokenizer::Encoding::merge(
@@ -263,7 +263,7 @@ impl PyEncoding {
     ///
     /// Returns:
     ///     :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)`
-    #[args(sequence_index = 0)]
+    #[pyo3(signature = (word_index, sequence_index = 0))]
     #[pyo3(text_signature = "(self, word_index, sequence_index=0)")]
     fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
         self.encoding.word_to_tokens(word_index, sequence_index)
@@ -279,7 +279,7 @@ impl PyEncoding {
     ///
     /// Returns:
     ///     :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)`
-    #[args(sequence_index = 0)]
+    #[pyo3(signature = (word_index, sequence_index = 0))]
     #[pyo3(text_signature = "(self, word_index, sequence_index=0)")]
     fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
         self.encoding.word_to_chars(word_index, sequence_index)
@@ -347,7 +347,7 @@ impl PyEncoding {
     ///
     /// Returns:
     ///     :obj:`int`: The index of the token that contains this char in the encoded sequence
-    #[args(sequence_index = 0)]
+    #[pyo3(signature = (char_pos, sequence_index = 0))]
     #[pyo3(text_signature = "(self, char_pos, sequence_index=0)")]
     fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
         self.encoding.char_to_token(char_pos, sequence_index)
@@ -363,7 +363,7 @@ impl PyEncoding {
     ///
     /// Returns:
     ///     :obj:`int`: The index of the word that contains this char in the input sequence
-    #[args(sequence_index = 0)]
+    #[pyo3(signature = (char_pos, sequence_index = 0))]
     #[pyo3(text_signature = "(self, char_pos, sequence_index=0)")]
     fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
         self.encoding.char_to_word(char_pos, sequence_index)
@@ -386,7 +386,7 @@ impl PyEncoding {
     ///
     /// pad_token (:obj:`str`, defaults to `[PAD]`):
     ///     The pad token to use
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (length, **kwargs))]
     #[pyo3(
         text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"
     )]
@@ -439,8 +439,7 @@ impl PyEncoding {
     ///
     /// direction (:obj:`str`, defaults to :obj:`right`):
     ///     Truncate direction
-    #[args(stride = "0")]
-    #[args(direction = "\"right\"")]
+    #[pyo3(signature = (max_length, stride = 0, direction = "right"))]
     #[pyo3(text_signature = "(self, max_length, stride=0, direction='right')")]
     fn truncate(&mut self, max_length: usize, stride: usize, direction: &str) -> PyResult<()> {
         let tdir = match direction {
@@ -215,6 +215,7 @@ impl PyModel {
     ///
     /// Returns:
     ///     :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model
+    #[pyo3(text_signature = "(self)")]
     fn get_trainer(&self, py: Python<'_>) -> PyResult<PyObject> {
         PyTrainer::from(self.model.read().unwrap().get_trainer()).get_as_subtype(py)
     }
@@ -385,7 +386,7 @@ impl PyBPE {
     }

     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (vocab=None, merges=None, **kwargs))]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -472,7 +473,7 @@ impl PyBPE {
     /// Returns:
     ///     :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files
     #[classmethod]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (vocab, merges, **kwargs))]
     #[pyo3(text_signature = "(cls, vocab, merge, **kwargs)")]
     fn from_file(
         _cls: &PyType,
@@ -582,7 +583,7 @@ impl PyWordPiece {
     }

     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (vocab=None, **kwargs))]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -648,7 +649,7 @@ impl PyWordPiece {
     /// Returns:
     ///     :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file
     #[classmethod]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (vocab, **kwargs))]
     #[pyo3(text_signature = "(vocab, **kwargs)")]
     fn from_file(
         _cls: &PyType,
@@ -693,7 +694,7 @@ impl PyWordLevel {
     }

     #[new]
-    #[args(unk_token = "None")]
+    #[pyo3(signature = (vocab=None, unk_token = None))]
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
@@ -768,7 +769,7 @@ impl PyWordLevel {
     /// Returns:
     ///     :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file
     #[classmethod]
-    #[args(unk_token = "None")]
+    #[pyo3(signature = (vocab, unk_token = None))]
     #[pyo3(text_signature = "(vocab, unk_token)")]
     fn from_file(
         _cls: &PyType,
@@ -267,12 +267,12 @@ impl PyBertNormalizer {
     }

     #[new]
-    #[args(
-        clean_text = "true",
-        handle_chinese_chars = "true",
-        strip_accents = "None",
-        lowercase = "true"
-    )]
+    #[pyo3(signature = (
+        clean_text = true,
+        handle_chinese_chars = true,
+        strip_accents = None,
+        lowercase = true
+    ))]
     fn new(
         clean_text: bool,
         handle_chinese_chars: bool,
@@ -407,7 +407,7 @@ impl PyStrip {
     }

     #[new]
-    #[args(left = "true", right = "true")]
+    #[pyo3(signature = (left = true, right = true))]
     fn new(left: bool, right: bool) -> (Self, PyNormalizer) {
         (PyStrip {}, Strip::new(left, right).into())
     }
@@ -260,7 +260,7 @@ impl PyByteLevel {
     }

     #[new]
-    #[args(add_prefix_space = "true", use_regex = "true", _kwargs = "**")]
+    #[pyo3(signature = (add_prefix_space = true, use_regex = true, **_kwargs))]
     fn new(
         add_prefix_space: bool,
         use_regex: bool,
@@ -340,7 +340,7 @@ pub struct PySplit {}
 #[pymethods]
 impl PySplit {
     #[new]
-    #[args(invert = false)]
+    #[pyo3(signature = (pattern, behavior, invert = false))]
     fn new(
         pattern: PyPattern,
         behavior: PySplitDelimiterBehavior,
@@ -419,7 +419,7 @@ pub struct PyPunctuation {}
 #[pymethods]
 impl PyPunctuation {
     #[new]
-    #[args(behavior = "PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)")]
+    #[pyo3( signature = (behavior = PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)))]
     fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
         (PyPunctuation {}, Punctuation::new(behavior.into()).into())
     }
@@ -493,7 +493,7 @@ impl PyMetaspace {
     }

     #[new]
-    #[args(replacement = "PyChar('▁')", add_prefix_space = "true", _kwargs = "**")]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs))]
     fn new(
         replacement: PyChar,
         add_prefix_space: bool,
@@ -533,7 +533,7 @@ impl PyDigits {
     }

     #[new]
-    #[args(individual_digits = false)]
+    #[pyo3(signature = (individual_digits = false))]
     fn new(individual_digits: bool) -> (Self, PyPreTokenizer) {
         (PyDigits {}, Digits::new(individual_digits).into())
     }
@@ -123,7 +123,7 @@ impl PyPostProcessor {
     ///
     /// Return:
     ///     :class:`~tokenizers.Encoding`: The final encoding
-    #[args(pair = "None", add_special_tokens = "true")]
+    #[pyo3(signature = (encoding, pair = None, add_special_tokens = true))]
     #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")]
     fn process(
         &self,
@@ -201,7 +201,7 @@ pub struct PyRobertaProcessing {}
 #[pymethods]
 impl PyRobertaProcessing {
     #[new]
-    #[args(trim_offsets = true, add_prefix_space = true)]
+    #[pyo3(signature = (sep, cls, trim_offsets = true, add_prefix_space = true))]
     fn new(
         sep: (String, u32),
         cls: (String, u32),
@@ -236,7 +236,7 @@ pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[args(trim_offsets = "None", _kwargs = "**")]
+    #[pyo3(signature = (trim_offsets = None, **_kwargs))]
     fn new(trim_offsets: Option<bool>, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) {
         let mut byte_level = ByteLevel::default();

@@ -388,7 +388,7 @@ pub struct PyTemplateProcessing {}
 #[pymethods]
 impl PyTemplateProcessing {
     #[new]
-    #[args(single = "None", pair = "None", special_tokens = "None")]
+    #[pyo3(signature = (single = None, pair = None, special_tokens = None))]
     fn new(
         single: Option<PyTemplate>,
         pair: Option<PyTemplate>,
@@ -427,7 +427,7 @@ pub struct PySequence {}
 #[pymethods]
 impl PySequence {
     #[new]
-    #[args(processors)]
+    #[pyo3(signature = (processors_py))]
     fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
         let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
         for n in processors_py.iter() {
@@ -128,7 +128,7 @@ impl From<tk::AddedToken> for PyAddedToken {
 #[pymethods]
 impl PyAddedToken {
     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (content=None, **kwargs))]
     fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
         let mut token = PyAddedToken::from(content.unwrap_or(""), None);

@@ -308,7 +308,7 @@ impl FromPyObject<'_> for PyArrayUnicode {
                );
                let py = ob.py();
                let obj = PyObject::from_owned_ptr(py, unicode);
-                let s = obj.cast_as::<PyString>(py)?;
+                let s = obj.downcast::<PyString>(py)?;
                Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
            })
            .collect::<PyResult<Vec<_>>>()?;
@@ -332,7 +332,7 @@ impl FromPyObject<'_> for PyArrayStr {
            .as_array()
            .iter()
            .map(|obj| {
-                let s = obj.cast_as::<PyString>(ob.py())?;
+                let s = obj.downcast::<PyString>(ob.py())?;
                Ok(s.to_string_lossy().into_owned())
            })
            .collect::<PyResult<Vec<_>>>()?;
@@ -562,7 +562,7 @@ impl PyTokenizer {
     /// Returns:
     ///     :class:`~tokenizers.Tokenizer`: The new tokenizer
     #[staticmethod]
-    #[args(revision = "String::from(\"main\")", auth_token = "None")]
+    #[pyo3(signature = (identifier, revision = String::from("main"), auth_token = None))]
     #[pyo3(text_signature = "(identifier, revision=\"main\", auth_token=None)")]
     fn from_pretrained(
         identifier: &str,
@@ -591,7 +591,7 @@ impl PyTokenizer {
     ///
     /// Returns:
     ///     :obj:`str`: A string representing the serialized Tokenizer
-    #[args(pretty = false)]
+    #[pyo3(signature = (pretty = false))]
     #[pyo3(text_signature = "(self, pretty=False)")]
     fn to_str(&self, pretty: bool) -> PyResult<String> {
         ToPyResult(self.tokenizer.to_string(pretty)).into()
@@ -605,7 +605,7 @@ impl PyTokenizer {
     ///
     /// pretty (:obj:`bool`, defaults to :obj:`True`):
     ///     Whether the JSON file should be pretty formatted.
-    #[args(pretty = true)]
+    #[pyo3(signature = (path, pretty = true))]
     #[pyo3(text_signature = "(self, path, pretty=True)")]
     fn save(&self, path: &str, pretty: bool) -> PyResult<()> {
         ToPyResult(self.tokenizer.save(path, pretty)).into()
@@ -629,7 +629,7 @@ impl PyTokenizer {
     ///
     /// Returns:
     ///     :obj:`Dict[str, int]`: The vocabulary
-    #[args(with_added_tokens = true)]
+    #[pyo3(signature = (with_added_tokens = true))]
     #[pyo3(text_signature = "(self, with_added_tokens=True)")]
     fn get_vocab(&self, with_added_tokens: bool) -> HashMap<String, u32> {
         self.tokenizer.get_vocab(with_added_tokens)
@@ -643,7 +643,7 @@ impl PyTokenizer {
     ///
     /// Returns:
     ///     :obj:`int`: The size of the vocabulary
-    #[args(with_added_tokens = true)]
+    #[pyo3(signature = (with_added_tokens = true))]
     #[pyo3(text_signature = "(self, with_added_tokens=True)")]
     fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
         self.tokenizer.get_vocab_size(with_added_tokens)
@@ -665,7 +665,7 @@ impl PyTokenizer {
     ///
     /// direction (:obj:`str`, defaults to :obj:`right`):
     ///     Truncate direction
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (max_length, **kwargs))]
     #[pyo3(
         text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')"
     )]
@@ -767,7 +767,7 @@ impl PyTokenizer {
     /// length (:obj:`int`, `optional`):
     ///     If specified, the length at which to pad. If not specified we pad using the size of
     ///     the longest sequence in a batch.
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (**kwargs))]
     #[pyo3(
         text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"
     )]
@@ -896,7 +896,7 @@ impl PyTokenizer {
     /// Returns:
     ///     :class:`~tokenizers.Encoding`: The encoded result
     ///
-    #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")]
+    #[pyo3(signature = (sequence, pair = None, is_pretokenized = false, add_special_tokens = true))]
     #[pyo3(
         text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"
     )]
@@ -963,7 +963,7 @@ impl PyTokenizer {
     /// Returns:
     ///     A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
     ///
-    #[args(is_pretokenized = "false", add_special_tokens = "true")]
+    #[pyo3(signature = (input, is_pretokenized = false, add_special_tokens = true))]
     #[pyo3(text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)")]
     fn encode_batch(
         &self,
@@ -1006,7 +1006,7 @@ impl PyTokenizer {
     ///
     /// Returns:
     ///     :obj:`str`: The decoded string
-    #[args(skip_special_tokens = true)]
+    #[pyo3(signature = (ids, skip_special_tokens = true))]
     #[pyo3(text_signature = "(self, ids, skip_special_tokens=True)")]
     fn decode(&self, ids: Vec<u32>, skip_special_tokens: bool) -> PyResult<String> {
         ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into()
@@ -1023,7 +1023,7 @@ impl PyTokenizer {
     ///
     /// Returns:
     ///     :obj:`List[str]`: A list of decoded strings
-    #[args(skip_special_tokens = true)]
+    #[pyo3(signature = (sequences, skip_special_tokens = true))]
     #[pyo3(text_signature = "(self, sequences, skip_special_tokens=True)")]
     fn decode_batch(
         &self,
@@ -1144,7 +1144,7 @@ impl PyTokenizer {
     ///
     /// trainer (:obj:`~tokenizers.trainers.Trainer`, `optional`):
     ///     An optional trainer that should be used to train our Model
-    #[args(trainer = "None")]
+    #[pyo3(signature = (files, trainer = None))]
     #[pyo3(text_signature = "(self, files, trainer = None)")]
     fn train(&mut self, files: Vec<String>, trainer: Option<&mut PyTrainer>) -> PyResult<()> {
         let mut trainer =
@@ -1180,7 +1180,7 @@ impl PyTokenizer {
     /// length (:obj:`int`, `optional`):
     ///     The total number of sequences in the iterator. This is used to
     ///     provide meaningful progress tracking
-    #[args(trainer = "None", length = "None")]
+    #[pyo3(signature = (iterator, trainer = None, length = None))]
     #[pyo3(text_signature = "(self, iterator, trainer=None, length=None)")]
     fn train_from_iterator(
         &mut self,
@@ -1246,7 +1246,7 @@ impl PyTokenizer {
     ///
     /// Returns:
     ///     :class:`~tokenizers.Encoding`: The final post-processed encoding
-    #[args(pair = "None", add_special_tokens = true)]
+    #[pyo3(signature = (encoding, pair = None, add_special_tokens = true))]
     #[pyo3(text_signature = "(self, encoding, pair=None, add_special_tokens=True)")]
     fn post_process(
         &self,
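The cast_as to downcast replacements above and in the trainer hunks below follow pyo3 0.18 deprecating PyAny::cast_as in favour of downcast. A minimal sketch of the new call, assuming obj is any &PyAny (the helper name here is illustrative, not taken from this repository):

use pyo3::prelude::*;
use pyo3::types::PyList;

// Collect the string items of a Python list with the pyo3 0.18 downcast API.
fn list_to_strings(obj: &PyAny) -> PyResult<Vec<String>> {
    // pyo3 0.17 would have used obj.cast_as::<PyList>()?; 0.18 uses downcast.
    let list = obj.downcast::<PyList>()?;
    list.iter().map(|item| item.extract::<String>()).collect()
}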
@@ -283,7 +283,7 @@ impl PyBpeTrainer {
     }

     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (**kwargs))]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -295,7 +295,7 @@ impl PyBpeTrainer {
                     "show_progress" => builder = builder.show_progress(val.extract()?),
                     "special_tokens" => {
                         builder = builder.special_tokens(
-                            val.cast_as::<PyList>()?
+                            val.downcast::<PyList>()?
                                 .into_iter()
                                 .map(|token| {
                                     if let Ok(content) = token.extract::<String>() {
@@ -489,7 +489,7 @@ impl PyWordPieceTrainer {
     }

     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (** kwargs))]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -501,7 +501,7 @@ impl PyWordPieceTrainer {
                     "show_progress" => builder = builder.show_progress(val.extract()?),
                     "special_tokens" => {
                         builder = builder.special_tokens(
-                            val.cast_as::<PyList>()?
+                            val.downcast::<PyList>()?
                                 .into_iter()
                                 .map(|token| {
                                     if let Ok(content) = token.extract::<String>() {
@@ -629,7 +629,7 @@ impl PyWordLevelTrainer {
     }

     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (**kwargs))]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();

@@ -648,7 +648,7 @@ impl PyWordLevelTrainer {
                     }
                     "special_tokens" => {
                         builder.special_tokens(
-                            val.cast_as::<PyList>()?
+                            val.downcast::<PyList>()?
                                 .into_iter()
                                 .map(|token| {
                                     if let Ok(content) = token.extract::<String>() {
@@ -797,7 +797,7 @@ impl PyUnigramTrainer {
     }

     #[new]
-    #[args(kwargs = "**")]
+    #[pyo3(signature = (**kwargs))]
     pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
         if let Some(kwargs) = kwargs {
@@ -821,7 +821,7 @@ impl PyUnigramTrainer {
                         )
                     }
                     "special_tokens" => builder.special_tokens(
-                        val.cast_as::<PyList>()?
+                        val.downcast::<PyList>()?
                             .into_iter()
                             .map(|token| {
                                 if let Ok(content) = token.extract::<String>() {
@@ -223,7 +223,7 @@ impl PyPreTokenizedString {
     ///
     /// Returns:
     ///     An Encoding
-    #[args(type_id = "0", word_idx = "None")]
+    #[pyo3(signature = (type_id = 0, word_idx = None))]
     #[pyo3(text_signature = "(self, type_id=0, word_idx=None)")]
     fn to_encoding(&self, type_id: u32, word_idx: Option<u32>) -> PyResult<PyEncoding> {
         to_encoding(&self.pretok, type_id, word_idx)
@@ -245,10 +245,10 @@ impl PyPreTokenizedString {
     ///
     /// Returns
     ///     A list of splits
-    #[args(
-        offset_referential = "PyOffsetReferential(OffsetReferential::Original)",
-        offset_type = "PyOffsetType(OffsetType::Char)"
-    )]
+    #[pyo3(signature = (
+        offset_referential = PyOffsetReferential(OffsetReferential::Original),
+        offset_type = PyOffsetType(OffsetType::Char)
+    ))]
     #[pyo3(text_signature = "(self, offset_referential=\"original\", offset_type=\"char\")")]
     fn get_splits(
         &self,
@@ -307,17 +307,17 @@ impl PyPreTokenizedStringRefMut {
             .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)?
     }

-    #[args(type_id = "0", word_idx = "None")]
+    #[pyo3(signature = (type_id = 0, word_idx = None))]
     fn to_encoding(&self, type_id: u32, word_idx: Option<u32>) -> PyResult<PyEncoding> {
         self.inner
             .map(|pretok| to_encoding(pretok, type_id, word_idx))
             .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)?
     }

-    #[args(
-        offset_referential = "PyOffsetReferential(OffsetReferential::Original)",
-        offset_type = "PyOffsetType(OffsetType::Char)"
-    )]
+    #[pyo3(signature = (
+        offset_referential = PyOffsetReferential(OffsetReferential::Original),
+        offset_type = PyOffsetType(OffsetType::Char)
+    ))]
     fn get_splits(
         &self,
         offset_referential: PyOffsetReferential,
@@ -65,7 +65,7 @@ class TestFullDeserialization(unittest.TestCase):
         # all_models.append((model_id, filename))

         all_models = [("HueyNemud/das22-10-camembert_pretrained", "tokenizer.json")]
-        for (model_id, filename) in tqdm.tqdm(all_models):
+        for model_id, filename in tqdm.tqdm(all_models):
             tokenizer_file = cached_download(hf_hub_url(model_id, filename=filename))

             is_ok = check(tokenizer_file)