Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
@@ -9,24 +9,24 @@ name = "tokenizers"
 crate-type = ["cdylib"]

 [dependencies]
-rayon = "1.8"
+rayon = "1.10"
 serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
-env_logger = "0.10.0"
-pyo3 = { version = "0.20" }
-numpy = "0.20.0"
+env_logger = "0.11"
+pyo3 = { version = "0.21" }
+numpy = "0.21"
 ndarray = "0.15"
 onig = { version = "6.4", default-features = false }
-itertools = "0.11"
+itertools = "0.12"

 [dependencies.tokenizers]
 version = "0.16.0-dev.0"
 path = "../../tokenizers"

 [dev-dependencies]
-tempfile = "3.8"
-pyo3 = { version = "0.20", features = ["auto-initialize"] }
+tempfile = "3.10"
+pyo3 = { version = "0.21", features = ["auto-initialize"] }

 [features]
 default = ["pyo3/extension-module"]
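All of the version bumps above serve one change: pyo3 0.21 deprecates the GIL-reference API (`&PyDict`, `&PyList`, ...) in favor of the `Bound<'py, T>` smart pointer, and the hunks that follow migrate the bindings accordingly. A minimal sketch of the signature change, using a hypothetical `summarize` function that is not part of the diff:

```rust
// Sketch of the pyo3 0.21 "Bound" pattern this commit applies everywhere:
// GIL references like `&PyDict` become the `Bound<'py, T>` smart pointer.
use pyo3::prelude::*;
use pyo3::types::PyDict;

#[pyfunction]
fn summarize(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<usize> {
    // `Bound` carries the GIL lifetime, so dict methods need no extra `Python` token.
    Ok(kwargs.map(|k| k.len()).unwrap_or(0))
}
```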
@@ -1,7 +1,6 @@
 use std::sync::{Arc, RwLock};

 use crate::pre_tokenizers::from_string;
-use crate::utils::PyChar;
 use crate::utils::PyPattern;
 use pyo3::exceptions;
 use pyo3::prelude::*;

@@ -85,7 +84,7 @@ impl PyDecoder {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

@@ -161,7 +160,7 @@ pub struct PyByteLevelDec {}
 impl PyByteLevelDec {
     #[new]
     #[pyo3(signature = (**_kwargs), text_signature = "(self)")]
-    fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) {
+    fn new(_kwargs: Option<&Bound<'_, PyDict>>) -> (Self, PyDecoder) {
         (PyByteLevelDec {}, ByteLevel::default().into())
     }
 }

@@ -318,8 +317,8 @@ impl PyMetaspaceDec {
     }

     #[setter]
-    fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
-        setter!(self_, Metaspace, @set_replacement, replacement.0);
+    fn set_replacement(self_: PyRef<Self>, replacement: char) {
+        setter!(self_, Metaspace, @set_replacement, replacement);
     }

     #[getter]

@@ -352,16 +351,12 @@ impl PyMetaspaceDec {
     }

     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")]
-    fn new(
-        replacement: PyChar,
-        prepend_scheme: String,
-        split: bool,
-    ) -> PyResult<(Self, PyDecoder)> {
+    #[pyo3(signature = (replacement = '▁', prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")]
+    fn new(replacement: char, prepend_scheme: String, split: bool) -> PyResult<(Self, PyDecoder)> {
         let prepend_scheme = from_string(prepend_scheme)?;
         Ok((
             PyMetaspaceDec {},
-            Metaspace::new(replacement.0, prepend_scheme, split).into(),
+            Metaspace::new(replacement, prepend_scheme, split).into(),
         ))
     }
 }

@@ -463,7 +458,7 @@ pub struct PySequenceDecoder {}
 impl PySequenceDecoder {
     #[new]
     #[pyo3(signature = (decoders_py), text_signature = "(self, decoders)")]
-    fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> {
+    fn new(decoders_py: &Bound<'_, PyList>) -> PyResult<(Self, PyDecoder)> {
         let mut decoders: Vec<DecoderWrapper> = Vec::with_capacity(decoders_py.len());
         for decoder_py in decoders_py.iter() {
             let decoder: PyRef<PyDecoder> = decoder_py.extract()?;

@@ -476,8 +471,8 @@ impl PySequenceDecoder {
         Ok((PySequenceDecoder {}, Sequence::new(decoders).into()))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [PyList::empty(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [PyList::empty_bound(py)])
     }
 }

@@ -497,7 +492,7 @@ impl Decoder for CustomDecoder {
         Python::with_gil(|py| {
             let decoded = self
                 .inner
-                .call_method(py, "decode", (tokens,), None)?
+                .call_method_bound(py, "decode", (tokens,), None)?
                 .extract(py)?;
             Ok(decoded)
         })

@@ -507,7 +502,7 @@ impl Decoder for CustomDecoder {
         Python::with_gil(|py| {
             let decoded = self
                 .inner
-                .call_method(py, "decode_chain", (tokens,), None)?
+                .call_method_bound(py, "decode_chain", (tokens,), None)?
                 .extract(py)?;
             Ok(decoded)
         })

@@ -572,7 +567,7 @@ impl Decoder for PyDecoderWrapper {

 /// Decoders Module
 #[pymodule]
-pub fn decoders(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyDecoder>()?;
     m.add_class::<PyByteLevelDec>()?;
     m.add_class::<PyReplaceDec>()?;

@@ -602,7 +597,7 @@ mod test {
         Python::with_gil(|py| {
             let py_dec = PyDecoder::new(Metaspace::default().into());
             let py_meta = py_dec.get_as_subtype(py).unwrap();
-            assert_eq!("Metaspace", py_meta.as_ref(py).get_type().name().unwrap());
+            assert_eq!("Metaspace", py_meta.bind(py).get_type().qualname().unwrap());
         })
     }
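Two substitutions recur throughout the decoder hunks: `PyBytes::new` becomes `PyBytes::new_bound` in the pickling helpers, and `as_ref(py)` becomes `bind(py)` (with the tests switching from `name()` to `qualname()` to keep matching the bare class name). A self-contained sketch of the pickling half, with an assumed helper name `getstate_bytes`:

```rust
// Sketch (assumed helper name) of the __getstate__ pattern above: serialize to
// JSON, then wrap the bytes with the pyo3 0.21 `PyBytes::new_bound` constructor.
use pyo3::prelude::*;
use pyo3::types::PyBytes;

fn getstate_bytes(py: Python<'_>, json: &str) -> PyObject {
    // `new_bound` returns a Bound<'_, PyBytes>; `to_object` detaches it into a
    // PyObject that __getstate__ can hand back to Python.
    PyBytes::new_bound(py, json.as_bytes()).to_object(py)
}
```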
@@ -37,7 +37,7 @@ impl PyEncoding {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

@@ -391,10 +391,10 @@ impl PyEncoding {
     #[pyo3(
         text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"
     )]
-    fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
+    fn pad(&mut self, length: usize, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<()> {
         let mut pad_id = 0;
         let mut pad_type_id = 0;
-        let mut pad_token = "[PAD]";
+        let mut pad_token = "[PAD]".to_string();
         let mut direction = PaddingDirection::Right;

         if let Some(kwargs) = kwargs {

@@ -422,7 +422,7 @@ impl PyEncoding {
             }
         }
         self.encoding
-            .pad(length, pad_id, pad_type_id, pad_token, direction);
+            .pad(length, pad_id, pad_type_id, &pad_token, direction);
         Ok(())
     }
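The `pad` hunk also switches `pad_token` from `&str` to an owned `String`: with GIL references gone, a `&str` extracted from a kwargs value would borrow from a temporary `Bound` object that does not live long enough. A hedged sketch of that kwargs-reading pattern (`read_pad_token` is an assumed name):

```rust
// Sketch of reading kwargs out of a Bound dict, mirroring why `pad` now owns
// its pad_token as a String rather than borrowing a &str.
use pyo3::prelude::*;
use pyo3::types::PyDict;

fn read_pad_token(kwargs: &Bound<'_, PyDict>) -> PyResult<String> {
    let mut pad_token = "[PAD]".to_string();
    if let Some(value) = kwargs.get_item("pad_token")? {
        pad_token = value.extract()?; // extract an owned String, not a borrowed &str
    }
    Ok(pad_token)
}
```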
@@ -35,7 +35,7 @@ impl<T> ToPyResult<T> {
 }

 pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> {
-    let deprecation_warning = py.import("builtins")?.getattr("DeprecationWarning")?;
+    let deprecation_warning = py.import_bound("builtins")?.getattr("DeprecationWarning")?;
     let full_message = format!("Deprecated in {}: {}", version, message);
-    pyo3::PyErr::warn(py, deprecation_warning, &full_message, 0)
+    pyo3::PyErr::warn_bound(py, &deprecation_warning, &full_message, 0)
 }
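For reference, a compact sketch of the 0.21 warning API used here: `import_bound` yields a `Bound<PyModule>`, and `PyErr::warn_bound` takes the warning category by reference (`warn_deprecated` is an assumed name):

```rust
// Sketch of the pyo3 0.21 deprecation-warning pattern shown in the hunk above.
use pyo3::prelude::*;

fn warn_deprecated(py: Python<'_>, message: &str) -> PyResult<()> {
    let category = py.import_bound("builtins")?.getattr("DeprecationWarning")?;
    // The final argument is the stacklevel, kept at 0 as in the diff.
    pyo3::PyErr::warn_bound(py, &category, message, 0)
}
```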
@@ -47,7 +47,7 @@ extern "C" fn child_after_fork() {

 /// Tokenizers Module
 #[pymodule]
-pub fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn tokenizers(m: &Bound<'_, PyModule>) -> PyResult<()> {
     let _ = env_logger::try_init_from_env("TOKENIZERS_LOG");

     // Register the fork callback
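The `#[pymodule]` functions lose their `_py: Python` parameter because the Bound module carries the GIL token itself. A minimal sketch with a hypothetical module name:

```rust
// Sketch of the new module-init signature adopted across this commit.
use pyo3::prelude::*;

#[pymodule]
fn example(m: &Bound<'_, PyModule>) -> PyResult<()> {
    let _py = m.py(); // the GIL token is still recoverable from the Bound module
    m.add("ANSWER", 42)?;
    Ok(())
}
```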
@@ -105,7 +105,7 @@ impl PyModel {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

@@ -260,7 +260,10 @@ impl PyModel {
 pub struct PyBPE {}

 impl PyBPE {
-    fn with_builder(mut builder: BpeBuilder, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
+    fn with_builder(
+        mut builder: BpeBuilder,
+        kwargs: Option<&Bound<'_, PyDict>>,
+    ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
                 let key: &str = key.extract()?;

@@ -321,14 +324,14 @@ macro_rules! setter {
 }

 #[derive(FromPyObject)]
-enum PyVocab<'a> {
+enum PyVocab {
     Vocab(Vocab),
-    Filename(&'a str),
+    Filename(String),
 }
 #[derive(FromPyObject)]
-enum PyMerges<'a> {
+enum PyMerges {
     Merges(Merges),
-    Filename(&'a str),
+    Filename(String),
 }

 #[pymethods]

@@ -417,7 +420,7 @@ impl PyBPE {
         py: Python<'_>,
         vocab: Option<PyVocab>,
         merges: Option<PyMerges>,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<(Self, PyModel)> {
         if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) {
             return Err(exceptions::PyValueError::new_err(

@@ -502,11 +505,11 @@ impl PyBPE {
     #[pyo3(signature = (vocab, merges, **kwargs))]
     #[pyo3(text_signature = "(cls, vocab, merge, **kwargs)")]
     fn from_file(
-        _cls: &PyType,
+        _cls: &Bound<'_, PyType>,
         py: Python,
         vocab: &str,
         merges: &str,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<Py<Self>> {
         let (vocab, merges) = BPE::read_file(vocab, merges).map_err(|e| {
             exceptions::PyException::new_err(format!("Error while reading BPE files: {}", e))

@@ -540,7 +543,7 @@ pub struct PyWordPiece {}
 impl PyWordPiece {
     fn with_builder(
         mut builder: WordPieceBuilder,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {

@@ -612,7 +615,7 @@ impl PyWordPiece {
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<(Self, PyModel)> {
         let mut builder = WordPiece::builder();

@@ -677,10 +680,10 @@ impl PyWordPiece {
     #[pyo3(signature = (vocab, **kwargs))]
     #[pyo3(text_signature = "(vocab, **kwargs)")]
     fn from_file(
-        _cls: &PyType,
+        _cls: &Bound<'_, PyType>,
         py: Python,
         vocab: &str,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<Py<Self>> {
         let vocab = WordPiece::read_file(vocab).map_err(|e| {
             exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e))

@@ -796,7 +799,7 @@ impl PyWordLevel {
     #[pyo3(signature = (vocab, unk_token = None))]
     #[pyo3(text_signature = "(vocab, unk_token)")]
     fn from_file(
-        _cls: &PyType,
+        _cls: &Bound<'_, PyType>,
         py: Python,
         vocab: &str,
         unk_token: Option<String>,

@@ -849,7 +852,7 @@ impl PyUnigram {

 /// Models Module
 #[pymodule]
-pub fn models(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn models(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyModel>()?;
     m.add_class::<PyBPE>()?;
     m.add_class::<PyWordPiece>()?;

@@ -870,7 +873,7 @@ mod test {
         Python::with_gil(|py| {
             let py_model = PyModel::from(BPE::default());
             let py_bpe = py_model.get_as_subtype(py).unwrap();
-            assert_eq!("BPE", py_bpe.as_ref(py).get_type().name().unwrap());
+            assert_eq!("BPE", py_bpe.bind(py).get_type().qualname().unwrap());
         })
     }
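The `PyVocab`/`PyMerges` change above is the same borrow-lifetime issue in derive form: a `#[derive(FromPyObject)]` enum can no longer hold `&'a str` borrowed from the input object, so the `Filename` variants own a `String`. A simplified sketch (`VocabArg` is a hypothetical stand-in for the diff's enums):

```rust
// Sketch of the owned-variant pattern: each variant is tried in order, and
// owned types avoid borrowing from a temporary Bound object.
use pyo3::prelude::*;
use std::collections::HashMap;

#[derive(FromPyObject)]
enum VocabArg {
    Mapping(HashMap<String, u32>), // a dict of token -> id
    Filename(String),              // or a path to load it from
}
```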
@@ -113,7 +113,7 @@ impl PyNormalizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

@@ -345,7 +345,7 @@ pub struct PySequence {}
 impl PySequence {
     #[new]
     #[pyo3(text_signature = None)]
-    fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> {
+    fn new(normalizers: &Bound<'_, PyList>) -> PyResult<(Self, PyNormalizer)> {
         let mut sequence = Vec::with_capacity(normalizers.len());
         for n in normalizers.iter() {
             let normalizer: PyRef<PyNormalizer> = n.extract()?;

@@ -360,8 +360,8 @@ impl PySequence {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [PyList::empty(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [PyList::empty_bound(py)])
     }

     fn __len__(&self) -> usize {

@@ -467,11 +467,11 @@ pub struct PyPrecompiled {}
 impl PyPrecompiled {
     #[new]
     #[pyo3(text_signature = "(self, precompiled_charsmap)")]
-    fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
-        let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
+    fn new(precompiled_charsmap: Vec<u8>) -> PyResult<(Self, PyNormalizer)> {
+        // let precompiled_charsmap: Vec<u8> = FromPyObject::extract(py_precompiled_charsmap)?;
         Ok((
             PyPrecompiled {},
-            Precompiled::from(precompiled_charsmap)
+            Precompiled::from(&precompiled_charsmap)
                 .map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to build Precompiled normalizer: {}",

@@ -512,7 +512,7 @@ impl tk::tokenizer::Normalizer for CustomNormalizer {
     fn normalize(&self, normalized: &mut NormalizedString) -> tk::Result<()> {
         Python::with_gil(|py| {
             let normalized = PyNormalizedStringRefMut::new(normalized);
-            let py_normalized = self.inner.as_ref(py);
+            let py_normalized = self.inner.bind(py);
             py_normalized.call_method("normalize", (normalized.get(),), None)?;
             Ok(())
         })

@@ -635,7 +635,7 @@ impl Normalizer for PyNormalizerWrapper {

 /// Normalizers Module
 #[pymodule]
-pub fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn normalizers(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyNormalizer>()?;
     m.add_class::<PyBertNormalizer>()?;
     m.add_class::<PyNFD>()?;

@@ -667,7 +667,7 @@ mod test {
         Python::with_gil(|py| {
             let py_norm = PyNormalizer::new(NFC.into());
             let py_nfc = py_norm.get_as_subtype(py).unwrap();
-            assert_eq!("NFC", py_nfc.as_ref(py).get_type().name().unwrap());
+            assert_eq!("NFC", py_nfc.bind(py).get_type().qualname().unwrap());
         })
     }
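`PyPrecompiled` shows a third way to drop GIL refs: instead of accepting `&PyBytes` and extracting `&[u8]`, the constructor takes `Vec<u8>` and lets pyo3 copy the bytes during extraction. A minimal sketch with an assumed function name:

```rust
// Sketch of taking bytes by value: pyo3 extracts Python `bytes` into Vec<u8>,
// so no GIL-ref `&PyBytes` parameter is needed.
use pyo3::prelude::*;

#[pyfunction]
fn charsmap_len(precompiled_charsmap: Vec<u8>) -> usize {
    precompiled_charsmap.len()
}
```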
@@ -118,7 +118,7 @@ impl PyPreTokenizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

@@ -263,7 +263,7 @@ impl PyByteLevel {
     fn new(
         add_prefix_space: bool,
         use_regex: bool,
-        _kwargs: Option<&PyDict>,
+        _kwargs: Option<&Bound<'_, PyDict>>,
     ) -> (Self, PyPreTokenizer) {
         (
             PyByteLevel {},

@@ -352,8 +352,8 @@ impl PySplit {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [" ", "removed"])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [" ", "removed"])
     }
 }

@@ -372,21 +372,21 @@ impl PyCharDelimiterSplit {
     }

     #[setter]
-    fn set_delimiter(self_: PyRef<Self>, delimiter: PyChar) {
-        setter!(self_, Delimiter, delimiter, delimiter.0);
+    fn set_delimiter(self_: PyRef<Self>, delimiter: char) {
+        setter!(self_, Delimiter, delimiter, delimiter);
     }

     #[new]
     #[pyo3(text_signature = None)]
-    pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
+    pub fn new(delimiter: char) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyCharDelimiterSplit {},
-            CharDelimiterSplit::new(delimiter.0).into(),
+            CharDelimiterSplit::new(delimiter).into(),
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [" "])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [" "])
     }
 }

@@ -430,7 +430,7 @@ pub struct PySequence {}
 impl PySequence {
     #[new]
     #[pyo3(text_signature = "(self, pretokenizers)")]
-    fn new(pre_tokenizers: &PyList) -> PyResult<(Self, PyPreTokenizer)> {
+    fn new(pre_tokenizers: &Bound<'_, PyList>) -> PyResult<(Self, PyPreTokenizer)> {
         let mut sequence = Vec::with_capacity(pre_tokenizers.len());
         for n in pre_tokenizers.iter() {
             let pretokenizer: PyRef<PyPreTokenizer> = n.extract()?;

@@ -447,8 +447,8 @@ impl PySequence {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [PyList::empty(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [PyList::empty_bound(py)])
     }
 }

@@ -490,8 +490,8 @@ impl PyMetaspace {
     }

     #[setter]
-    fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
-        setter!(self_, Metaspace, @set_replacement, replacement.0);
+    fn set_replacement(self_: PyRef<Self>, replacement: char) {
+        setter!(self_, Metaspace, @set_replacement, replacement);
     }

     #[getter]

@@ -524,15 +524,15 @@ impl PyMetaspace {
     }

     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
+    #[pyo3(signature = (replacement = '▁', prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
     fn new(
-        replacement: PyChar,
+        replacement: char,
         prepend_scheme: String,
         split: bool,
     ) -> PyResult<(Self, PyPreTokenizer)> {
         // Create a new Metaspace instance
         let prepend_scheme = from_string(prepend_scheme)?;
-        let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme, split);
+        let new_instance: Metaspace = Metaspace::new(replacement, prepend_scheme, split);
         Ok((PyMetaspace {}, new_instance.into()))
     }
 }

@@ -599,7 +599,7 @@ impl tk::tokenizer::PreTokenizer for CustomPreTokenizer {
     fn pre_tokenize(&self, sentence: &mut PreTokenizedString) -> tk::Result<()> {
         Python::with_gil(|py| {
             let pretok = PyPreTokenizedStringRefMut::new(sentence);
-            let py_pretok = self.inner.as_ref(py);
+            let py_pretok = self.inner.bind(py);
             py_pretok.call_method("pre_tokenize", (pretok.get(),), None)?;
             Ok(())
         })

@@ -722,7 +722,7 @@ impl PreTokenizer for PyPreTokenizerWrapper {

 /// PreTokenizers Module
 #[pymodule]
-pub fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn pre_tokenizers(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyPreTokenizer>()?;
     m.add_class::<PyByteLevel>()?;
     m.add_class::<PyWhitespace>()?;

@@ -754,7 +754,7 @@ mod test {
         Python::with_gil(|py| {
             let py_norm = PyPreTokenizer::new(Whitespace {}.into());
             let py_wsp = py_norm.get_as_subtype(py).unwrap();
-            assert_eq!("Whitespace", py_wsp.as_ref(py).get_type().name().unwrap());
+            assert_eq!("Whitespace", py_wsp.bind(py).get_type().qualname().unwrap());
         })
     }
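With the `PyChar` hack deleted (see the utils/mod.rs hunk further down), constructors and setters accept `char` directly; pyo3 extracts it from a length-1 Python string. A tiny sketch with a hypothetical function:

```rust
// Sketch of a plain `char` argument, as now used by the delimiter and
// replacement setters above.
use pyo3::prelude::*;

#[pyfunction]
fn replace_spaces(text: &str, replacement: char) -> String {
    text.replace(' ', &replacement.to_string())
}
```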
@@ -78,7 +78,7 @@ impl PyPostProcessor {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

@@ -166,8 +166,8 @@ impl PyBertProcessing {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [("", 0), ("", 0)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [("", 0), ("", 0)])
     }
 }

@@ -216,8 +216,8 @@ impl PyRobertaProcessing {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [("", 0), ("", 0)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [("", 0), ("", 0)])
     }
 }

@@ -235,7 +235,10 @@ pub struct PyByteLevel {}
 impl PyByteLevel {
     #[new]
     #[pyo3(signature = (trim_offsets = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
-    fn new(trim_offsets: Option<bool>, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) {
+    fn new(
+        trim_offsets: Option<bool>,
+        _kwargs: Option<&Bound<'_, PyDict>>,
+    ) -> (Self, PyPostProcessor) {
         let mut byte_level = ByteLevel::default();

         if let Some(to) = trim_offsets {

@@ -304,7 +307,7 @@ impl FromPyObject<'_> for PyTemplate {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))
-        } else if let Ok(s) = ob.extract::<Vec<&str>>() {
+        } else if let Ok(s) = ob.extract::<Vec<String>>() {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))

@@ -424,7 +427,7 @@ pub struct PySequence {}
 impl PySequence {
     #[new]
     #[pyo3(signature = (processors_py), text_signature = "(self, processors)")]
-    fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
+    fn new(processors_py: &Bound<'_, PyList>) -> (Self, PyPostProcessor) {
         let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
         for n in processors_py.iter() {
             let processor: PyRef<PyPostProcessor> = n.extract().unwrap();

@@ -438,14 +441,14 @@ impl PySequence {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [PyList::empty(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [PyList::empty_bound(py)])
     }
 }

 /// Processors Module
 #[pymodule]
-pub fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn processors(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyPostProcessor>()?;
     m.add_class::<PyBertProcessing>()?;
     m.add_class::<PyRobertaProcessing>()?;

@@ -474,7 +477,7 @@ mod test {
             let py_bert = py_proc.get_as_subtype(py).unwrap();
             assert_eq!(
                 "BertProcessing",
-                py_bert.as_ref(py).get_type().name().unwrap()
+                py_bert.bind(py).get_type().qualname().unwrap()
             );
         })
     }
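The `__getnewargs__` methods now return an owned `Bound<'p, PyTuple>` instead of a GIL-ref `&'p PyTuple`, built with the `*_bound` constructors. A sketch of the shape (`newargs` is an assumed name):

```rust
// Sketch of the Bound-returning __getnewargs__ pattern used above.
use pyo3::prelude::*;
use pyo3::types::{PyList, PyTuple};

fn newargs<'py>(py: Python<'py>) -> Bound<'py, PyTuple> {
    // A one-element tuple holding an empty list, as in the Sequence hunks.
    PyTuple::new_bound(py, [PyList::empty_bound(py)])
}
```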
@@ -98,8 +98,8 @@ impl PyAddedToken {
         token
     }

-    pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> {
-        let dict = PyDict::new(py);
+    pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
+        let dict = PyDict::new_bound(py);
         let token = self.get_token();

         dict.set_item("content", token.content)?;

@@ -130,7 +130,7 @@ impl From<tk::AddedToken> for PyAddedToken {
 impl PyAddedToken {
     #[new]
     #[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False)")]
-    fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
+    fn __new__(content: Option<&str>, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<Self> {
         let mut token = PyAddedToken::from(content.unwrap_or(""), None);

         if let Some(kwargs) = kwargs {

@@ -150,7 +150,7 @@ impl PyAddedToken {
         Ok(token)
     }

-    fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> {
+    fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
         self.as_pydict(py)
     }

@@ -329,7 +329,7 @@ impl FromPyObject<'_> for PyArrayUnicode {
             );
             let py = ob.py();
             let obj = PyObject::from_owned_ptr(py, unicode);
-            let s = obj.downcast::<PyString>(py)?;
+            let s = obj.downcast_bound::<PyString>(py)?;
             Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
         })
         .collect::<PyResult<Vec<_>>>()?;

@@ -353,7 +353,7 @@ impl FromPyObject<'_> for PyArrayStr {
             .as_array()
             .iter()
             .map(|obj| {
-                let s = obj.downcast::<PyString>(ob.py())?;
+                let s = obj.downcast_bound::<PyString>(ob.py())?;
                 Ok(s.to_string_lossy().into_owned())
             })
             .collect::<PyResult<Vec<_>>>()?;

@@ -377,12 +377,12 @@ impl<'s> FromPyObject<'s> for PreTokenizedInputSequence<'s> {
             return Ok(Self(seq.into()));
         }
         if let Ok(s) = ob.downcast::<PyList>() {
-            if let Ok(seq) = s.extract::<Vec<&str>>() {
+            if let Ok(seq) = s.extract::<Vec<String>>() {
                 return Ok(Self(seq.into()));
             }
         }
         if let Ok(s) = ob.downcast::<PyTuple>() {
-            if let Ok(seq) = s.extract::<Vec<&str>>() {
+            if let Ok(seq) = s.extract::<Vec<String>>() {
                 return Ok(Self(seq.into()));
             }
         }

@@ -492,7 +492,7 @@ impl PyTokenizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

@@ -510,9 +510,9 @@ impl PyTokenizer {
         }
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
         let model = PyModel::from(BPE::default()).into_py(py);
-        PyTuple::new(py, vec![model])
+        PyTuple::new_bound(py, vec![model])
     }

     /// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.

@@ -557,7 +557,7 @@ impl PyTokenizer {
     ///     :class:`~tokenizers.Tokenizer`: The new tokenizer
     #[staticmethod]
     #[pyo3(text_signature = "(buffer)")]
-    fn from_buffer(buffer: &PyBytes) -> PyResult<Self> {
+    fn from_buffer(buffer: &Bound<'_, PyBytes>) -> PyResult<Self> {
         let tokenizer = serde_json::from_slice(buffer.as_bytes()).map_err(|e| {
             exceptions::PyValueError::new_err(format!(
                 "Cannot instantiate Tokenizer from buffer: {}",

@@ -591,18 +591,18 @@ impl PyTokenizer {
         auth_token: Option<String>,
     ) -> PyResult<Self> {
         let path = Python::with_gil(|py| -> PyResult<String> {
-            let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?;
+            let huggingface_hub = PyModule::import_bound(py, intern!(py, "huggingface_hub"))?;
             let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?;
             let kwargs = [
                 (intern!(py, "repo_id"), identifier),
                 (intern!(py, "filename"), "tokenizer.json"),
                 (intern!(py, "revision"), &revision),
             ]
-            .into_py_dict(py);
+            .into_py_dict_bound(py);
             if let Some(auth_token) = auth_token {
                 kwargs.set_item(intern!(py, "token"), auth_token)?;
             }
-            let path: String = hf_hub_download.call((), Some(kwargs))?.extract()?;
+            let path: String = hf_hub_download.call((), Some(&kwargs))?.extract()?;
             Ok(path)
         })?;

@@ -712,7 +712,11 @@ impl PyTokenizer {
     #[pyo3(
         text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')"
     )]
-    fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
+    fn enable_truncation(
+        &mut self,
+        max_length: usize,
+        kwargs: Option<&Bound<'_, PyDict>>,
+    ) -> PyResult<()> {
         let mut params = TruncationParams {
             max_length,
             ..Default::default()

@@ -777,9 +781,9 @@ impl PyTokenizer {
     /// (:obj:`dict`, `optional`):
     ///     A dict with the current truncation parameters if truncation is enabled
     #[getter]
-    fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
+    fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
         self.tokenizer.get_truncation().map_or(Ok(None), |params| {
-            let dict = PyDict::new(py);
+            let dict = PyDict::new_bound(py);

             dict.set_item("max_length", params.max_length)?;
             dict.set_item("stride", params.stride)?;

@@ -817,7 +821,7 @@ impl PyTokenizer {
     #[pyo3(
         text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"
     )]
-    fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
+    fn enable_padding(&mut self, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<()> {
         let mut params = PaddingParams::default();

         if let Some(kwargs) = kwargs {

@@ -887,9 +891,9 @@ impl PyTokenizer {
     /// (:obj:`dict`, `optional`):
     ///     A dict with the current padding parameters if padding is enabled
     #[getter]
-    fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
+    fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
         self.tokenizer.get_padding().map_or(Ok(None), |params| {
-            let dict = PyDict::new(py);
+            let dict = PyDict::new_bound(py);

             dict.set_item(
                 "length",

@@ -948,8 +952,8 @@ impl PyTokenizer {
     )]
     fn encode(
         &self,
-        sequence: &PyAny,
-        pair: Option<&PyAny>,
+        sequence: &Bound<'_, PyAny>,
+        pair: Option<&Bound<'_, PyAny>>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<PyEncoding> {

@@ -1141,7 +1145,7 @@ impl PyTokenizer {
     /// Returns:
     ///     :obj:`int`: The number of tokens that were created in the vocabulary
     #[pyo3(text_signature = "(self, tokens)")]
-    fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
+    fn add_tokens(&mut self, tokens: &Bound<'_, PyList>) -> PyResult<usize> {
         let tokens = tokens
             .into_iter()
             .map(|token| {

@@ -1178,7 +1182,7 @@ impl PyTokenizer {
     /// Returns:
     ///     :obj:`int`: The number of tokens that were created in the vocabulary
     #[pyo3(text_signature = "(self, tokens)")]
-    fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
+    fn add_special_tokens(&mut self, tokens: &Bound<'_, PyList>) -> PyResult<usize> {
         let tokens = tokens
             .into_iter()
             .map(|token| {

@@ -1251,7 +1255,7 @@ impl PyTokenizer {
     fn train_from_iterator(
         &mut self,
         py: Python,
-        iterator: &PyAny,
+        iterator: &Bound<'_, PyAny>,
         trainer: Option<&mut PyTrainer>,
         length: Option<usize>,
     ) -> PyResult<()> {
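`from_pretrained`'s kwargs construction moves from `into_py_dict` to `into_py_dict_bound`, and the resulting dict is passed to `call` by reference. A hedged, runnable sketch using the standard `json` module in place of `huggingface_hub`:

```rust
// Sketch of calling a Python function with Bound kwargs; `json.dumps` stands
// in for the diff's hf_hub_download call.
use pyo3::prelude::*;
use pyo3::types::IntoPyDict;

fn call_with_kwargs(py: Python<'_>) -> PyResult<String> {
    let module = PyModule::import_bound(py, "json")?;
    let dumps = module.getattr("dumps")?;
    let kwargs = [("indent", 2)].into_py_dict_bound(py);
    // `call` now takes the kwargs dict as Option<&Bound<'_, PyDict>>.
    dumps.call((vec![1, 2, 3],), Some(&kwargs))?.extract()
}
```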
@@ -2,7 +2,6 @@ use std::sync::{Arc, RwLock};

 use crate::models::PyModel;
 use crate::tokenizer::PyAddedToken;
-use crate::utils::PyChar;
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;

@@ -52,7 +51,7 @@ impl PyTrainer {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {

@@ -215,7 +214,7 @@ impl PyBpeTrainer {
     }

     #[setter]
-    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &PyList) -> PyResult<()> {
+    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &Bound<'_, PyList>) -> PyResult<()> {
         setter!(
             self_,
             BpeTrainer,

@@ -269,12 +268,12 @@ impl PyBpeTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<char>) {
         setter!(
             self_,
             BpeTrainer,
             initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet.into_iter().collect()
         );
     }

@@ -300,7 +299,7 @@ impl PyBpeTrainer {

     #[new]
     #[pyo3(signature = (**kwargs), text_signature = None)]
-    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+    pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {

@@ -429,7 +428,7 @@ impl PyWordPieceTrainer {
     }

     #[setter]
-    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &PyList) -> PyResult<()> {
+    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &Bound<'_, PyList>) -> PyResult<()> {
         setter!(
             self_,
             WordPieceTrainer,

@@ -473,12 +472,12 @@ impl PyWordPieceTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<char>) {
         setter!(
             self_,
             WordPieceTrainer,
             @set_initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet.into_iter().collect()
         );
     }

@@ -507,7 +506,7 @@ impl PyWordPieceTrainer {
         signature = (** kwargs),
         text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
     )]
-    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+    pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {

@@ -621,7 +620,7 @@ impl PyWordLevelTrainer {
     }

     #[setter]
-    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &PyList) -> PyResult<()> {
+    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &Bound<'_, PyList>) -> PyResult<()> {
         setter!(
             self_,
             WordLevelTrainer,

@@ -647,7 +646,7 @@ impl PyWordLevelTrainer {

     #[new]
     #[pyo3(signature = (**kwargs), text_signature = None)]
-    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+    pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();

         if let Some(kwargs) = kwargs {

@@ -767,7 +766,7 @@ impl PyUnigramTrainer {
     }

     #[setter]
-    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &PyList) -> PyResult<()> {
+    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &Bound<'_, PyList>) -> PyResult<()> {
         setter!(
             self_,
             UnigramTrainer,

@@ -801,12 +800,12 @@ impl PyUnigramTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<char>) {
         setter!(
             self_,
             UnigramTrainer,
             initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet.into_iter().collect()
         );
     }

@@ -815,7 +814,7 @@ impl PyUnigramTrainer {
         signature = (**kwargs),
         text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
     )]
-    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+    pub fn new(kwargs: Option<Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {

@@ -874,7 +873,7 @@ impl PyUnigramTrainer {

 /// Trainers Module
 #[pymodule]
-pub fn trainers(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn trainers(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyTrainer>()?;
     m.add_class::<PyBpeTrainer>()?;
     m.add_class::<PyWordPieceTrainer>()?;

@@ -893,7 +892,7 @@ mod tests {
         Python::with_gil(|py| {
             let py_trainer = PyTrainer::new(Arc::new(RwLock::new(BpeTrainer::default().into())));
             let py_bpe = py_trainer.get_as_subtype(py).unwrap();
-            assert_eq!("BpeTrainer", py_bpe.as_ref(py).get_type().name().unwrap());
+            assert_eq!("BpeTrainer", py_bpe.bind(py).get_type().qualname().unwrap());
         })
     }
 }
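The trainer setters take `Vec<char>` directly, so the `map(|c| c.0)` unwrapping of the old `PyChar` wrapper disappears; building the alphabet is plain std code. A trivial sketch (`to_alphabet` is an assumed name):

```rust
// Sketch of the simplified alphabet setter: plain chars collect straight into
// the set the trainers expect, with no wrapper to unwrap.
use std::collections::HashSet;

fn to_alphabet(chars: Vec<char>) -> HashSet<char> {
    chars.into_iter().collect()
}
```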
@@ -50,7 +50,7 @@ pub struct PyBufferedIterator<T, F> {

 impl<T, F, I> PyBufferedIterator<T, F>
 where
-    F: Fn(&PyAny) -> I,
+    F: Fn(Bound<'_, PyAny>) -> I,
     I: IntoIterator<Item = PyResult<T>>,
 {
     /// Create a new PyBufferedIterator using the provided Python object.

@@ -62,10 +62,10 @@ where
     ///
     /// The `buffer_size` represents the number of items that we buffer before we
     /// need to acquire the GIL again.
-    pub fn new(iter: &PyAny, converter: F, buffer_size: usize) -> PyResult<Self> {
+    pub fn new(iter: &Bound<'_, PyAny>, converter: F, buffer_size: usize) -> PyResult<Self> {
         let py = iter.py();
         let iter: Py<PyAny> = unsafe {
-            py.from_borrowed_ptr_or_err::<PyAny>(pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?
+            Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?
                 .to_object(py)
         };

@@ -89,9 +89,10 @@ where
             }

             match unsafe {
-                py.from_owned_ptr_or_opt::<PyAny>(pyo3::ffi::PyIter_Next(
-                    self.iter.as_ref().unwrap().as_ref(py).as_ptr(),
-                ))
+                Bound::from_owned_ptr_or_opt(
+                    py,
+                    pyo3::ffi::PyIter_Next(self.iter.as_ref().unwrap().bind(py).as_ptr()),
+                )
             } {
                 Some(obj) => self.buffer.extend((self.converter)(obj)),
                 None => {

@@ -112,7 +113,7 @@ where

 impl<T, F, I> Iterator for PyBufferedIterator<T, F>
 where
-    F: Fn(&PyAny) -> I,
+    F: Fn(Bound<'_, PyAny>) -> I,
     I: IntoIterator<Item = PyResult<T>>,
 {
     type Item = PyResult<T>;
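`PyBufferedIterator` keeps its unsafe ffi calls but swaps `py.from_*_ptr_*` for the `Bound::from_*_ptr_*` constructors, and the converter closure now receives `Bound<PyAny>` by value. A safe sketch of the same per-item flow (`first_item` is an assumed name):

```rust
// Safe sketch of pulling items from a Python iterable as Bound values, the
// shape the buffered iterator implements with raw ffi for batching reasons.
use pyo3::prelude::*;

fn first_item<'py>(iterable: &Bound<'py, PyAny>) -> PyResult<Option<Bound<'py, PyAny>>> {
    let mut iter = iterable.iter()?; // Bound<'py, PyIterator>
    iter.next().transpose() // Option<PyResult<_>> -> PyResult<Option<_>>
}
```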
@@ -1,6 +1,3 @@
-use pyo3::exceptions;
-use pyo3::prelude::*;
-use pyo3::types::*;
 use std::marker::PhantomData;
 use std::sync::{Arc, Mutex};


@@ -14,25 +11,6 @@ pub use normalization::*;
 pub use pretokenization::*;
 pub use regex::*;

-// PyChar
-// This type is a temporary hack to accept `char` as argument
-// To be removed once https://github.com/PyO3/pyo3/pull/1282 has been released
-pub struct PyChar(pub char);
-
-impl FromPyObject<'_> for PyChar {
-    fn extract(obj: &PyAny) -> PyResult<Self> {
-        let s = <PyString as PyTryFrom<'_>>::try_from(obj)?.to_str()?;
-        let mut iter = s.chars();
-        if let (Some(ch), None) = (iter.next(), iter.next()) {
-            Ok(Self(ch))
-        } else {
-            Err(exceptions::PyValueError::new_err(
-                "expected a string of length 1",
-            ))
-        }
-    }
-}
-
 // RefMut utils

 pub trait DestroyPtr {
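This deletion is possible because the `char` support the hack anticipated (PyO3 PR 1282, cited in the removed comment) has long since shipped: `char` implements `FromPyObject` with the same length-1 rule. A one-line sketch:

```rust
// Sketch of what replaced PyChar: extracting `char` directly enforces the
// "string of length 1" rule the deleted code implemented by hand.
use pyo3::prelude::*;

fn single_char(obj: &Bound<'_, PyAny>) -> PyResult<char> {
    obj.extract::<char>()
}
```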
@@ -9,15 +9,15 @@ use tk::pattern::Pattern;

 /// Represents a Pattern as used by `NormalizedString`
 #[derive(Clone, FromPyObject)]
-pub enum PyPattern<'p> {
+pub enum PyPattern {
     #[pyo3(annotation = "str")]
-    Str(&'p str),
+    Str(String),
     #[pyo3(annotation = "tokenizers.Regex")]
     Regex(Py<PyRegex>),
     // TODO: Add the compatibility for Fn(char) -> bool
 }

-impl Pattern for PyPattern<'_> {
+impl Pattern for PyPattern {
     fn find_matches(&self, inside: &str) -> tk::Result<Vec<(tk::Offsets, bool)>> {
         match self {
             PyPattern::Str(s) => {

@@ -35,8 +35,8 @@ impl Pattern for PyPattern<'_> {
     }
 }

-impl From<PyPattern<'_>> for tk::normalizers::replace::ReplacePattern {
-    fn from(pattern: PyPattern<'_>) -> Self {
+impl From<PyPattern> for tk::normalizers::replace::ReplacePattern {
+    fn from(pattern: PyPattern) -> Self {
         match pattern {
             PyPattern::Str(s) => Self::String(s.to_owned()),
             PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())),

@@ -44,8 +44,8 @@ impl From<PyPattern<'_>> for tk::normalizers::replace::ReplacePattern {
     }
 }

-impl From<PyPattern<'_>> for tk::pre_tokenizers::split::SplitPattern {
-    fn from(pattern: PyPattern<'_>) -> Self {
+impl From<PyPattern> for tk::pre_tokenizers::split::SplitPattern {
+    fn from(pattern: PyPattern) -> Self {
         match pattern {
             PyPattern::Str(s) => Self::String(s.to_owned()),
             PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())),

@@ -117,7 +117,7 @@ impl From<PySplitDelimiterBehavior> for SplitDelimiterBehavior {
     }
 }

-fn filter(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> {
+fn filter(normalized: &mut NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     let err = "`filter` expect a callable with the signature: `fn(char) -> bool`";

     if !func.is_callable() {

@@ -134,7 +134,7 @@ fn filter(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> {
     }
 }

-fn for_each(normalized: &NormalizedString, func: &PyAny) -> PyResult<()> {
+fn for_each(normalized: &NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     let err = "`for_each` expect a callable with the signature: `fn(char)`";

     if !func.is_callable() {

@@ -148,14 +148,14 @@ fn for_each(normalized: &NormalizedString, func: &PyAny) -> PyResult<()> {
     }
 }

-fn map(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> {
+fn map(normalized: &mut NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     let err = "`map` expect a callable with the signature: `fn(char) -> char`";

     if !func.is_callable() {
         Err(exceptions::PyTypeError::new_err(err))
     } else {
         normalized.map(|c| {
-            let c: &str = func
+            let c: String = func
                 .call1((c.to_string(),))
                 .expect(err)
                 .extract()

@@ -296,13 +296,13 @@ impl PyNormalizedString {

     /// Filter each character of the string using the given func
     #[pyo3(text_signature = "(self, func)")]
-    fn filter(&mut self, func: &PyAny) -> PyResult<()> {
+    fn filter(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         filter(&mut self.normalized, func)
     }

     /// Calls the given function for each character of the string
     #[pyo3(text_signature = "(self, func)")]
-    fn for_each(&self, func: &PyAny) -> PyResult<()> {
+    fn for_each(&self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         for_each(&self.normalized, func)
     }

@@ -311,7 +311,7 @@ impl PyNormalizedString {
     /// Replaces each character of the string using the returned value. Each
     /// returned value **must** be a str of length 1 (ie a character).
     #[pyo3(text_signature = "(self, func)")]
-    fn map(&mut self, func: &PyAny) -> PyResult<()> {
+    fn map(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         map(&mut self.normalized, func)
     }

@@ -551,21 +551,21 @@ impl PyNormalizedStringRefMut {
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)?
     }

-    fn filter(&mut self, func: &PyAny) -> PyResult<()> {
+    fn filter(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|n| filter(n, func))
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??;
         Ok(())
     }

-    fn for_each(&self, func: &PyAny) -> PyResult<()> {
+    fn for_each(&self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map(|n| for_each(n, func))
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??;
         Ok(())
     }

-    fn map(&mut self, func: &PyAny) -> PyResult<()> {
+    fn map(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|n| map(n, func))
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??;
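The normalization helpers keep their shape but take `&Bound<'_, PyAny>` callables, and `map` extracts an owned `String` because a `&str` cannot borrow from the temporary call result. A hedged sketch of that callable pattern (`apply_char_fn` is an assumed name):

```rust
// Sketch of calling a Python `fn(char) -> char` through a Bound reference,
// extracting an owned String rather than a borrowed &str.
use pyo3::prelude::*;

fn apply_char_fn(func: &Bound<'_, PyAny>, c: char) -> PyResult<char> {
    if !func.is_callable() {
        return Err(pyo3::exceptions::PyTypeError::new_err(
            "expected a callable with the signature `fn(char) -> char`",
        ));
    }
    let out: String = func.call1((c.to_string(),))?.extract()?;
    out.chars()
        .next()
        .ok_or_else(|| pyo3::exceptions::PyValueError::new_err("expected a str of length 1"))
}
```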
@@ -12,7 +12,7 @@ use crate::error::ToPyResult;
 use crate::token::PyToken;
 use tk::{OffsetReferential, OffsetType, Offsets, PreTokenizedString, Token};

-fn split(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
+fn split(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     if !func.is_callable() {
         Err(exceptions::PyTypeError::new_err(
             "`split` expect a callable with the signature: \

@@ -30,7 +30,7 @@ fn split(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
     }
 }

-fn normalize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
+fn normalize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     if !func.is_callable() {
         Err(exceptions::PyTypeError::new_err(
             "`normalize` expect a callable with the signature: \

@@ -46,7 +46,7 @@ fn normalize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
     }
 }

-fn tokenize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
+fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     if !func.is_callable() {
         Err(exceptions::PyTypeError::new_err(
             "`tokenize` expect a callable with the signature: \

@@ -183,7 +183,7 @@ impl PyPreTokenizedString {
     /// In order for the offsets to be tracked accurately, any returned `NormalizedString`
     /// should come from calling either `.split` or `.slice` on the received one.
     #[pyo3(text_signature = "(self, func)")]
-    fn split(&mut self, func: &PyAny) -> PyResult<()> {
+    fn split(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         split(&mut self.pretok, func)
     }

@@ -195,7 +195,7 @@ impl PyPreTokenizedString {
     /// does not need to return anything, just calling the methods on the provided
     /// NormalizedString allow its modification.
     #[pyo3(text_signature = "(self, func)")]
-    fn normalize(&mut self, func: &PyAny) -> PyResult<()> {
+    fn normalize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         normalize(&mut self.pretok, func)
     }

@@ -206,7 +206,7 @@ impl PyPreTokenizedString {
     /// The function used to tokenize each underlying split. This function must return
     /// a list of Token generated from the input str.
     #[pyo3(text_signature = "(self, func)")]
-    fn tokenize(&mut self, func: &PyAny) -> PyResult<()> {
+    fn tokenize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         tokenize(&mut self.pretok, func)
     }

@@ -289,19 +289,19 @@ impl PyPreTokenizedStringRefMut {

 #[pymethods]
 impl PyPreTokenizedStringRefMut {
-    fn split(&mut self, func: &PyAny) -> PyResult<()> {
+    fn split(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|pretok| split(pretok, func))
             .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)?
     }

-    fn normalize(&mut self, func: &PyAny) -> PyResult<()> {
+    fn normalize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|pretok| normalize(pretok, func))
             .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)?
     }

-    fn tokenize(&mut self, func: &PyAny) -> PyResult<()> {
+    fn tokenize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|pretok| tokenize(pretok, func))
             .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)?