Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22)
@@ -9,24 +9,24 @@ name = "tokenizers"
 crate-type = ["cdylib"]

 [dependencies]
-rayon = "1.8"
+rayon = "1.10"
 serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
-env_logger = "0.10.0"
-pyo3 = { version = "0.20" }
-numpy = "0.20.0"
+env_logger = "0.11"
+pyo3 = { version = "0.21" }
+numpy = "0.21"
 ndarray = "0.15"
 onig = { version = "6.4", default-features = false }
-itertools = "0.11"
+itertools = "0.12"

 [dependencies.tokenizers]
 version = "0.16.0-dev.0"
 path = "../../tokenizers"

 [dev-dependencies]
-tempfile = "3.8"
-pyo3 = { version = "0.20", features = ["auto-initialize"] }
+tempfile = "3.10"
+pyo3 = { version = "0.21", features = ["auto-initialize"] }

 [features]
 defaut = ["pyo3/extension-module"]
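Note: the dependency bumps above all serve the pyo3 0.20 -> 0.21 upgrade that the rest of this diff carries out. pyo3 0.21 introduces the Bound<'py, T> smart-pointer API: GIL-ref arguments like &PyDict become &Bound<'_, PyDict>, and constructors grow _bound variants. A minimal sketch of the new shape, assuming pyo3 0.21 (the function is illustrative, not part of the diff):

use pyo3::prelude::*;
use pyo3::types::PyDict;

// 0.20 would have returned &'py PyDict (a GIL-ref); 0.21 returns an
// owned, GIL-bound handle instead.
fn build_config(py: Python<'_>) -> PyResult<Bound<'_, PyDict>> {
    let dict = PyDict::new_bound(py); // was: PyDict::new(py)
    dict.set_item("vocab_size", 30_000)?;
    Ok(dict)
}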
@@ -1,7 +1,6 @@
 use std::sync::{Arc, RwLock};

 use crate::pre_tokenizers::from_string;
-use crate::utils::PyChar;
 use crate::utils::PyPattern;
 use pyo3::exceptions;
 use pyo3::prelude::*;
@@ -85,7 +84,7 @@ impl PyDecoder {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
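Note: this __getstate__ hunk is the first of many identical ones below: each wrapper pickles itself by serializing to JSON and handing the bytes to Python, and PyBytes::new becomes PyBytes::new_bound under 0.21. A sketch of the recurring pattern, assuming pyo3 0.21 and a serde-serializable value:

use pyo3::prelude::*;
use pyo3::types::PyBytes;

fn getstate<T: serde::Serialize>(py: Python<'_>, value: &T) -> PyResult<PyObject> {
    let data = serde_json::to_string(value)
        .map_err(|e| pyo3::exceptions::PyException::new_err(e.to_string()))?;
    // new_bound returns Bound<'_, PyBytes>; to_object still produces the
    // GIL-independent PyObject that __getstate__ hands back to Python.
    Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
}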
@@ -161,7 +160,7 @@ pub struct PyByteLevelDec {}
 impl PyByteLevelDec {
     #[new]
     #[pyo3(signature = (**_kwargs), text_signature = "(self)")]
-    fn new(_kwargs: Option<&PyDict>) -> (Self, PyDecoder) {
+    fn new(_kwargs: Option<&Bound<'_, PyDict>>) -> (Self, PyDecoder) {
         (PyByteLevelDec {}, ByteLevel::default().into())
     }
 }
@@ -318,8 +317,8 @@ impl PyMetaspaceDec {
     }

     #[setter]
-    fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
-        setter!(self_, Metaspace, @set_replacement, replacement.0);
+    fn set_replacement(self_: PyRef<Self>, replacement: char) {
+        setter!(self_, Metaspace, @set_replacement, replacement);
     }

     #[getter]
@@ -352,16 +351,12 @@ impl PyMetaspaceDec {
     }

     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")]
-    fn new(
-        replacement: PyChar,
-        prepend_scheme: String,
-        split: bool,
-    ) -> PyResult<(Self, PyDecoder)> {
+    #[pyo3(signature = (replacement = '▁', prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")]
+    fn new(replacement: char, prepend_scheme: String, split: bool) -> PyResult<(Self, PyDecoder)> {
         let prepend_scheme = from_string(prepend_scheme)?;
         Ok((
             PyMetaspaceDec {},
-            Metaspace::new(replacement.0, prepend_scheme, split).into(),
+            Metaspace::new(replacement, prepend_scheme, split).into(),
         ))
     }
 }
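Note: PyChar was a stopgap from before pyo3 could accept char parameters directly (see the PyO3#1282 reference where the type is deleted near the end of this diff); pyo3 now extracts char from a one-character Python string itself, so the wrapper and every `.0` projection can go. A minimal sketch, assuming pyo3 0.21:

use pyo3::prelude::*;

#[pyfunction]
fn repeat_char(c: char, n: usize) -> String {
    // pyo3 accepts a length-1 Python str here and rejects anything
    // longer, which is exactly what the hand-rolled PyChar used to do.
    std::iter::repeat(c).take(n).collect()
}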
@@ -463,7 +458,7 @@ pub struct PySequenceDecoder {}
 impl PySequenceDecoder {
     #[new]
     #[pyo3(signature = (decoders_py), text_signature = "(self, decoders)")]
-    fn new(decoders_py: &PyList) -> PyResult<(Self, PyDecoder)> {
+    fn new(decoders_py: &Bound<'_, PyList>) -> PyResult<(Self, PyDecoder)> {
         let mut decoders: Vec<DecoderWrapper> = Vec::with_capacity(decoders_py.len());
         for decoder_py in decoders_py.iter() {
             let decoder: PyRef<PyDecoder> = decoder_py.extract()?;
@@ -476,8 +471,8 @@ impl PySequenceDecoder {
         Ok((PySequenceDecoder {}, Sequence::new(decoders).into()))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [PyList::empty(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [PyList::empty_bound(py)])
     }
 }

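Note: return positions migrate the same way as arguments: &'p PyTuple becomes Bound<'p, PyTuple>, built with the _bound constructors. A sketch of the __getnewargs__ pattern used throughout, assuming pyo3 0.21:

use pyo3::prelude::*;
use pyo3::types::{PyList, PyTuple};

// __getnewargs__ supplies the arguments pickle will pass back to __new__.
fn newargs(py: Python<'_>) -> Bound<'_, PyTuple> {
    PyTuple::new_bound(py, [PyList::empty_bound(py)])
}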
@@ -497,7 +492,7 @@ impl Decoder for CustomDecoder {
         Python::with_gil(|py| {
             let decoded = self
                 .inner
-                .call_method(py, "decode", (tokens,), None)?
+                .call_method_bound(py, "decode", (tokens,), None)?
                 .extract(py)?;
             Ok(decoded)
         })
@@ -507,7 +502,7 @@ impl Decoder for CustomDecoder {
         Python::with_gil(|py| {
             let decoded = self
                 .inner
-                .call_method(py, "decode_chain", (tokens,), None)?
+                .call_method_bound(py, "decode_chain", (tokens,), None)?
                 .extract(py)?;
             Ok(decoded)
         })
@@ -572,7 +567,7 @@ impl Decoder for PyDecoderWrapper {

 /// Decoders Module
 #[pymodule]
-pub fn decoders(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyDecoder>()?;
     m.add_class::<PyByteLevelDec>()?;
     m.add_class::<PyReplaceDec>()?;
@@ -602,7 +597,7 @@ mod test {
         Python::with_gil(|py| {
             let py_dec = PyDecoder::new(Metaspace::default().into());
             let py_meta = py_dec.get_as_subtype(py).unwrap();
-            assert_eq!("Metaspace", py_meta.as_ref(py).get_type().name().unwrap());
+            assert_eq!("Metaspace", py_meta.bind(py).get_type().qualname().unwrap());
         })
     }

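Note: two more recurring renames appear here: on a stored Py<T>, the GIL-ref view as_ref(py) becomes bind(py), yielding &Bound<'py, T>; and the tests switch from name() to qualname() so the assertion keeps matching the bare class name. A sketch, assuming pyo3 0.21:

use pyo3::prelude::*;

fn class_name(obj: &Py<PyAny>) -> PyResult<String> {
    Python::with_gil(|py| {
        // 0.20: obj.as_ref(py).get_type().name()
        obj.bind(py).get_type().qualname()
    })
}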
@@ -37,7 +37,7 @@ impl PyEncoding {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -391,10 +391,10 @@ impl PyEncoding {
     #[pyo3(
         text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"
     )]
-    fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
+    fn pad(&mut self, length: usize, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<()> {
         let mut pad_id = 0;
         let mut pad_type_id = 0;
-        let mut pad_token = "[PAD]";
+        let mut pad_token = "[PAD]".to_string();
         let mut direction = PaddingDirection::Right;

         if let Some(kwargs) = kwargs {
@@ -422,7 +422,7 @@ impl PyEncoding {
             }
         }
         self.encoding
-            .pad(length, pad_id, pad_type_id, pad_token, direction);
+            .pad(length, pad_id, pad_type_id, &pad_token, direction);
         Ok(())
     }

@@ -35,7 +35,7 @@ impl<T> ToPyResult<T> {
 }

 pub(crate) fn deprecation_warning(py: Python<'_>, version: &str, message: &str) -> PyResult<()> {
-    let deprecation_warning = py.import("builtins")?.getattr("DeprecationWarning")?;
+    let deprecation_warning = py.import_bound("builtins")?.getattr("DeprecationWarning")?;
     let full_message = format!("Deprecated in {}: {}", version, message);
-    pyo3::PyErr::warn(py, deprecation_warning, &full_message, 0)
+    pyo3::PyErr::warn_bound(py, &deprecation_warning, &full_message, 0)
 }
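Note: the pad_token change is a consequence of the Bound API rather than a rename: a &str extracted out of a kwargs entry would borrow from a temporary Bound value, so the code owns a String and passes &pad_token down. A sketch of the kwargs handling, assuming pyo3 0.21 (the helper is illustrative; the key name matches the diff):

use pyo3::prelude::*;
use pyo3::types::PyDict;

fn read_pad_token(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<String> {
    let mut pad_token = "[PAD]".to_string(); // owned default
    if let Some(kwargs) = kwargs {
        if let Some(value) = kwargs.get_item("pad_token")? {
            // extract::<String>() copies out of the temporary item.
            pad_token = value.extract()?;
        }
    }
    Ok(pad_token)
}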
@@ -47,7 +47,7 @@ extern "C" fn child_after_fork() {

 /// Tokenizers Module
 #[pymodule]
-pub fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn tokenizers(m: &Bound<'_, PyModule>) -> PyResult<()> {
     let _ = env_logger::try_init_from_env("TOKENIZERS_LOG");

     // Register the fork callback
@@ -105,7 +105,7 @@ impl PyModel {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -260,7 +260,10 @@ impl PyModel {
 pub struct PyBPE {}

 impl PyBPE {
-    fn with_builder(mut builder: BpeBuilder, kwargs: Option<&PyDict>) -> PyResult<(Self, PyModel)> {
+    fn with_builder(
+        mut builder: BpeBuilder,
+        kwargs: Option<&Bound<'_, PyDict>>,
+    ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
                 let key: &str = key.extract()?;
@@ -321,14 +324,14 @@ macro_rules! setter {
 }

 #[derive(FromPyObject)]
-enum PyVocab<'a> {
+enum PyVocab {
     Vocab(Vocab),
-    Filename(&'a str),
+    Filename(String),
 }
 #[derive(FromPyObject)]
-enum PyMerges<'a> {
+enum PyMerges {
     Merges(Merges),
-    Filename(&'a str),
+    Filename(String),
 }

 #[pymethods]
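Note: the dropped 'a lifetime is the same story in a derive: Filename(&'a str) borrowed from a GIL-ref, which the Bound API no longer hands out with that lifetime, so the variant now extracts an owned String. A sketch, assuming pyo3 0.21, with a stand-in for the crate's Vocab alias:

use pyo3::prelude::*;
use std::collections::HashMap;

type Vocab = HashMap<String, u32>; // stand-in for the crate's alias

// The derive tries variants in order: a dict extracts as Vocab, and a
// Python str falls through to the owned Filename variant.
#[derive(FromPyObject)]
enum PyVocab {
    Vocab(Vocab),
    Filename(String),
}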
@@ -417,7 +420,7 @@ impl PyBPE {
         py: Python<'_>,
         vocab: Option<PyVocab>,
         merges: Option<PyMerges>,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<(Self, PyModel)> {
         if (vocab.is_some() && merges.is_none()) || (vocab.is_none() && merges.is_some()) {
             return Err(exceptions::PyValueError::new_err(
@@ -502,11 +505,11 @@ impl PyBPE {
     #[pyo3(signature = (vocab, merges, **kwargs))]
     #[pyo3(text_signature = "(cls, vocab, merge, **kwargs)")]
     fn from_file(
-        _cls: &PyType,
+        _cls: &Bound<'_, PyType>,
         py: Python,
         vocab: &str,
         merges: &str,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<Py<Self>> {
         let (vocab, merges) = BPE::read_file(vocab, merges).map_err(|e| {
             exceptions::PyException::new_err(format!("Error while reading BPE files: {}", e))
@@ -540,7 +543,7 @@ pub struct PyWordPiece {}
 impl PyWordPiece {
     fn with_builder(
         mut builder: WordPieceBuilder,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
@@ -612,7 +615,7 @@ impl PyWordPiece {
     fn new(
         py: Python<'_>,
         vocab: Option<PyVocab>,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<(Self, PyModel)> {
         let mut builder = WordPiece::builder();

@@ -677,10 +680,10 @@ impl PyWordPiece {
     #[pyo3(signature = (vocab, **kwargs))]
     #[pyo3(text_signature = "(vocab, **kwargs)")]
     fn from_file(
-        _cls: &PyType,
+        _cls: &Bound<'_, PyType>,
         py: Python,
         vocab: &str,
-        kwargs: Option<&PyDict>,
+        kwargs: Option<&Bound<'_, PyDict>>,
     ) -> PyResult<Py<Self>> {
         let vocab = WordPiece::read_file(vocab).map_err(|e| {
             exceptions::PyException::new_err(format!("Error while reading WordPiece file: {}", e))
@@ -796,7 +799,7 @@ impl PyWordLevel {
     #[pyo3(signature = (vocab, unk_token = None))]
     #[pyo3(text_signature = "(vocab, unk_token)")]
     fn from_file(
-        _cls: &PyType,
+        _cls: &Bound<'_, PyType>,
         py: Python,
         vocab: &str,
         unk_token: Option<String>,
@@ -849,7 +852,7 @@ impl PyUnigram {

 /// Models Module
 #[pymodule]
-pub fn models(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn models(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyModel>()?;
     m.add_class::<PyBPE>()?;
     m.add_class::<PyWordPiece>()?;
@@ -870,7 +873,7 @@ mod test {
         Python::with_gil(|py| {
             let py_model = PyModel::from(BPE::default());
             let py_bpe = py_model.get_as_subtype(py).unwrap();
-            assert_eq!("BPE", py_bpe.as_ref(py).get_type().name().unwrap());
+            assert_eq!("BPE", py_bpe.bind(py).get_type().qualname().unwrap());
         })
     }

@@ -113,7 +113,7 @@ impl PyNormalizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -345,7 +345,7 @@ pub struct PySequence {}
 impl PySequence {
     #[new]
     #[pyo3(text_signature = None)]
-    fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> {
+    fn new(normalizers: &Bound<'_, PyList>) -> PyResult<(Self, PyNormalizer)> {
         let mut sequence = Vec::with_capacity(normalizers.len());
         for n in normalizers.iter() {
             let normalizer: PyRef<PyNormalizer> = n.extract()?;
@@ -360,8 +360,8 @@ impl PySequence {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [PyList::empty(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [PyList::empty_bound(py)])
     }

     fn __len__(&self) -> usize {
@@ -467,11 +467,11 @@ pub struct PyPrecompiled {}
 impl PyPrecompiled {
     #[new]
     #[pyo3(text_signature = "(self, precompiled_charsmap)")]
-    fn new(py_precompiled_charsmap: &PyBytes) -> PyResult<(Self, PyNormalizer)> {
-        let precompiled_charsmap: &[u8] = FromPyObject::extract(py_precompiled_charsmap)?;
+    fn new(precompiled_charsmap: Vec<u8>) -> PyResult<(Self, PyNormalizer)> {
+        // let precompiled_charsmap: Vec<u8> = FromPyObject::extract(py_precompiled_charsmap)?;
         Ok((
             PyPrecompiled {},
-            Precompiled::from(precompiled_charsmap)
+            Precompiled::from(&precompiled_charsmap)
                 .map_err(|e| {
                     exceptions::PyException::new_err(format!(
                         "Error while attempting to build Precompiled normalizer: {}",
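Note: instead of taking &PyBytes and extracting by hand, the constructor now declares Vec<u8> and lets the generated glue do the conversion; the old manual line survives only as the commented-out reminder visible in the hunk. A minimal sketch, assuming pyo3 0.21:

use pyo3::prelude::*;

#[pyfunction]
fn charsmap_len(precompiled_charsmap: Vec<u8>) -> usize {
    // pyo3 copies the Python bytes into an owned Vec<u8> before the body
    // runs, so no GIL-ref needs to outlive the call.
    precompiled_charsmap.len()
}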
@@ -512,7 +512,7 @@ impl tk::tokenizer::Normalizer for CustomNormalizer {
     fn normalize(&self, normalized: &mut NormalizedString) -> tk::Result<()> {
         Python::with_gil(|py| {
             let normalized = PyNormalizedStringRefMut::new(normalized);
-            let py_normalized = self.inner.as_ref(py);
+            let py_normalized = self.inner.bind(py);
             py_normalized.call_method("normalize", (normalized.get(),), None)?;
             Ok(())
         })
@@ -635,7 +635,7 @@ impl Normalizer for PyNormalizerWrapper {

 /// Normalizers Module
 #[pymodule]
-pub fn normalizers(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn normalizers(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyNormalizer>()?;
     m.add_class::<PyBertNormalizer>()?;
     m.add_class::<PyNFD>()?;
@@ -667,7 +667,7 @@ mod test {
         Python::with_gil(|py| {
             let py_norm = PyNormalizer::new(NFC.into());
             let py_nfc = py_norm.get_as_subtype(py).unwrap();
-            assert_eq!("NFC", py_nfc.as_ref(py).get_type().name().unwrap());
+            assert_eq!("NFC", py_nfc.bind(py).get_type().qualname().unwrap());
         })
     }

@@ -118,7 +118,7 @@ impl PyPreTokenizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -263,7 +263,7 @@ impl PyByteLevel {
     fn new(
         add_prefix_space: bool,
         use_regex: bool,
-        _kwargs: Option<&PyDict>,
+        _kwargs: Option<&Bound<'_, PyDict>>,
     ) -> (Self, PyPreTokenizer) {
         (
             PyByteLevel {},
@@ -352,8 +352,8 @@ impl PySplit {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [" ", "removed"])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [" ", "removed"])
     }
 }

@@ -372,21 +372,21 @@ impl PyCharDelimiterSplit {
     }

     #[setter]
-    fn set_delimiter(self_: PyRef<Self>, delimiter: PyChar) {
-        setter!(self_, Delimiter, delimiter, delimiter.0);
+    fn set_delimiter(self_: PyRef<Self>, delimiter: char) {
+        setter!(self_, Delimiter, delimiter, delimiter);
     }

     #[new]
     #[pyo3(text_signature = None)]
-    pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
+    pub fn new(delimiter: char) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyCharDelimiterSplit {},
-            CharDelimiterSplit::new(delimiter.0).into(),
+            CharDelimiterSplit::new(delimiter).into(),
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [" "])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [" "])
     }
 }

@@ -430,7 +430,7 @@ pub struct PySequence {}
 impl PySequence {
     #[new]
     #[pyo3(text_signature = "(self, pretokenizers)")]
-    fn new(pre_tokenizers: &PyList) -> PyResult<(Self, PyPreTokenizer)> {
+    fn new(pre_tokenizers: &Bound<'_, PyList>) -> PyResult<(Self, PyPreTokenizer)> {
         let mut sequence = Vec::with_capacity(pre_tokenizers.len());
         for n in pre_tokenizers.iter() {
             let pretokenizer: PyRef<PyPreTokenizer> = n.extract()?;
@@ -447,8 +447,8 @@ impl PySequence {
         ))
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [PyList::empty(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [PyList::empty_bound(py)])
     }
 }

@@ -490,8 +490,8 @@ impl PyMetaspace {
     }

     #[setter]
-    fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
-        setter!(self_, Metaspace, @set_replacement, replacement.0);
+    fn set_replacement(self_: PyRef<Self>, replacement: char) {
+        setter!(self_, Metaspace, @set_replacement, replacement);
     }

     #[getter]
@@ -524,15 +524,15 @@ impl PyMetaspace {
     }

     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
+    #[pyo3(signature = (replacement = '▁', prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
     fn new(
-        replacement: PyChar,
+        replacement: char,
         prepend_scheme: String,
         split: bool,
     ) -> PyResult<(Self, PyPreTokenizer)> {
         // Create a new Metaspace instance
         let prepend_scheme = from_string(prepend_scheme)?;
-        let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme, split);
+        let new_instance: Metaspace = Metaspace::new(replacement, prepend_scheme, split);
         Ok((PyMetaspace {}, new_instance.into()))
     }
 }
@@ -599,7 +599,7 @@ impl tk::tokenizer::PreTokenizer for CustomPreTokenizer {
     fn pre_tokenize(&self, sentence: &mut PreTokenizedString) -> tk::Result<()> {
         Python::with_gil(|py| {
             let pretok = PyPreTokenizedStringRefMut::new(sentence);
-            let py_pretok = self.inner.as_ref(py);
+            let py_pretok = self.inner.bind(py);
             py_pretok.call_method("pre_tokenize", (pretok.get(),), None)?;
             Ok(())
         })
@@ -722,7 +722,7 @@ impl PreTokenizer for PyPreTokenizerWrapper {

 /// PreTokenizers Module
 #[pymodule]
-pub fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn pre_tokenizers(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyPreTokenizer>()?;
     m.add_class::<PyByteLevel>()?;
     m.add_class::<PyWhitespace>()?;
@@ -754,7 +754,7 @@ mod test {
         Python::with_gil(|py| {
             let py_norm = PyPreTokenizer::new(Whitespace {}.into());
             let py_wsp = py_norm.get_as_subtype(py).unwrap();
-            assert_eq!("Whitespace", py_wsp.as_ref(py).get_type().name().unwrap());
+            assert_eq!("Whitespace", py_wsp.bind(py).get_type().qualname().unwrap());
         })
     }

@@ -78,7 +78,7 @@ impl PyPostProcessor {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -166,8 +166,8 @@ impl PyBertProcessing {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [("", 0), ("", 0)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [("", 0), ("", 0)])
     }
 }

@@ -216,8 +216,8 @@ impl PyRobertaProcessing {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [("", 0), ("", 0)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [("", 0), ("", 0)])
     }
 }

@@ -235,7 +235,10 @@ pub struct PyByteLevel {}
 impl PyByteLevel {
     #[new]
     #[pyo3(signature = (trim_offsets = None, **_kwargs), text_signature = "(self, trim_offsets=True)")]
-    fn new(trim_offsets: Option<bool>, _kwargs: Option<&PyDict>) -> (Self, PyPostProcessor) {
+    fn new(
+        trim_offsets: Option<bool>,
+        _kwargs: Option<&Bound<'_, PyDict>>,
+    ) -> (Self, PyPostProcessor) {
         let mut byte_level = ByteLevel::default();

         if let Some(to) = trim_offsets {
@@ -304,7 +307,7 @@ impl FromPyObject<'_> for PyTemplate {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))
-        } else if let Ok(s) = ob.extract::<Vec<&str>>() {
+        } else if let Ok(s) = ob.extract::<Vec<String>>() {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))
@@ -424,7 +427,7 @@ pub struct PySequence {}
 impl PySequence {
     #[new]
     #[pyo3(signature = (processors_py), text_signature = "(self, processors)")]
-    fn new(processors_py: &PyList) -> (Self, PyPostProcessor) {
+    fn new(processors_py: &Bound<'_, PyList>) -> (Self, PyPostProcessor) {
         let mut processors: Vec<PostProcessorWrapper> = Vec::with_capacity(processors_py.len());
         for n in processors_py.iter() {
             let processor: PyRef<PyPostProcessor> = n.extract().unwrap();
@@ -438,14 +441,14 @@ impl PySequence {
         )
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, [PyList::empty(py)])
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
+        PyTuple::new_bound(py, [PyList::empty_bound(py)])
     }
 }

 /// Processors Module
 #[pymodule]
-pub fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn processors(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyPostProcessor>()?;
     m.add_class::<PyBertProcessing>()?;
     m.add_class::<PyRobertaProcessing>()?;
@@ -474,7 +477,7 @@ mod test {
             let py_bert = py_proc.get_as_subtype(py).unwrap();
             assert_eq!(
                 "BertProcessing",
-                py_bert.as_ref(py).get_type().name().unwrap()
+                py_bert.bind(py).get_type().qualname().unwrap()
             );
         })
     }
@@ -98,8 +98,8 @@ impl PyAddedToken {
         token
     }

-    pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> {
-        let dict = PyDict::new(py);
+    pub fn as_pydict<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
+        let dict = PyDict::new_bound(py);
         let token = self.get_token();

         dict.set_item("content", token.content)?;
@@ -130,7 +130,7 @@ impl From<tk::AddedToken> for PyAddedToken {
 impl PyAddedToken {
     #[new]
     #[pyo3(signature = (content=None, **kwargs), text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False)")]
-    fn __new__(content: Option<&str>, kwargs: Option<&PyDict>) -> PyResult<Self> {
+    fn __new__(content: Option<&str>, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<Self> {
         let mut token = PyAddedToken::from(content.unwrap_or(""), None);

         if let Some(kwargs) = kwargs {
@@ -150,7 +150,7 @@ impl PyAddedToken {
         Ok(token)
     }

-    fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<&'py PyDict> {
+    fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
         self.as_pydict(py)
     }

@@ -329,7 +329,7 @@ impl FromPyObject<'_> for PyArrayUnicode {
                 );
                 let py = ob.py();
                 let obj = PyObject::from_owned_ptr(py, unicode);
-                let s = obj.downcast::<PyString>(py)?;
+                let s = obj.downcast_bound::<PyString>(py)?;
                 Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
             })
             .collect::<PyResult<Vec<_>>>()?;
@@ -353,7 +353,7 @@ impl FromPyObject<'_> for PyArrayStr {
             .as_array()
             .iter()
             .map(|obj| {
-                let s = obj.downcast::<PyString>(ob.py())?;
+                let s = obj.downcast_bound::<PyString>(ob.py())?;
                 Ok(s.to_string_lossy().into_owned())
             })
             .collect::<PyResult<Vec<_>>>()?;
@@ -377,12 +377,12 @@ impl<'s> FromPyObject<'s> for PreTokenizedInputSequence<'s> {
             return Ok(Self(seq.into()));
         }
         if let Ok(s) = ob.downcast::<PyList>() {
-            if let Ok(seq) = s.extract::<Vec<&str>>() {
+            if let Ok(seq) = s.extract::<Vec<String>>() {
                 return Ok(Self(seq.into()));
             }
         }
         if let Ok(s) = ob.downcast::<PyTuple>() {
-            if let Ok(seq) = s.extract::<Vec<&str>>() {
+            if let Ok(seq) = s.extract::<Vec<String>>() {
                 return Ok(Self(seq.into()));
             }
         }
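Note: Vec<&str> could borrow every element from a GIL-ref list, but under the Bound API those borrows have no owner to outlive, so the extractions switch to owned Vec<String>. A sketch, assuming pyo3 0.21:

use pyo3::prelude::*;
use pyo3::types::PyList;

fn collect_strings(list: &Bound<'_, PyList>) -> PyResult<Vec<String>> {
    // Copies each element out; extracting Vec<&str> would try to borrow
    // from temporaries that are dropped as the iteration advances.
    list.extract::<Vec<String>>()
}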
@@ -492,7 +492,7 @@ impl PyTokenizer {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -510,9 +510,9 @@ impl PyTokenizer {
         }
     }

-    fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> Bound<'p, PyTuple> {
         let model = PyModel::from(BPE::default()).into_py(py);
-        PyTuple::new(py, vec![model])
+        PyTuple::new_bound(py, vec![model])
     }

     /// Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string.
@@ -557,7 +557,7 @@ impl PyTokenizer {
     ///     :class:`~tokenizers.Tokenizer`: The new tokenizer
     #[staticmethod]
     #[pyo3(text_signature = "(buffer)")]
-    fn from_buffer(buffer: &PyBytes) -> PyResult<Self> {
+    fn from_buffer(buffer: &Bound<'_, PyBytes>) -> PyResult<Self> {
         let tokenizer = serde_json::from_slice(buffer.as_bytes()).map_err(|e| {
             exceptions::PyValueError::new_err(format!(
                 "Cannot instantiate Tokenizer from buffer: {}",
@@ -591,18 +591,18 @@ impl PyTokenizer {
         auth_token: Option<String>,
     ) -> PyResult<Self> {
         let path = Python::with_gil(|py| -> PyResult<String> {
-            let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?;
+            let huggingface_hub = PyModule::import_bound(py, intern!(py, "huggingface_hub"))?;
             let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?;
             let kwargs = [
                 (intern!(py, "repo_id"), identifier),
                 (intern!(py, "filename"), "tokenizer.json"),
                 (intern!(py, "revision"), &revision),
             ]
-            .into_py_dict(py);
+            .into_py_dict_bound(py);
             if let Some(auth_token) = auth_token {
                 kwargs.set_item(intern!(py, "token"), auth_token)?;
             }
-            let path: String = hf_hub_download.call((), Some(kwargs))?.extract()?;
+            let path: String = hf_hub_download.call((), Some(&kwargs))?.extract()?;
             Ok(path)
         })?;

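Note: calling back into Python follows the same scheme: PyModule::import becomes import_bound, IntoPyDict::into_py_dict becomes into_py_dict_bound, and call() now takes its kwargs as Option<&Bound<'_, PyDict>>. A condensed sketch of the hub-download call above, assuming pyo3 0.21 and that huggingface_hub is importable:

use pyo3::prelude::*;
use pyo3::types::IntoPyDict;

fn hub_download(repo_id: &str, revision: &str) -> PyResult<String> {
    Python::with_gil(|py| {
        let hub = PyModule::import_bound(py, "huggingface_hub")?;
        let kwargs = [
            ("repo_id", repo_id),
            ("filename", "tokenizer.json"),
            ("revision", revision),
        ]
        .into_py_dict_bound(py);
        // kwargs is a Bound<PyDict>; call() borrows it.
        hub.getattr("hf_hub_download")?.call((), Some(&kwargs))?.extract()
    })
}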
@@ -712,7 +712,11 @@ impl PyTokenizer {
     #[pyo3(
         text_signature = "(self, max_length, stride=0, strategy='longest_first', direction='right')"
     )]
-    fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> {
+    fn enable_truncation(
+        &mut self,
+        max_length: usize,
+        kwargs: Option<&Bound<'_, PyDict>>,
+    ) -> PyResult<()> {
         let mut params = TruncationParams {
             max_length,
             ..Default::default()
@@ -777,9 +781,9 @@ impl PyTokenizer {
     ///     (:obj:`dict`, `optional`):
     ///         A dict with the current truncation parameters if truncation is enabled
     #[getter]
-    fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
+    fn get_truncation<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
         self.tokenizer.get_truncation().map_or(Ok(None), |params| {
-            let dict = PyDict::new(py);
+            let dict = PyDict::new_bound(py);

             dict.set_item("max_length", params.max_length)?;
             dict.set_item("stride", params.stride)?;
@@ -817,7 +821,7 @@ impl PyTokenizer {
     #[pyo3(
         text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"
     )]
-    fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> {
+    fn enable_padding(&mut self, kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<()> {
         let mut params = PaddingParams::default();

         if let Some(kwargs) = kwargs {
@@ -887,9 +891,9 @@ impl PyTokenizer {
     ///     (:obj:`dict`, `optional`):
     ///         A dict with the current padding parameters if padding is enabled
     #[getter]
-    fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<&'py PyDict>> {
+    fn get_padding<'py>(&self, py: Python<'py>) -> PyResult<Option<Bound<'py, PyDict>>> {
         self.tokenizer.get_padding().map_or(Ok(None), |params| {
-            let dict = PyDict::new(py);
+            let dict = PyDict::new_bound(py);

             dict.set_item(
                 "length",
@@ -948,8 +952,8 @@ impl PyTokenizer {
     )]
     fn encode(
         &self,
-        sequence: &PyAny,
-        pair: Option<&PyAny>,
+        sequence: &Bound<'_, PyAny>,
+        pair: Option<&Bound<'_, PyAny>>,
         is_pretokenized: bool,
         add_special_tokens: bool,
     ) -> PyResult<PyEncoding> {
@@ -1141,7 +1145,7 @@ impl PyTokenizer {
     /// Returns:
     ///     :obj:`int`: The number of tokens that were created in the vocabulary
     #[pyo3(text_signature = "(self, tokens)")]
-    fn add_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
+    fn add_tokens(&mut self, tokens: &Bound<'_, PyList>) -> PyResult<usize> {
         let tokens = tokens
             .into_iter()
             .map(|token| {
@@ -1178,7 +1182,7 @@ impl PyTokenizer {
     /// Returns:
     ///     :obj:`int`: The number of tokens that were created in the vocabulary
     #[pyo3(text_signature = "(self, tokens)")]
-    fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult<usize> {
+    fn add_special_tokens(&mut self, tokens: &Bound<'_, PyList>) -> PyResult<usize> {
         let tokens = tokens
             .into_iter()
             .map(|token| {
@@ -1251,7 +1255,7 @@ impl PyTokenizer {
     fn train_from_iterator(
         &mut self,
         py: Python,
-        iterator: &PyAny,
+        iterator: &Bound<'_, PyAny>,
         trainer: Option<&mut PyTrainer>,
         length: Option<usize>,
     ) -> PyResult<()> {
@@ -2,7 +2,6 @@ use std::sync::{Arc, RwLock};

 use crate::models::PyModel;
 use crate::tokenizer::PyAddedToken;
-use crate::utils::PyChar;
 use pyo3::exceptions;
 use pyo3::prelude::*;
 use pyo3::types::*;
@@ -52,7 +51,7 @@ impl PyTrainer {
                 e
             ))
         })?;
-        Ok(PyBytes::new(py, data.as_bytes()).to_object(py))
+        Ok(PyBytes::new_bound(py, data.as_bytes()).to_object(py))
     }

     fn __setstate__(&mut self, py: Python, state: PyObject) -> PyResult<()> {
@@ -215,7 +214,7 @@ impl PyBpeTrainer {
     }

     #[setter]
-    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &PyList) -> PyResult<()> {
+    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &Bound<'_, PyList>) -> PyResult<()> {
         setter!(
             self_,
             BpeTrainer,
@@ -269,12 +268,12 @@ impl PyBpeTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<char>) {
         setter!(
             self_,
             BpeTrainer,
             initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet.into_iter().collect()
         );
     }

@@ -300,7 +299,7 @@ impl PyBpeTrainer {

     #[new]
     #[pyo3(signature = (**kwargs), text_signature = None)]
-    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+    pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
@@ -429,7 +428,7 @@ impl PyWordPieceTrainer {
     }

     #[setter]
-    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &PyList) -> PyResult<()> {
+    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &Bound<'_, PyList>) -> PyResult<()> {
         setter!(
             self_,
             WordPieceTrainer,
@@ -473,12 +472,12 @@ impl PyWordPieceTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<char>) {
         setter!(
             self_,
             WordPieceTrainer,
             @set_initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet.into_iter().collect()
         );
     }

@@ -507,7 +506,7 @@ impl PyWordPieceTrainer {
         signature = (** kwargs),
         text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"
     )]
-    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+    pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
@@ -621,7 +620,7 @@ impl PyWordLevelTrainer {
     }

     #[setter]
-    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &PyList) -> PyResult<()> {
+    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &Bound<'_, PyList>) -> PyResult<()> {
         setter!(
             self_,
             WordLevelTrainer,
@@ -647,7 +646,7 @@ impl PyWordLevelTrainer {

     #[new]
     #[pyo3(signature = (**kwargs), text_signature = None)]
-    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+    pub fn new(kwargs: Option<&Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::wordlevel::WordLevelTrainer::builder();

         if let Some(kwargs) = kwargs {
@@ -767,7 +766,7 @@ impl PyUnigramTrainer {
     }

     #[setter]
-    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &PyList) -> PyResult<()> {
+    fn set_special_tokens(self_: PyRef<Self>, special_tokens: &Bound<'_, PyList>) -> PyResult<()> {
         setter!(
             self_,
             UnigramTrainer,
@@ -801,12 +800,12 @@ impl PyUnigramTrainer {
     }

     #[setter]
-    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<PyChar>) {
+    fn set_initial_alphabet(self_: PyRef<Self>, alphabet: Vec<char>) {
         setter!(
             self_,
             UnigramTrainer,
             initial_alphabet,
-            alphabet.into_iter().map(|c| c.0).collect()
+            alphabet.into_iter().collect()
         );
     }

@@ -815,7 +814,7 @@ impl PyUnigramTrainer {
         signature = (**kwargs),
         text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens=[], shrinking_factor=0.75, unk_token=None, max_piece_length=16, n_sub_iterations=2)"
     )]
-    pub fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyTrainer)> {
+    pub fn new(kwargs: Option<Bound<'_, PyDict>>) -> PyResult<(Self, PyTrainer)> {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
@@ -874,7 +873,7 @@ impl PyUnigramTrainer {

 /// Trainers Module
 #[pymodule]
-pub fn trainers(_py: Python, m: &PyModule) -> PyResult<()> {
+pub fn trainers(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<PyTrainer>()?;
     m.add_class::<PyBpeTrainer>()?;
     m.add_class::<PyWordPieceTrainer>()?;
@@ -893,7 +892,7 @@ mod tests {
         Python::with_gil(|py| {
             let py_trainer = PyTrainer::new(Arc::new(RwLock::new(BpeTrainer::default().into())));
             let py_bpe = py_trainer.get_as_subtype(py).unwrap();
-            assert_eq!("BpeTrainer", py_bpe.as_ref(py).get_type().name().unwrap());
+            assert_eq!("BpeTrainer", py_bpe.bind(py).get_type().qualname().unwrap());
         })
     }
 }
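Note: every submodule registration function in this diff (decoders, models, normalizers, pre_tokenizers, processors, tokenizers, trainers) drops its unused _py: Python parameter and takes &Bound<'_, PyModule>, the 0.21 entry-point signature. A minimal sketch, assuming pyo3 0.21 (module and class names are illustrative):

use pyo3::prelude::*;

#[pyclass]
struct Example;

// 0.20: pub fn demo(_py: Python, m: &PyModule) -> PyResult<()>
#[pymodule]
pub fn demo(m: &Bound<'_, PyModule>) -> PyResult<()> {
    // add_class is unchanged; a GIL token is available as m.py() if needed.
    m.add_class::<Example>()?;
    Ok(())
}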
@@ -50,7 +50,7 @@ pub struct PyBufferedIterator<T, F> {

 impl<T, F, I> PyBufferedIterator<T, F>
 where
-    F: Fn(&PyAny) -> I,
+    F: Fn(Bound<'_, PyAny>) -> I,
     I: IntoIterator<Item = PyResult<T>>,
 {
     /// Create a new PyBufferedIterator using the provided Python object.
@@ -62,10 +62,10 @@ where
     ///
     /// The `buffer_size` represents the number of items that we buffer before we
     /// need to acquire the GIL again.
-    pub fn new(iter: &PyAny, converter: F, buffer_size: usize) -> PyResult<Self> {
+    pub fn new(iter: &Bound<'_, PyAny>, converter: F, buffer_size: usize) -> PyResult<Self> {
         let py = iter.py();
         let iter: Py<PyAny> = unsafe {
-            py.from_borrowed_ptr_or_err::<PyAny>(pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?
+            Bound::from_borrowed_ptr_or_err(py, pyo3::ffi::PyObject_GetIter(iter.as_ptr()))?
                 .to_object(py)
         };

@@ -89,9 +89,10 @@ where
                 }

                 match unsafe {
-                    py.from_owned_ptr_or_opt::<PyAny>(pyo3::ffi::PyIter_Next(
-                        self.iter.as_ref().unwrap().as_ref(py).as_ptr(),
-                    ))
+                    Bound::from_owned_ptr_or_opt(
+                        py,
+                        pyo3::ffi::PyIter_Next(self.iter.as_ref().unwrap().bind(py).as_ptr()),
+                    )
                 } {
                     Some(obj) => self.buffer.extend((self.converter)(obj)),
                     None => {
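Note: at the FFI boundary the raw-pointer constructors move from methods on Python to associated functions on Bound, which is also why the converter closures now receive owned Bound<PyAny> values. A sketch, assuming pyo3 0.21; the safety contract (PyIter_Next returns a new reference or NULL) is unchanged:

use pyo3::prelude::*;

/// Fetch the next item from a raw Python iterator pointer; None once the
/// iterator is exhausted or on error (NULL result).
unsafe fn iter_next<'py>(
    py: Python<'py>,
    it: *mut pyo3::ffi::PyObject,
) -> Option<Bound<'py, PyAny>> {
    // 0.20: py.from_owned_ptr_or_opt::<PyAny>(...)
    Bound::from_owned_ptr_or_opt(py, pyo3::ffi::PyIter_Next(it))
}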
@ -112,7 +113,7 @@ where
|
|||||||
|
|
||||||
impl<T, F, I> Iterator for PyBufferedIterator<T, F>
|
impl<T, F, I> Iterator for PyBufferedIterator<T, F>
|
||||||
where
|
where
|
||||||
F: Fn(&PyAny) -> I,
|
F: Fn(Bound<'_, PyAny>) -> I,
|
||||||
I: IntoIterator<Item = PyResult<T>>,
|
I: IntoIterator<Item = PyResult<T>>,
|
||||||
{
|
{
|
||||||
type Item = PyResult<T>;
|
type Item = PyResult<T>;
|
||||||
|
@ -1,6 +1,3 @@
|
|||||||
use pyo3::exceptions;
|
|
||||||
use pyo3::prelude::*;
|
|
||||||
use pyo3::types::*;
|
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
use std::sync::{Arc, Mutex};
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
@ -14,25 +11,6 @@ pub use normalization::*;
|
|||||||
pub use pretokenization::*;
|
pub use pretokenization::*;
|
||||||
pub use regex::*;
|
pub use regex::*;
|
||||||
|
|
||||||
// PyChar
|
|
||||||
// This type is a temporary hack to accept `char` as argument
|
|
||||||
// To be removed once https://github.com/PyO3/pyo3/pull/1282 has been released
|
|
||||||
pub struct PyChar(pub char);
|
|
||||||
|
|
||||||
impl FromPyObject<'_> for PyChar {
|
|
||||||
fn extract(obj: &PyAny) -> PyResult<Self> {
|
|
||||||
let s = <PyString as PyTryFrom<'_>>::try_from(obj)?.to_str()?;
|
|
||||||
let mut iter = s.chars();
|
|
||||||
if let (Some(ch), None) = (iter.next(), iter.next()) {
|
|
||||||
Ok(Self(ch))
|
|
||||||
} else {
|
|
||||||
Err(exceptions::PyValueError::new_err(
|
|
||||||
"expected a string of length 1",
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// RefMut utils
|
// RefMut utils
|
||||||
|
|
||||||
pub trait DestroyPtr {
|
pub trait DestroyPtr {
|
||||||
|
@@ -9,15 +9,15 @@ use tk::pattern::Pattern;

 /// Represents a Pattern as used by `NormalizedString`
 #[derive(Clone, FromPyObject)]
-pub enum PyPattern<'p> {
+pub enum PyPattern {
     #[pyo3(annotation = "str")]
-    Str(&'p str),
+    Str(String),
     #[pyo3(annotation = "tokenizers.Regex")]
     Regex(Py<PyRegex>),
     // TODO: Add the compatibility for Fn(char) -> bool
 }

-impl Pattern for PyPattern<'_> {
+impl Pattern for PyPattern {
     fn find_matches(&self, inside: &str) -> tk::Result<Vec<(tk::Offsets, bool)>> {
         match self {
             PyPattern::Str(s) => {
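`PyPattern` loses its lifetime parameter because a `FromPyObject` type holding `&'p str` borrows from the Python object it was extracted from, and the `Bound` API's short-lived temporaries make such borrows impractical in argument position; owning a `String` removes the lifetime entirely. A hedged sketch of the same shape (the enum below is hypothetical, not a crate type):

```rust
use pyo3::prelude::*;

// An owned-field extraction enum in the style of the new PyPattern:
// no lifetime parameter, each variant copies its data out of Python.
#[derive(FromPyObject)]
enum StrOrInt {
    #[pyo3(annotation = "str")]
    Str(String),
    #[pyo3(annotation = "int")]
    Int(i64),
}
```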
@@ -35,8 +35,8 @@ impl Pattern for PyPattern<'_> {
     }
 }

-impl From<PyPattern<'_>> for tk::normalizers::replace::ReplacePattern {
-    fn from(pattern: PyPattern<'_>) -> Self {
+impl From<PyPattern> for tk::normalizers::replace::ReplacePattern {
+    fn from(pattern: PyPattern) -> Self {
         match pattern {
             PyPattern::Str(s) => Self::String(s.to_owned()),
             PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())),
@@ -44,8 +44,8 @@ impl From<PyPattern<'_>> for tk::normalizers::replace::ReplacePattern {
     }
 }

-impl From<PyPattern<'_>> for tk::pre_tokenizers::split::SplitPattern {
-    fn from(pattern: PyPattern<'_>) -> Self {
+impl From<PyPattern> for tk::pre_tokenizers::split::SplitPattern {
+    fn from(pattern: PyPattern) -> Self {
         match pattern {
             PyPattern::Str(s) => Self::String(s.to_owned()),
             PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())),
@@ -117,7 +117,7 @@ impl From<PySplitDelimiterBehavior> for SplitDelimiterBehavior {
     }
 }

-fn filter(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> {
+fn filter(normalized: &mut NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     let err = "`filter` expect a callable with the signature: `fn(char) -> bool`";

     if !func.is_callable() {
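All of these helpers now receive the Python callable as `&Bound<'_, PyAny>`; invoking it is unchanged apart from the receiver type. A minimal sketch of the per-character predicate call that `filter` performs (the helper name is ours):

```rust
use pyo3::prelude::*;

// Calls a Python predicate with a one-character string and reads the
// returned bool, mirroring what `filter` does for each char.
fn keep_char(func: &Bound<'_, PyAny>, c: char) -> PyResult<bool> {
    func.call1((c.to_string(),))?.extract::<bool>()
}
```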
@@ -134,7 +134,7 @@ fn filter(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> {
     }
 }

-fn for_each(normalized: &NormalizedString, func: &PyAny) -> PyResult<()> {
+fn for_each(normalized: &NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     let err = "`for_each` expect a callable with the signature: `fn(char)`";

     if !func.is_callable() {
@@ -148,14 +148,14 @@ fn for_each(normalized: &NormalizedString, func: &PyAny) -> PyResult<()> {
     }
 }

-fn map(normalized: &mut NormalizedString, func: &PyAny) -> PyResult<()> {
+fn map(normalized: &mut NormalizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     let err = "`map` expect a callable with the signature: `fn(char) -> char`";

     if !func.is_callable() {
         Err(exceptions::PyTypeError::new_err(err))
     } else {
         normalized.map(|c| {
-            let c: &str = func
+            let c: String = func
                 .call1((c.to_string(),))
                 .expect(err)
                 .extract()
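The `&str` to `String` change in `map` is forced by the same ownership rule: `call1` now returns a temporary `Bound<'_, PyAny>`, and a `&str` extracted from it would borrow from a value dropped at the end of the statement. A hedged, self-contained sketch of the pattern (the helper is ours, not the crate's):

```rust
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;

// Applies a Python `fn(char) -> char` and copies the result out as an
// owned String before the temporary return value is dropped.
fn map_char(func: &Bound<'_, PyAny>, c: char) -> PyResult<char> {
    let s: String = func.call1((c.to_string(),))?.extract()?;
    s.chars()
        .next()
        .ok_or_else(|| PyValueError::new_err("expected a str of length 1"))
}
```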
@@ -296,13 +296,13 @@ impl PyNormalizedString {

     /// Filter each character of the string using the given func
     #[pyo3(text_signature = "(self, func)")]
-    fn filter(&mut self, func: &PyAny) -> PyResult<()> {
+    fn filter(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         filter(&mut self.normalized, func)
     }

     /// Calls the given function for each character of the string
     #[pyo3(text_signature = "(self, func)")]
-    fn for_each(&self, func: &PyAny) -> PyResult<()> {
+    fn for_each(&self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         for_each(&self.normalized, func)
     }

@@ -311,7 +311,7 @@ impl PyNormalizedString {
     /// Replaces each character of the string using the returned value. Each
     /// returned value **must** be a str of length 1 (ie a character).
     #[pyo3(text_signature = "(self, func)")]
-    fn map(&mut self, func: &PyAny) -> PyResult<()> {
+    fn map(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         map(&mut self.normalized, func)
     }

@@ -551,21 +551,21 @@ impl PyNormalizedStringRefMut {
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)?
     }

-    fn filter(&mut self, func: &PyAny) -> PyResult<()> {
+    fn filter(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|n| filter(n, func))
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??;
         Ok(())
     }

-    fn for_each(&self, func: &PyAny) -> PyResult<()> {
+    fn for_each(&self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map(|n| for_each(n, func))
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??;
         Ok(())
     }

-    fn map(&mut self, func: &PyAny) -> PyResult<()> {
+    fn map(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|n| map(n, func))
             .ok_or_else(PyNormalizedStringRefMut::destroyed_error)??;
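A note on the `??;` in these bodies, since it is easy to misread: `map_mut` yields an `Option<PyResult<()>>`, `ok_or_else` converts the `Option` layer into a `Result`, and the two `?`s then unwrap the "reference still alive" layer and the inner Python error in turn. A minimal sketch with plain std types (names are ours):

```rust
use std::io::{Error, ErrorKind};

// Option<Result<..>> -> Result<..>: the outer `?` handles the destroyed
// reference case, the inner one the operation's own error.
fn double_question(opt: Option<Result<(), Error>>) -> Result<(), Error> {
    opt.ok_or_else(|| Error::new(ErrorKind::Other, "destroyed"))??;
    Ok(())
}
```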
@@ -12,7 +12,7 @@ use crate::error::ToPyResult;
 use crate::token::PyToken;
 use tk::{OffsetReferential, OffsetType, Offsets, PreTokenizedString, Token};

-fn split(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
+fn split(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     if !func.is_callable() {
         Err(exceptions::PyTypeError::new_err(
             "`split` expect a callable with the signature: \
@@ -30,7 +30,7 @@ fn split(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
     }
 }

-fn normalize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
+fn normalize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     if !func.is_callable() {
         Err(exceptions::PyTypeError::new_err(
             "`normalize` expect a callable with the signature: \
@@ -46,7 +46,7 @@ fn normalize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
     }
 }

-fn tokenize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> {
+fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResult<()> {
     if !func.is_callable() {
         Err(exceptions::PyTypeError::new_err(
             "`tokenize` expect a callable with the signature: \
@@ -183,7 +183,7 @@ impl PyPreTokenizedString {
     /// In order for the offsets to be tracked accurately, any returned `NormalizedString`
     /// should come from calling either `.split` or `.slice` on the received one.
     #[pyo3(text_signature = "(self, func)")]
-    fn split(&mut self, func: &PyAny) -> PyResult<()> {
+    fn split(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         split(&mut self.pretok, func)
     }

@@ -195,7 +195,7 @@ impl PyPreTokenizedString {
     /// does not need to return anything, just calling the methods on the provided
     /// NormalizedString allow its modification.
     #[pyo3(text_signature = "(self, func)")]
-    fn normalize(&mut self, func: &PyAny) -> PyResult<()> {
+    fn normalize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         normalize(&mut self.pretok, func)
     }

@@ -206,7 +206,7 @@ impl PyPreTokenizedString {
     /// The function used to tokenize each underlying split. This function must return
     /// a list of Token generated from the input str.
     #[pyo3(text_signature = "(self, func)")]
-    fn tokenize(&mut self, func: &PyAny) -> PyResult<()> {
+    fn tokenize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         tokenize(&mut self.pretok, func)
     }

@@ -289,19 +289,19 @@ impl PyPreTokenizedStringRefMut {

 #[pymethods]
 impl PyPreTokenizedStringRefMut {
-    fn split(&mut self, func: &PyAny) -> PyResult<()> {
+    fn split(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|pretok| split(pretok, func))
             .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)?
     }

-    fn normalize(&mut self, func: &PyAny) -> PyResult<()> {
+    fn normalize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|pretok| normalize(pretok, func))
             .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)?
     }

-    fn tokenize(&mut self, func: &PyAny) -> PyResult<()> {
+    fn tokenize(&mut self, func: &Bound<'_, PyAny>) -> PyResult<()> {
         self.inner
             .map_mut(|pretok| tokenize(pretok, func))
             .ok_or_else(PyPreTokenizedStringRefMut::destroyed_error)?
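Taken together, every `#[pymethods]` signature in these files follows one mechanical rewrite: `func: &PyAny` becomes `func: &Bound<'_, PyAny>` while the body stays as-is. A minimal sketch of the pattern on a hypothetical class:

```rust
use pyo3::prelude::*;

#[pyclass]
struct Demo;

#[pymethods]
impl Demo {
    // pyo3 0.21 style: the argument arrives as a GIL-bound smart pointer
    // rather than the deprecated `&PyAny` GIL-ref.
    fn call_it(&self, func: &Bound<'_, PyAny>) -> PyResult<PyObject> {
        Ok(func.call0()?.unbind())
    }
}
```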
@@ -40,19 +40,19 @@ harness = false
 lazy_static = "1.4"
 rand = "0.8"
 onig = { version = "6.4", default-features = false, optional = true }
-regex = "1.9"
+regex = "1.10"
 regex-syntax = "0.8"
-rayon = "1.8"
+rayon = "1.10"
 rayon-cond = "0.3"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
-unicode-segmentation = "1.10"
+unicode-segmentation = "1.11"
 indicatif = {version = "0.17", optional = true}
 itertools = "0.12"
 log = "0.4"
-derive_builder = "0.13"
+derive_builder = "0.20"
 spm_precompiled = "0.1"
 hf-hub = { version = "0.3.2", optional = true }
 aho-corasick = "1.1"
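Most of these bumps are routine, but derive_builder jumps several minor versions (0.13 to 0.20). The basic derive used by the crate is stable across that range; a minimal sketch on a hypothetical struct (not one of the crate's types):

```rust
use derive_builder::Builder;

// derive_builder generates a SampleConfigBuilder companion type with
// per-field setters and a validating build() method.
#[derive(Builder, Debug)]
struct SampleConfig {
    #[builder(default = "4")]
    threads: usize,
    name: String,
}

fn build_config() -> SampleConfig {
    SampleConfigBuilder::default()
        .name("demo".to_string())
        .build()
        .expect("missing required field")
}
```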
@@ -62,7 +62,7 @@ thiserror = "1.0.49"
 fancy-regex = { version = "0.13", optional = true}
 getrandom = { version = "0.2.10" }
 esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
-monostate = "0.1.9"
+monostate = "0.1.12"

 [features]
 default = ["progressbar", "onig", "esaxx_fast"]
@@ -73,7 +73,7 @@ unstable_wasm = ["fancy-regex", "getrandom/js"]

 [dev-dependencies]
 criterion = "0.5"
-tempfile = "3.8"
+tempfile = "3.10"
 assert_approx_eq = "1.1"

 [profile.release]