Testing ABI3 wheels to reduce number of wheels (#1674)
* Testing ABI3 wheels to reduce number of wheels
* No need for py-clone anymore.
* Upgrade python versions.
* Remove those flakes.
* Promoting new CI + Fixing secret.
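Most of the diff below is one mechanical change repeated across the bindings: with the stable ABI (`abi3`) enabled, pyo3 cannot borrow `&str` straight out of a Python string on every interpreter version the wheel must support, so each `extract::<&str>()` becomes an owned `extract::<String>()`, matched through `as_ref()`. A minimal sketch of the resulting kwargs-parsing shape, assuming pyo3 0.22 (the function and variable names are illustrative, not from this commit):

use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;

// Sketch of the post-change pattern: extract an owned String per key
// instead of borrowing &str, then match on &str via as_ref().
fn read_direction(kwargs: &Bound<'_, PyDict>) -> PyResult<String> {
    let mut direction = String::from("right");
    for (key, value) in kwargs {
        let key: String = key.extract()?;
        match key.as_ref() {
            "direction" => {
                let value: String = value.extract()?;
                match value.as_ref() {
                    "left" | "right" => {}
                    other => {
                        return Err(PyValueError::new_err(format!(
                            "unknown direction: {other}"
                        )))
                    }
                }
                direction = value;
            }
            _ => {}
        }
    }
    Ok(direction)
}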
@@ -14,7 +14,7 @@ serde = { version = "1.0", features = ["rc", "derive"] }
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
-pyo3 = { version = "0.22", features = ["py-clone"] }
+pyo3 = { version = "0.22", features = ["abi3", "abi3-py39"] }
 numpy = "0.22"
 ndarray = "0.15"
 itertools = "0.12"
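Building against the stable ABI is the point of the commit title: with `abi3` plus `abi3-py39`, one wheel per platform covers every CPython from 3.9 upward, instead of one wheel per minor Python version. The cost is being restricted to the limited C API, which is what forces the source changes in the hunks below.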
@@ -24,7 +24,7 @@ path = "../../tokenizers"
 
 [dev-dependencies]
 tempfile = "3.10"
-pyo3 = { version = "0.22", features = ["auto-initialize", "py-clone"] }
+pyo3 = { version = "0.22", features = ["auto-initialize"] }
 
 [features]
 defaut = ["pyo3/extension-module"]
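Two details worth noting here: the dev-dependency drops `py-clone` as well, since that feature is what gives `PyObject` its `Clone` impl, and `defaut` looks like a pre-existing typo for `default` in this manifest — it is unchanged context in this hunk, not something the commit introduces.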
@@ -488,7 +488,6 @@ impl PySequenceDecoder
     }
 }
 
-#[derive(Clone)]
 pub(crate) struct CustomDecoder {
     inner: PyObject,
 }
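`CustomDecoder` holds a `PyObject`, and without the `py-clone` feature `PyObject` no longer implements `Clone`, so the derive has to go; the same removal recurs for `CustomNormalizer` and `CustomPreTokenizer` below. If a copy were still needed, it would have to be made explicitly under the GIL. A sketch, with `clone_with_gil` being an illustrative name, not a method this commit adds:

use pyo3::prelude::*;

pub(crate) struct CustomDecoder {
    inner: PyObject,
}

impl CustomDecoder {
    // Illustrative only: without py-clone, copies of a PyObject must be made
    // explicitly while the GIL is held; clone_ref bumps the Python refcount.
    fn clone_with_gil(&self, py: Python<'_>) -> Self {
        CustomDecoder {
            inner: self.inner.clone_ref(py),
        }
    }
}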
@@ -399,11 +399,11 @@ impl PyEncoding
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        direction = match value {
+                        let value: String = value.extract()?;
+                        direction = match value.as_ref() {
                             "left" => Ok(PaddingDirection::Left),
                             "right" => Ok(PaddingDirection::Right),
                             other => Err(PyError(format!(
@@ -276,8 +276,8 @@ impl PyBPE
     ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "cache_capacity" => builder = builder.cache_capacity(value.extract()?),
                     "dropout" => {
                         if let Some(dropout) = value.extract()? {
@@ -581,8 +581,8 @@ impl PyWordPiece
     ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "unk_token" => {
                         builder = builder.unk_token(val.extract()?);
                     }
@@ -184,9 +184,8 @@ macro_rules! getter
         let super_ = $self.as_ref();
         if let PyNormalizerTypeWrapper::Single(ref norm) = super_.normalizer {
             let wrapper = norm.read().unwrap();
-            if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone()
-            {
-                o.$name
+            if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (&*wrapper) {
+                o.$name.clone()
             } else {
                 unreachable!()
             }
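The getter macro has to change for the same reason the `Clone` derives disappear: `PyNormalizerWrapper` can hold a `CustomNormalizer`, and therefore a `PyObject`, so `(*wrapper).clone()` no longer compiles. Matching on a borrow and cloning only the single field the getter returns is sufficient, and cheaper besides.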
@@ -538,7 +537,7 @@ impl PyReplace
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug)]
 pub(crate) struct CustomNormalizer {
     inner: PyObject,
 }
@@ -581,7 +580,7 @@ impl<'de> Deserialize<'de> for CustomNormalizer
     }
 }
 
-#[derive(Debug, Clone, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyNormalizerWrapper {
     Custom(CustomNormalizer),
@@ -618,7 +618,6 @@ impl PyUnicodeScripts
     }
 }
 
-#[derive(Clone)]
 pub(crate) struct CustomPreTokenizer {
     inner: PyObject,
 }
@@ -662,7 +661,7 @@ impl<'de> Deserialize<'de> for CustomPreTokenizer
     }
 }
 
-#[derive(Clone, Deserialize)]
+#[derive(Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyPreTokenizerWrapper {
     Custom(CustomPreTokenizer),
@@ -313,7 +313,7 @@ impl From<PyTemplate> for Template
 
 impl FromPyObject<'_> for PyTemplate {
     fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
-        if let Ok(s) = ob.extract::<&str>() {
+        if let Ok(s) = ob.extract::<String>() {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))
@@ -136,8 +136,8 @@ impl PyAddedToken
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "single_word" => token.single_word = Some(value.extract()?),
                     "lstrip" => token.lstrip = Some(value.extract()?),
                     "rstrip" => token.rstrip = Some(value.extract()?),
@@ -159,8 +159,8 @@ impl PyAddedToken
         match state.downcast_bound::<PyDict>(py) {
             Ok(state) => {
                 for (key, value) in state {
-                    let key: &str = key.extract()?;
-                    match key {
+                    let key: String = key.extract()?;
+                    match key.as_ref() {
                         "content" => self.content = value.extract()?,
                         "single_word" => self.single_word = Some(value.extract()?),
                         "lstrip" => self.lstrip = Some(value.extract()?),
@@ -287,7 +287,7 @@ impl FromPyObject<'_> for PyArrayUnicode
         }
         let arr = ob.as_ptr() as *mut npyffi::PyArrayObject;
         // SAFETY Getting all the metadata about the numpy array to check its sanity
-        let (type_num, elsize, alignment, data, nd, flags) = unsafe {
+        let (type_num, elsize, _alignment, data, nd, flags) = unsafe {
             let desc = (*arr).descr;
             (
                 (*desc).type_num,
@@ -323,15 +323,16 @@ impl FromPyObject<'_> for PyArrayUnicode
         let seq = (0..n_elem)
             .map(|i| {
                 let bytes = &all_bytes[i * elsize..(i + 1) * elsize];
-                let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
-                    pyo3::ffi::PyUnicode_4BYTE_KIND as _,
-                    bytes.as_ptr() as *const _,
-                    elsize as isize / alignment as isize,
-                );
-                let py = ob.py();
-                let obj = PyObject::from_owned_ptr(py, unicode);
-                let s = obj.downcast_bound::<PyString>(py)?;
-                Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
+                Ok(std::str::from_utf8(bytes)?.to_owned())
+                // let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
+                //     pyo3::ffi::PyUnicode_4BYTE_KIND as _,
+                //     bytes.as_ptr() as *const _,
+                //     elsize as isize / alignment as isize,
+                // );
+                // let py = ob.py();
+                // let obj = PyObject::from_owned_ptr(py, unicode);
+                // let s = obj.downcast_bound::<PyString>(py)?;
+                // Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
             })
             .collect::<PyResult<Vec<_>>>()?;
 
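`PyUnicode_FromKindAndData` is not part of the stable ABI, so the detour through a CPython unicode object is retired here (left behind as a comment) in favor of decoding the element bytes directly with `std::str::from_utf8`; the `?` relies on pyo3's conversion of `Utf8Error` into a Python exception. With the FFI path gone, the `alignment` value read earlier becomes unused, hence the `_alignment` rename in the previous hunk.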
@@ -736,12 +737,12 @@ impl PyTokenizer
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "stride" => params.stride = value.extract()?,
                     "strategy" => {
-                        let value: &str = value.extract()?;
-                        params.strategy = match value {
+                        let value: String = value.extract()?;
+                        params.strategy = match value.as_ref() {
                             "longest_first" => Ok(TruncationStrategy::LongestFirst),
                             "only_first" => Ok(TruncationStrategy::OnlyFirst),
                             "only_second" => Ok(TruncationStrategy::OnlySecond),
@@ -754,8 +755,8 @@ impl PyTokenizer
                         }?
                     }
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        params.direction = match value {
+                        let value: String = value.extract()?;
+                        params.direction = match value.as_ref() {
                             "left" => Ok(TruncationDirection::Left),
                             "right" => Ok(TruncationDirection::Right),
                             _ => Err(PyError(format!(
@@ -838,11 +839,11 @@ impl PyTokenizer
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        params.direction = match value {
+                        let value: String = value.extract()?;
+                        params.direction = match value.as_ref() {
                             "left" => Ok(PaddingDirection::Left),
                             "right" => Ok(PaddingDirection::Right),
                             other => Err(PyError(format!(
@@ -1341,7 +1342,7 @@ impl PyTokenizer
             // - An iterator, to allow batching
             // - A string
             if let Ok(s) = element.downcast::<PyString>() {
-                itertools::Either::Right(std::iter::once(s.to_str().map(|s| s.to_owned())))
+                itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned())))
             } else {
                 match element.iter() {
                     Ok(iter) => itertools::Either::Left(
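`PyString::to_str` hands back a borrow of the interpreter's UTF-8 buffer, which the limited API only exposes from CPython 3.10 onward; `to_cow` is available everywhere `abi3-py39` targets and copies only when it must, and the trailing `into_owned()` keeps the iterator yielding owned `String`s exactly as before.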
@@ -313,8 +313,8 @@ impl PyBpeTrainer
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder = builder.vocab_size(val.extract()?),
                     "min_frequency" => builder = builder.min_frequency(val.extract()?),
                     "show_progress" => builder = builder.show_progress(val.extract()?),
@@ -520,8 +520,8 @@ impl PyWordPieceTrainer
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder = builder.vocab_size(val.extract()?),
                     "min_frequency" => builder = builder.min_frequency(val.extract()?),
                     "show_progress" => builder = builder.show_progress(val.extract()?),
@@ -661,8 +661,8 @@ impl PyWordLevelTrainer
 
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => {
                         builder.vocab_size(val.extract()?);
                     }
@@ -828,8 +828,8 @@ impl PyUnigramTrainer
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
        if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder.vocab_size(val.extract()?),
                     "show_progress" => builder.show_progress(val.extract()?),
                     "n_sub_iterations" => builder.n_sub_iterations(val.extract()?),
@@ -8,7 +8,7 @@ use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehav
 use tk::pattern::Pattern;
 
 /// Represents a Pattern as used by `NormalizedString`
-#[derive(Clone, FromPyObject)]
+#[derive(FromPyObject)]
 pub enum PyPattern {
     #[pyo3(annotation = "str")]
     Str(String),
@@ -95,9 +95,9 @@ pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);
 
 impl FromPyObject<'_> for PySplitDelimiterBehavior {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
+        let s = obj.extract::<String>()?;
 
-        Ok(Self(match s {
+        Ok(Self(match s.as_ref() {
             "removed" => Ok(SplitDelimiterBehavior::Removed),
             "isolated" => Ok(SplitDelimiterBehavior::Isolated),
             "merged_with_previous" => Ok(SplitDelimiterBehavior::MergedWithPrevious),
@@ -70,9 +70,9 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResul
 pub struct PyOffsetReferential(OffsetReferential);
 impl FromPyObject<'_> for PyOffsetReferential {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
+        let s = obj.extract::<String>()?;
 
-        Ok(Self(match s {
+        Ok(Self(match s.as_ref() {
             "original" => Ok(OffsetReferential::Original),
             "normalized" => Ok(OffsetReferential::Normalized),
             _ => Err(exceptions::PyValueError::new_err(
@@ -86,9 +86,9 @@ impl FromPyObject<'_> for PyOffsetReferential
 pub struct PyOffsetType(OffsetType);
 impl FromPyObject<'_> for PyOffsetType {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
+        let s = obj.extract::<String>()?;
 
-        Ok(Self(match s {
+        Ok(Self(match s.as_ref() {
             "byte" => Ok(OffsetType::Byte),
             "char" => Ok(OffsetType::Char),
             _ => Err(exceptions::PyValueError::new_err(