Testing ABI3 wheels to reduce number of wheels (#1674)

* Testing ABI3 wheels to reduce number of wheels

* No need for py-clone anymore.

* Upgrade Python versions.

* Remove those flakes.

* Promoting new CI + Fixing secret.
Author: Nicolas Patry
Date:   2024-11-15 13:02:22 +08:00 (committed by GitHub)
Parent: 5aa9f6cff0
Commit: f4c9fd7f40
14 changed files with 246 additions and 66 deletions

bindings/python/Cargo.toml

@@ -14,7 +14,7 @@ serde = { version = "1.0", features = ["rc", "derive"] }
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
-pyo3 = { version = "0.22", features = ["py-clone"] }
+pyo3 = { version = "0.22", features = ["abi3", "abi3-py39"] }
 numpy = "0.22"
 ndarray = "0.15"
 itertools = "0.12"
@@ -24,7 +24,7 @@ path = "../../tokenizers"
 [dev-dependencies]
 tempfile = "3.10"
-pyo3 = { version = "0.22", features = ["auto-initialize", "py-clone"] }
+pyo3 = { version = "0.22", features = ["auto-initialize"] }
 [features]
 defaut = ["pyo3/extension-module"]
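
Note on the dependency change: building with abi3/abi3-py39 targets CPython's stable ABI, so a single wheel per platform covers every CPython >= 3.9 instead of one wheel per interpreter version, which is how this commit reduces the wheel count. Dropping py-clone also means pyo3's Py<T> no longer implements Clone, so every copy of a Python reference must be explicit and GIL-bound. A minimal sketch of the replacement pattern (hypothetical helper, not part of this diff):

    use pyo3::prelude::*;

    // Without the `py-clone` feature, `Py<T>` is not `Clone`; duplicating a
    // reference goes through `clone_ref`, which requires holding the GIL.
    fn duplicate(obj: &Py<PyAny>, py: Python<'_>) -> Py<PyAny> {
        obj.clone_ref(py)
    }

This is also why the #[derive(Clone)] attributes on the PyObject-holding types (CustomDecoder, CustomNormalizer, CustomPreTokenizer) disappear in the hunks below.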

bindings/python/src/decoders.rs

@@ -488,7 +488,6 @@ impl PySequenceDecoder {
     }
 }
-#[derive(Clone)]
 pub(crate) struct CustomDecoder {
     inner: PyObject,
 }

bindings/python/src/encoding.rs

@@ -399,11 +399,11 @@ impl PyEncoding {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        direction = match value {
+                        let value: String = value.extract()?;
+                        direction = match value.as_ref() {
                             "left" => Ok(PaddingDirection::Left),
                             "right" => Ok(PaddingDirection::Right),
                             other => Err(PyError(format!(
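
The &str-to-String change above recurs in nearly every file of this commit. The likely motivation: under abi3-py39 the limited C API predates PyUnicode_AsUTF8AndSize (added to the limited API in 3.10), so pyo3 cannot lend out a &str borrowed from a Python string and extract::<&str>() does not compile; extracting an owned String and matching on as_ref() is the abi3-compatible equivalent. A minimal sketch of the pattern (hypothetical function name):

    use pyo3::prelude::*;

    // Copy the Python string into an owned String, then match on `&str`
    // via `as_ref()`; `extract::<&str>()` is unavailable on abi3-py39.
    fn parse_key(key: &Bound<'_, PyAny>) -> PyResult<()> {
        let key: String = key.extract()?;
        match key.as_ref() {
            "direction" => { /* handle the kwarg */ }
            _ => {}
        }
        Ok(())
    }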

bindings/python/src/models.rs

@@ -276,8 +276,8 @@ impl PyBPE {
     ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "cache_capacity" => builder = builder.cache_capacity(value.extract()?),
                     "dropout" => {
                         if let Some(dropout) = value.extract()? {
@@ -581,8 +581,8 @@ impl PyWordPiece {
     ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "unk_token" => {
                         builder = builder.unk_token(val.extract()?);
                     }

bindings/python/src/normalizers.rs

@@ -184,9 +184,8 @@ macro_rules! getter {
         let super_ = $self.as_ref();
         if let PyNormalizerTypeWrapper::Single(ref norm) = super_.normalizer {
             let wrapper = norm.read().unwrap();
-            if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone()
-            {
-                o.$name
+            if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (&*wrapper) {
+                o.$name.clone()
             } else {
                 unreachable!()
             }
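
The getter macro change above swaps a whole-value clone for a field clone: instead of cloning the entire wrapped normalizer out of the RwLock guard (which required Clone on the wrapper, no longer derivable once it holds a non-Clone PyObject), it matches on a reference through the guard and clones only the field being returned. A standalone sketch of the same move, with hypothetical types:

    use std::sync::RwLock;

    struct Inner {
        name: String,
    }

    // Borrow through the lock guard and clone only the returned field,
    // rather than cloning the whole value up front.
    fn get_name(lock: &RwLock<Inner>) -> String {
        let guard = lock.read().unwrap();
        // was: let inner = (*guard).clone(); inner.name
        guard.name.clone()
    }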
@@ -538,7 +537,7 @@ impl PyReplace {
     }
 }
-#[derive(Debug, Clone)]
+#[derive(Debug)]
 pub(crate) struct CustomNormalizer {
     inner: PyObject,
 }
@@ -581,7 +580,7 @@ impl<'de> Deserialize<'de> for CustomNormalizer {
     }
 }
-#[derive(Debug, Clone, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyNormalizerWrapper {
     Custom(CustomNormalizer),

bindings/python/src/pre_tokenizers.rs

@@ -618,7 +618,6 @@ impl PyUnicodeScripts {
     }
 }
-#[derive(Clone)]
 pub(crate) struct CustomPreTokenizer {
     inner: PyObject,
 }
@@ -662,7 +661,7 @@ impl<'de> Deserialize<'de> for CustomPreTokenizer {
     }
 }
-#[derive(Clone, Deserialize)]
+#[derive(Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyPreTokenizerWrapper {
     Custom(CustomPreTokenizer),

bindings/python/src/processors.rs

@@ -313,7 +313,7 @@ impl From<PyTemplate> for Template {
 impl FromPyObject<'_> for PyTemplate {
     fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
-        if let Ok(s) = ob.extract::<&str>() {
+        if let Ok(s) = ob.extract::<String>() {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))

bindings/python/src/tokenizer.rs

@@ -136,8 +136,8 @@ impl PyAddedToken {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "single_word" => token.single_word = Some(value.extract()?),
                     "lstrip" => token.lstrip = Some(value.extract()?),
                     "rstrip" => token.rstrip = Some(value.extract()?),
@@ -159,8 +159,8 @@ impl PyAddedToken {
         match state.downcast_bound::<PyDict>(py) {
             Ok(state) => {
                 for (key, value) in state {
-                    let key: &str = key.extract()?;
-                    match key {
+                    let key: String = key.extract()?;
+                    match key.as_ref() {
                         "content" => self.content = value.extract()?,
                         "single_word" => self.single_word = Some(value.extract()?),
                         "lstrip" => self.lstrip = Some(value.extract()?),
@@ -287,7 +287,7 @@ impl FromPyObject<'_> for PyArrayUnicode {
         }
         let arr = ob.as_ptr() as *mut npyffi::PyArrayObject;
         // SAFETY Getting all the metadata about the numpy array to check its sanity
-        let (type_num, elsize, alignment, data, nd, flags) = unsafe {
+        let (type_num, elsize, _alignment, data, nd, flags) = unsafe {
             let desc = (*arr).descr;
             (
                 (*desc).type_num,
@@ -323,15 +323,16 @@
         let seq = (0..n_elem)
             .map(|i| {
                 let bytes = &all_bytes[i * elsize..(i + 1) * elsize];
-                let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
-                    pyo3::ffi::PyUnicode_4BYTE_KIND as _,
-                    bytes.as_ptr() as *const _,
-                    elsize as isize / alignment as isize,
-                );
-                let py = ob.py();
-                let obj = PyObject::from_owned_ptr(py, unicode);
-                let s = obj.downcast_bound::<PyString>(py)?;
-                Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
+                Ok(std::str::from_utf8(bytes)?.to_owned())
+                // let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
+                //     pyo3::ffi::PyUnicode_4BYTE_KIND as _,
+                //     bytes.as_ptr() as *const _,
+                //     elsize as isize / alignment as isize,
+                // );
+                // let py = ob.py();
+                // let obj = PyObject::from_owned_ptr(py, unicode);
+                // let s = obj.downcast_bound::<PyString>(py)?;
+                // Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
             })
             .collect::<PyResult<Vec<_>>>()?;
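
Context for the rewrite above: PyUnicode_FromKindAndData is not part of the limited C API, so under abi3 the numpy string-array path can no longer rebuild a Python unicode object from the raw UCS-4 buffer; the new code reads each fixed-width element directly as UTF-8 instead, with the old FFI route kept as a comment. A minimal sketch of the new per-element step (hypothetical helper; assumes the element bytes really are UTF-8):

    // Decode one fixed-width numpy array element. The UTF-8 assumption
    // replaces the old UCS-4 FFI decoding, which is unavailable on abi3.
    fn decode_element(bytes: &[u8]) -> Result<String, std::str::Utf8Error> {
        Ok(std::str::from_utf8(bytes)?.to_owned())
    }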
@@ -736,12 +737,12 @@ impl PyTokenizer {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "stride" => params.stride = value.extract()?,
                     "strategy" => {
-                        let value: &str = value.extract()?;
-                        params.strategy = match value {
+                        let value: String = value.extract()?;
+                        params.strategy = match value.as_ref() {
                             "longest_first" => Ok(TruncationStrategy::LongestFirst),
                             "only_first" => Ok(TruncationStrategy::OnlyFirst),
                             "only_second" => Ok(TruncationStrategy::OnlySecond),
@@ -754,8 +755,8 @@ impl PyTokenizer {
                         }?
                     }
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        params.direction = match value {
+                        let value: String = value.extract()?;
+                        params.direction = match value.as_ref() {
                             "left" => Ok(TruncationDirection::Left),
                             "right" => Ok(TruncationDirection::Right),
                             _ => Err(PyError(format!(
@@ -838,11 +839,11 @@ impl PyTokenizer {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        params.direction = match value {
+                        let value: String = value.extract()?;
+                        params.direction = match value.as_ref() {
                             "left" => Ok(PaddingDirection::Left),
                             "right" => Ok(PaddingDirection::Right),
                             other => Err(PyError(format!(
@@ -1341,7 +1342,7 @@ impl PyTokenizer {
             // - An iterator, to allow batching
             // - A string
             if let Ok(s) = element.downcast::<PyString>() {
-                itertools::Either::Right(std::iter::once(s.to_str().map(|s| s.to_owned())))
+                itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned())))
             } else {
                 match element.iter() {
                     Ok(iter) => itertools::Either::Left(
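
Same limited-API constraint as the extract changes: PyString::to_str hands back UTF-8 borrowed from the interpreter and is only available on abi3 from CPython 3.10, while to_cow also works on abi3-py39 by copying when it cannot borrow; into_owned then produces the owned String the iterator needs. A minimal sketch (hypothetical function):

    use pyo3::prelude::*;
    use pyo3::types::PyString;

    // `to_cow` returns Cow<'_, str>, borrowing when possible and copying
    // when the limited API cannot expose the internal UTF-8 buffer.
    fn owned_string(s: &Bound<'_, PyString>) -> PyResult<String> {
        Ok(s.to_cow()?.into_owned())
    }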

bindings/python/src/trainers.rs

@@ -313,8 +313,8 @@ impl PyBpeTrainer {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder = builder.vocab_size(val.extract()?),
                     "min_frequency" => builder = builder.min_frequency(val.extract()?),
                     "show_progress" => builder = builder.show_progress(val.extract()?),
@@ -520,8 +520,8 @@ impl PyWordPieceTrainer {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder = builder.vocab_size(val.extract()?),
                     "min_frequency" => builder = builder.min_frequency(val.extract()?),
                     "show_progress" => builder = builder.show_progress(val.extract()?),
@@ -661,8 +661,8 @@ impl PyWordLevelTrainer {
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => {
                         builder.vocab_size(val.extract()?);
                     }
@@ -828,8 +828,8 @@ impl PyUnigramTrainer {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder.vocab_size(val.extract()?),
                     "show_progress" => builder.show_progress(val.extract()?),
                     "n_sub_iterations" => builder.n_sub_iterations(val.extract()?),

bindings/python/src/utils/normalization.rs

@@ -8,7 +8,7 @@ use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehav
 use tk::pattern::Pattern;
 /// Represents a Pattern as used by `NormalizedString`
-#[derive(Clone, FromPyObject)]
+#[derive(FromPyObject)]
 pub enum PyPattern {
     #[pyo3(annotation = "str")]
     Str(String),
@@ -95,9 +95,9 @@ pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);
 impl FromPyObject<'_> for PySplitDelimiterBehavior {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
-        Ok(Self(match s {
+        let s = obj.extract::<String>()?;
+        Ok(Self(match s.as_ref() {
             "removed" => Ok(SplitDelimiterBehavior::Removed),
             "isolated" => Ok(SplitDelimiterBehavior::Isolated),
             "merged_with_previous" => Ok(SplitDelimiterBehavior::MergedWithPrevious),

bindings/python/src/utils/pretokenization.rs

@@ -70,9 +70,9 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResul
 pub struct PyOffsetReferential(OffsetReferential);
 impl FromPyObject<'_> for PyOffsetReferential {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
-        Ok(Self(match s {
+        let s = obj.extract::<String>()?;
+        Ok(Self(match s.as_ref() {
             "original" => Ok(OffsetReferential::Original),
             "normalized" => Ok(OffsetReferential::Normalized),
             _ => Err(exceptions::PyValueError::new_err(
@@ -86,9 +86,9 @@ impl FromPyObject<'_> for PyOffsetReferential {
 pub struct PyOffsetType(OffsetType);
 impl FromPyObject<'_> for PyOffsetType {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
-        Ok(Self(match s {
+        let s = obj.extract::<String>()?;
+        Ok(Self(match s.as_ref() {
             "byte" => Ok(OffsetType::Byte),
             "char" => Ok(OffsetType::Char),
             _ => Err(exceptions::PyValueError::new_err(