Testing ABI3 wheels to reduce number of wheels (#1674)

* Testing ABI3 wheels to reduce number of wheels

* No need for py-clone anymore.

* Upgrade python versions.

* Remove those flakes.

* Promoting new CI + Fixing secret.
This commit is contained in:
Nicolas Patry
2024-11-15 13:02:22 +08:00
committed by GitHub
parent 5aa9f6cff0
commit f4c9fd7f40
14 changed files with 246 additions and 66 deletions

181
.github/workflows/CI.yml vendored Normal file
View File

@ -0,0 +1,181 @@
# This file is autogenerated by maturin v1.7.4
# To update, run
#
# maturin generate-ci github -m bindings/python/Cargo.toml
#
name: CI
# Triggers: pushes to main/master, pushes of any tag, every pull request,
# and manual runs via workflow_dispatch.
on:
push:
branches:
- main
- master
tags:
- '*'
pull_request:
workflow_dispatch:
# Workflow-wide token scope is read-only; the release job declares its own
# wider permissions block.
permissions:
contents: read
jobs:
# Builds release wheels for each Linux target in the matrix (manylinux) and
# uploads the resulting `dist` directory as a per-target artifact.
linux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
# Every target is built on an ubuntu-latest runner; only the wheel target differs.
- runner: ubuntu-latest
target: x86_64
- runner: ubuntu-latest
target: x86
- runner: ubuntu-latest
target: aarch64
- runner: ubuntu-latest
target: armv7
- runner: ubuntu-latest
target: s390x
- runner: ubuntu-latest
target: ppc64le
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
# The Python bindings live in a sub-directory, so the manifest path is passed explicitly.
args: --release --out dist --manifest-path bindings/python/Cargo.toml
sccache: 'true'
manylinux: auto
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
# Artifact name carries the target; the release job globs on the `wheels-` prefix.
name: wheels-linux-${{ matrix.platform.target }}
path: dist
# Same build as the linux job, but targeting musllinux_1_2 and a smaller
# set of architectures.
musllinux:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: ubuntu-latest
target: x86_64
- runner: ubuntu-latest
target: x86
- runner: ubuntu-latest
target: aarch64
- runner: ubuntu-latest
target: armv7
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
# The Python bindings live in a sub-directory, so the manifest path is passed explicitly.
args: --release --out dist --manifest-path bindings/python/Cargo.toml
sccache: 'true'
manylinux: musllinux_1_2
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
# Artifact name carries the target; the release job globs on the `wheels-` prefix.
name: wheels-musllinux-${{ matrix.platform.target }}
path: dist
# Builds release wheels on windows-latest for the x64 and x86 targets.
windows:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
- runner: windows-latest
target: x64
- runner: windows-latest
target: x86
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
# The installed interpreter's architecture follows the wheel target (x64 or x86).
architecture: ${{ matrix.platform.target }}
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
# The Python bindings live in a sub-directory, so the manifest path is passed explicitly.
args: --release --out dist --manifest-path bindings/python/Cargo.toml
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
# Artifact name carries the target; the release job globs on the `wheels-` prefix.
name: wheels-windows-${{ matrix.platform.target }}
path: dist
# Builds release wheels for macOS: x86_64 on an Intel runner and aarch64 on
# an Apple-silicon runner, then uploads `dist` as a per-target artifact.
macos:
runs-on: ${{ matrix.platform.runner }}
strategy:
matrix:
platform:
# macos-12 has been retired from GitHub-hosted runners, so jobs requesting it
# never start; macos-13 is the Intel (x86_64) image that replaces it.
- runner: macos-13
target: x86_64
- runner: macos-14
target: aarch64
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: 3.x
- name: Build wheels
uses: PyO3/maturin-action@v1
with:
target: ${{ matrix.platform.target }}
# The Python bindings live in a sub-directory, so the manifest path is passed explicitly.
args: --release --out dist --manifest-path bindings/python/Cargo.toml
sccache: 'true'
- name: Upload wheels
uses: actions/upload-artifact@v4
with:
# Artifact name carries the target; the release job globs on the `wheels-` prefix.
name: wheels-macos-${{ matrix.platform.target }}
path: dist
# Builds the source distribution with `maturin sdist` and uploads it as the
# `wheels-sdist` artifact so the release job's `wheels-*` glob picks it up too.
sdist:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build sdist
uses: PyO3/maturin-action@v1
with:
command: sdist
# The Python bindings live in a sub-directory, so the manifest path is passed explicitly.
args: --out dist --manifest-path bindings/python/Cargo.toml
- name: Upload sdist
uses: actions/upload-artifact@v4
with:
name: wheels-sdist
path: dist
# Collects the artifacts of all build jobs, attests them, and (on tag pushes)
# uploads them to PyPI.
release:
name: Release
runs-on: ubuntu-latest
# Runs for tag pushes and manual dispatches only, once every build job has finished.
if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }}
needs: [linux, musllinux, windows, macos, sdist]
permissions:
# Used to sign the release artifacts
id-token: write
# Used to upload release artifacts
contents: write
# Used to generate artifact attestation
attestations: write
steps:
# No inputs: fetches the artifacts uploaded by the jobs listed in `needs`.
- uses: actions/download-artifact@v4
- name: Generate artifact attestation
uses: actions/attest-build-provenance@v1
with:
subject-path: 'wheels-*/*'
- name: Publish to PyPI
# Publishing is limited to tag pushes; a manual dispatch stops after attestation.
if: "startsWith(github.ref, 'refs/tags/')"
uses: PyO3/maturin-action@v1
env:
# The PyPI token is read from the PYPI_TOKEN_DIST repository secret.
MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN_DIST}}
with:
command: upload
args: --non-interactive --skip-existing wheels-*/*

View File

@ -164,8 +164,9 @@ jobs:
with:
path: ./bindings/python/dist
merge-multiple: true
- name: Upload to PyPi
working-directory: ./bindings/python
run: |
pip install twine
twine upload dist/* -u __token__ -p "$PYPI_TOKEN"
# Temporary deactivation while testing abi3 CI
# - name: Upload to PyPi
# working-directory: ./bindings/python
# run: |
# pip install twine
# twine upload dist/* -u __token__ -p "$PYPI_TOKEN"

View File

@ -16,7 +16,7 @@ jobs:
runs-on: windows-latest
strategy:
matrix:
python: ["3.7", "3.8", "3.9", "3.10"]
python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
steps:
- name: Checkout repository
uses: actions/checkout@v4
@ -72,7 +72,7 @@ jobs:
- name: Install Python
uses: actions/setup-python@v5
with:
python-version: 3.11
python-version: 3.13
architecture: "x64"

View File

@ -14,7 +14,7 @@ serde = { version = "1.0", features = ["rc", "derive"] }
serde_json = "1.0"
libc = "0.2"
env_logger = "0.11"
pyo3 = { version = "0.22", features = ["py-clone"] }
pyo3 = { version = "0.22", features = ["abi3", "abi3-py39"] }
numpy = "0.22"
ndarray = "0.15"
itertools = "0.12"
@ -24,7 +24,7 @@ path = "../../tokenizers"
[dev-dependencies]
tempfile = "3.10"
pyo3 = { version = "0.22", features = ["auto-initialize", "py-clone"] }
pyo3 = { version = "0.22", features = ["auto-initialize"] }
[features]
defaut = ["pyo3/extension-module"]

View File

@ -488,7 +488,6 @@ impl PySequenceDecoder {
}
}
#[derive(Clone)]
/// Crate-private wrapper around a single Python object.
///
/// NOTE(review): presumably the object is a user-defined decoder implemented
/// in Python — confirm against the trait impls for this type.
pub(crate) struct CustomDecoder {
// Handle to the wrapped Python object.
inner: PyObject,
}

View File

@ -399,11 +399,11 @@ impl PyEncoding {
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"direction" => {
let value: &str = value.extract()?;
direction = match value {
let value: String = value.extract()?;
direction = match value.as_ref() {
"left" => Ok(PaddingDirection::Left),
"right" => Ok(PaddingDirection::Right),
other => Err(PyError(format!(

View File

@ -276,8 +276,8 @@ impl PyBPE {
) -> PyResult<(Self, PyModel)> {
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"cache_capacity" => builder = builder.cache_capacity(value.extract()?),
"dropout" => {
if let Some(dropout) = value.extract()? {
@ -581,8 +581,8 @@ impl PyWordPiece {
) -> PyResult<(Self, PyModel)> {
if let Some(kwargs) = kwargs {
for (key, val) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"unk_token" => {
builder = builder.unk_token(val.extract()?);
}

View File

@ -184,9 +184,8 @@ macro_rules! getter {
let super_ = $self.as_ref();
if let PyNormalizerTypeWrapper::Single(ref norm) = super_.normalizer {
let wrapper = norm.read().unwrap();
if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone()
{
o.$name
if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (&*wrapper) {
o.$name.clone()
} else {
unreachable!()
}
@ -538,7 +537,7 @@ impl PyReplace {
}
}
#[derive(Debug, Clone)]
#[derive(Debug)]
/// Crate-private wrapper around a single Python object.
///
/// NOTE(review): presumably the object is a user-defined normalizer
/// implemented in Python — confirm against the trait impls for this type.
pub(crate) struct CustomNormalizer {
// Handle to the wrapped Python object.
inner: PyObject,
}
@ -581,7 +580,7 @@ impl<'de> Deserialize<'de> for CustomNormalizer {
}
}
#[derive(Debug, Clone, Deserialize)]
#[derive(Debug, Deserialize)]
#[serde(untagged)]
pub(crate) enum PyNormalizerWrapper {
Custom(CustomNormalizer),

View File

@ -618,7 +618,6 @@ impl PyUnicodeScripts {
}
}
#[derive(Clone)]
/// Crate-private wrapper around a single Python object.
///
/// NOTE(review): presumably the object is a user-defined pre-tokenizer
/// implemented in Python — confirm against the trait impls for this type.
pub(crate) struct CustomPreTokenizer {
// Handle to the wrapped Python object.
inner: PyObject,
}
@ -662,7 +661,7 @@ impl<'de> Deserialize<'de> for CustomPreTokenizer {
}
}
#[derive(Clone, Deserialize)]
#[derive(Deserialize)]
#[serde(untagged)]
pub(crate) enum PyPreTokenizerWrapper {
Custom(CustomPreTokenizer),

View File

@ -313,7 +313,7 @@ impl From<PyTemplate> for Template {
impl FromPyObject<'_> for PyTemplate {
fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
if let Ok(s) = ob.extract::<&str>() {
if let Ok(s) = ob.extract::<String>() {
Ok(Self(
s.try_into().map_err(exceptions::PyValueError::new_err)?,
))

View File

@ -136,8 +136,8 @@ impl PyAddedToken {
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"single_word" => token.single_word = Some(value.extract()?),
"lstrip" => token.lstrip = Some(value.extract()?),
"rstrip" => token.rstrip = Some(value.extract()?),
@ -159,8 +159,8 @@ impl PyAddedToken {
match state.downcast_bound::<PyDict>(py) {
Ok(state) => {
for (key, value) in state {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"content" => self.content = value.extract()?,
"single_word" => self.single_word = Some(value.extract()?),
"lstrip" => self.lstrip = Some(value.extract()?),
@ -287,7 +287,7 @@ impl FromPyObject<'_> for PyArrayUnicode {
}
let arr = ob.as_ptr() as *mut npyffi::PyArrayObject;
// SAFETY Getting all the metadata about the numpy array to check its sanity
let (type_num, elsize, alignment, data, nd, flags) = unsafe {
let (type_num, elsize, _alignment, data, nd, flags) = unsafe {
let desc = (*arr).descr;
(
(*desc).type_num,
@ -323,15 +323,16 @@ impl FromPyObject<'_> for PyArrayUnicode {
let seq = (0..n_elem)
.map(|i| {
let bytes = &all_bytes[i * elsize..(i + 1) * elsize];
let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
pyo3::ffi::PyUnicode_4BYTE_KIND as _,
bytes.as_ptr() as *const _,
elsize as isize / alignment as isize,
);
let py = ob.py();
let obj = PyObject::from_owned_ptr(py, unicode);
let s = obj.downcast_bound::<PyString>(py)?;
Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
Ok(std::str::from_utf8(bytes)?.to_owned())
// let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
// pyo3::ffi::PyUnicode_4BYTE_KIND as _,
// bytes.as_ptr() as *const _,
// elsize as isize / alignment as isize,
// );
// let py = ob.py();
// let obj = PyObject::from_owned_ptr(py, unicode);
// let s = obj.downcast_bound::<PyString>(py)?;
// Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
})
.collect::<PyResult<Vec<_>>>()?;
@ -736,12 +737,12 @@ impl PyTokenizer {
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"stride" => params.stride = value.extract()?,
"strategy" => {
let value: &str = value.extract()?;
params.strategy = match value {
let value: String = value.extract()?;
params.strategy = match value.as_ref() {
"longest_first" => Ok(TruncationStrategy::LongestFirst),
"only_first" => Ok(TruncationStrategy::OnlyFirst),
"only_second" => Ok(TruncationStrategy::OnlySecond),
@ -754,8 +755,8 @@ impl PyTokenizer {
}?
}
"direction" => {
let value: &str = value.extract()?;
params.direction = match value {
let value: String = value.extract()?;
params.direction = match value.as_ref() {
"left" => Ok(TruncationDirection::Left),
"right" => Ok(TruncationDirection::Right),
_ => Err(PyError(format!(
@ -838,11 +839,11 @@ impl PyTokenizer {
if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"direction" => {
let value: &str = value.extract()?;
params.direction = match value {
let value: String = value.extract()?;
params.direction = match value.as_ref() {
"left" => Ok(PaddingDirection::Left),
"right" => Ok(PaddingDirection::Right),
other => Err(PyError(format!(
@ -1341,7 +1342,7 @@ impl PyTokenizer {
// - An iterator, to allow batching
// - A string
if let Ok(s) = element.downcast::<PyString>() {
itertools::Either::Right(std::iter::once(s.to_str().map(|s| s.to_owned())))
itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned())))
} else {
match element.iter() {
Ok(iter) => itertools::Either::Left(

View File

@ -313,8 +313,8 @@ impl PyBpeTrainer {
let mut builder = tk::models::bpe::BpeTrainer::builder();
if let Some(kwargs) = kwargs {
for (key, val) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"vocab_size" => builder = builder.vocab_size(val.extract()?),
"min_frequency" => builder = builder.min_frequency(val.extract()?),
"show_progress" => builder = builder.show_progress(val.extract()?),
@ -520,8 +520,8 @@ impl PyWordPieceTrainer {
let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
if let Some(kwargs) = kwargs {
for (key, val) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"vocab_size" => builder = builder.vocab_size(val.extract()?),
"min_frequency" => builder = builder.min_frequency(val.extract()?),
"show_progress" => builder = builder.show_progress(val.extract()?),
@ -661,8 +661,8 @@ impl PyWordLevelTrainer {
if let Some(kwargs) = kwargs {
for (key, val) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"vocab_size" => {
builder.vocab_size(val.extract()?);
}
@ -828,8 +828,8 @@ impl PyUnigramTrainer {
let mut builder = tk::models::unigram::UnigramTrainer::builder();
if let Some(kwargs) = kwargs {
for (key, val) in kwargs {
let key: &str = key.extract()?;
match key {
let key: String = key.extract()?;
match key.as_ref() {
"vocab_size" => builder.vocab_size(val.extract()?),
"show_progress" => builder.show_progress(val.extract()?),
"n_sub_iterations" => builder.n_sub_iterations(val.extract()?),

View File

@ -8,7 +8,7 @@ use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehav
use tk::pattern::Pattern;
/// Represents a Pattern as used by `NormalizedString`
#[derive(Clone, FromPyObject)]
#[derive(FromPyObject)]
pub enum PyPattern {
#[pyo3(annotation = "str")]
Str(String),
@ -95,9 +95,9 @@ pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);
impl FromPyObject<'_> for PySplitDelimiterBehavior {
fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
let s = obj.extract::<&str>()?;
let s = obj.extract::<String>()?;
Ok(Self(match s {
Ok(Self(match s.as_ref() {
"removed" => Ok(SplitDelimiterBehavior::Removed),
"isolated" => Ok(SplitDelimiterBehavior::Isolated),
"merged_with_previous" => Ok(SplitDelimiterBehavior::MergedWithPrevious),

View File

@ -70,9 +70,9 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResul
pub struct PyOffsetReferential(OffsetReferential);
impl FromPyObject<'_> for PyOffsetReferential {
fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
let s = obj.extract::<&str>()?;
let s = obj.extract::<String>()?;
Ok(Self(match s {
Ok(Self(match s.as_ref() {
"original" => Ok(OffsetReferential::Original),
"normalized" => Ok(OffsetReferential::Normalized),
_ => Err(exceptions::PyValueError::new_err(
@ -86,9 +86,9 @@ impl FromPyObject<'_> for PyOffsetReferential {
pub struct PyOffsetType(OffsetType);
impl FromPyObject<'_> for PyOffsetType {
fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
let s = obj.extract::<&str>()?;
let s = obj.extract::<String>()?;
Ok(Self(match s {
Ok(Self(match s.as_ref() {
"byte" => Ok(OffsetType::Byte),
"char" => Ok(OffsetType::Char),
_ => Err(exceptions::PyValueError::new_err(