mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00

Testing ABI3 wheels to reduce number of wheels (#1674)

* Testing ABI3 wheels to reduce number of wheels
* No need for py-clone anymore.
* Upgrade python versions.
* Remove those flakes.
* Promoting new CI + Fixing secret.

.github/workflows/CI.yml (new file, 181 lines)

@@ -0,0 +1,181 @@
# This file is autogenerated by maturin v1.7.4
# To update, run
#
#    maturin generate-ci github -m bindings/python/Cargo.toml
#
name: CI

on:
  push:
    branches:
      - main
      - master
    tags:
      - '*'
  pull_request:
  workflow_dispatch:

permissions:
  contents: read

jobs:
  linux:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: ubuntu-latest
            target: x86_64
          - runner: ubuntu-latest
            target: x86
          - runner: ubuntu-latest
            target: aarch64
          - runner: ubuntu-latest
            target: armv7
          - runner: ubuntu-latest
            target: s390x
          - runner: ubuntu-latest
            target: ppc64le
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --release --out dist --manifest-path bindings/python/Cargo.toml
          sccache: 'true'
          manylinux: auto
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-linux-${{ matrix.platform.target }}
          path: dist

  musllinux:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: ubuntu-latest
            target: x86_64
          - runner: ubuntu-latest
            target: x86
          - runner: ubuntu-latest
            target: aarch64
          - runner: ubuntu-latest
            target: armv7
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --release --out dist --manifest-path bindings/python/Cargo.toml
          sccache: 'true'
          manylinux: musllinux_1_2
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-musllinux-${{ matrix.platform.target }}
          path: dist

  windows:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: windows-latest
            target: x64
          - runner: windows-latest
            target: x86
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
          architecture: ${{ matrix.platform.target }}
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --release --out dist --manifest-path bindings/python/Cargo.toml
          sccache: 'true'
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-windows-${{ matrix.platform.target }}
          path: dist

  macos:
    runs-on: ${{ matrix.platform.runner }}
    strategy:
      matrix:
        platform:
          - runner: macos-12
            target: x86_64
          - runner: macos-14
            target: aarch64
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
      - name: Build wheels
        uses: PyO3/maturin-action@v1
        with:
          target: ${{ matrix.platform.target }}
          args: --release --out dist --manifest-path bindings/python/Cargo.toml
          sccache: 'true'
      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          name: wheels-macos-${{ matrix.platform.target }}
          path: dist

  sdist:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build sdist
        uses: PyO3/maturin-action@v1
        with:
          command: sdist
          args: --out dist --manifest-path bindings/python/Cargo.toml
      - name: Upload sdist
        uses: actions/upload-artifact@v4
        with:
          name: wheels-sdist
          path: dist

  release:
    name: Release
    runs-on: ubuntu-latest
    if: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'workflow_dispatch' }}
    needs: [linux, musllinux, windows, macos, sdist]
    permissions:
      # Use to sign the release artifacts
      id-token: write
      # Used to upload release artifacts
      contents: write
      # Used to generate artifact attestation
      attestations: write
    steps:
      - uses: actions/download-artifact@v4
      - name: Generate artifact attestation
        uses: actions/attest-build-provenance@v1
        with:
          subject-path: 'wheels-*/*'
      - name: Publish to PyPI
        if: "startsWith(github.ref, 'refs/tags/')"
        uses: PyO3/maturin-action@v1
        env:
          MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_TOKEN_DIST }}
        with:
          command: upload
          args: --non-interactive --skip-existing wheels-*/*

.github/workflows/python-release.yml (11 lines changed)

@@ -164,8 +164,9 @@ jobs:
       with:
         path: ./bindings/python/dist
         merge-multiple: true
-      - name: Upload to PyPi
-        working-directory: ./bindings/python
-        run: |
-          pip install twine
-          twine upload dist/* -u __token__ -p "$PYPI_TOKEN"
+      # Temporary deactivation while testing abi3 CI
+      # - name: Upload to PyPi
+      #   working-directory: ./bindings/python
+      #   run: |
+      #     pip install twine
+      #     twine upload dist/* -u __token__ -p "$PYPI_TOKEN"

.github/workflows/python.yml (4 lines changed)

@@ -16,7 +16,7 @@ jobs:
     runs-on: windows-latest
     strategy:
       matrix:
-        python: ["3.7", "3.8", "3.9", "3.10"]
+        python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4

@@ -72,7 +72,7 @@ jobs:
       - name: Install Python
         uses: actions/setup-python@v5
         with:
-          python-version: 3.11
+          python-version: 3.13
           architecture: "x64"
 
 

bindings/python/Cargo.toml

@@ -14,7 +14,7 @@ serde = { version = "1.0", features = ["rc", "derive"] }
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
-pyo3 = { version = "0.22", features = ["py-clone"] }
+pyo3 = { version = "0.22", features = ["abi3", "abi3-py39"] }
 numpy = "0.22"
 ndarray = "0.15"
 itertools = "0.12"

@@ -24,7 +24,7 @@ path = "../../tokenizers"
 
 [dev-dependencies]
 tempfile = "3.10"
-pyo3 = { version = "0.22", features = ["auto-initialize", "py-clone"] }
+pyo3 = { version = "0.22", features = ["auto-initialize"] }
 
 [features]
 defaut = ["pyo3/extension-module"]
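
For context on the two feature changes above: `abi3` builds the extension against CPython's stable ABI, so maturin tags the wheel `cp39-abi3` and a single build per platform covers Python 3.9 and every later version, which is what lets the new CI publish far fewer wheels. A minimal sketch of a pyo3 0.22 module that builds under these features (module and function names are hypothetical, not from this repo):

    use pyo3::prelude::*;

    #[pyfunction]
    fn greet(name: String) -> String {
        format!("Hello, {name}!")
    }

    // With `abi3-py39` enabled in Cargo.toml, this compiled artifact is
    // importable unchanged on Python 3.9, 3.10, 3.11, and later.
    #[pymodule]
    fn example(m: &Bound<'_, PyModule>) -> PyResult<()> {
        m.add_function(wrap_pyfunction!(greet, m)?)?;
        Ok(())
    }

The remaining hunks are the Rust-side fallout in the bindings: dropping `py-clone` removes `Clone` from `PyObject`, and the limited API rules out a few CPython calls the code previously relied on.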

@@ -488,7 +488,6 @@ impl PySequenceDecoder {
     }
 }
 
-#[derive(Clone)]
 pub(crate) struct CustomDecoder {
     inner: PyObject,
 }
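
This hunk and the similar `#[derive(Clone)]` removals below follow from dropping pyo3's `py-clone` feature: without it, `PyObject` no longer implements `Clone`, because safely duplicating a Python reference requires the GIL. A hedged, self-contained illustration of the explicit alternative (type name borrowed from the diff, method placement hypothetical):

    use pyo3::prelude::*;

    pub(crate) struct CustomDecoder {
        inner: PyObject,
    }

    impl CustomDecoder {
        // `Py::clone_ref` is the GIL-holding replacement for a derived
        // `Clone`: it bumps the reference count under the token `py`.
        fn clone_ref(&self, py: Python<'_>) -> Self {
            CustomDecoder {
                inner: self.inner.clone_ref(py),
            }
        }
    }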

@@ -399,11 +399,11 @@ impl PyEncoding {
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        direction = match value {
+                        let value: String = value.extract()?;
+                        direction = match value.as_ref() {
                             "left" => Ok(PaddingDirection::Left),
                             "right" => Ok(PaddingDirection::Right),
                             other => Err(PyError(format!(
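
The `&str` -> `String` change here recurs through the rest of the diff. It looks like an abi3 constraint: `extract::<&str>()` borrows UTF-8 directly out of the Python string via an API the stable ABI only exposes from Python 3.10, so with `abi3-py39` the bindings extract an owned `String` and match on it through `as_ref()`. A self-contained sketch of the pattern (function and values hypothetical):

    use pyo3::exceptions::PyValueError;
    use pyo3::prelude::*;

    fn parse_direction(value: &Bound<'_, PyAny>) -> PyResult<bool> {
        // Owned extraction works under the limited API; borrowing does not.
        let value: String = value.extract()?;
        match value.as_ref() {
            "left" => Ok(true),
            "right" => Ok(false),
            other => Err(PyValueError::new_err(format!(
                "unknown direction: {other}"
            ))),
        }
    }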

@@ -276,8 +276,8 @@ impl PyBPE {
     ) -> PyResult<(Self, PyModel)> {
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "cache_capacity" => builder = builder.cache_capacity(value.extract()?),
                     "dropout" => {
                         if let Some(dropout) = value.extract()? {
|
|||||||
) -> PyResult<(Self, PyModel)> {
|
) -> PyResult<(Self, PyModel)> {
|
||||||
if let Some(kwargs) = kwargs {
|
if let Some(kwargs) = kwargs {
|
||||||
for (key, val) in kwargs {
|
for (key, val) in kwargs {
|
||||||
let key: &str = key.extract()?;
|
let key: String = key.extract()?;
|
||||||
match key {
|
match key.as_ref() {
|
||||||
"unk_token" => {
|
"unk_token" => {
|
||||||
builder = builder.unk_token(val.extract()?);
|
builder = builder.unk_token(val.extract()?);
|
||||||
}
|
}
|
||||||
|

@@ -184,9 +184,8 @@ macro_rules! getter {
         let super_ = $self.as_ref();
         if let PyNormalizerTypeWrapper::Single(ref norm) = super_.normalizer {
             let wrapper = norm.read().unwrap();
-            if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone()
-            {
-                o.$name
+            if let PyNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (&*wrapper) {
+                o.$name.clone()
             } else {
                 unreachable!()
             }
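
The getter previously cloned the entire wrapper just to pattern-match one variant, which is impossible once the wrapper holds a non-`Clone` `PyObject`. The new code matches on a shared borrow and clones only the field it returns. A standalone illustration with hypothetical types:

    use std::sync::RwLock;

    struct Inner {
        name: String,
    }

    enum Wrapper {
        Wrapped(Inner),
    }

    fn getter(lock: &RwLock<Wrapper>) -> String {
        let guard = lock.read().unwrap();
        // Before: `(*guard).clone()` required `Wrapper: Clone`.
        // After: borrow the guarded value, clone only the returned field.
        let Wrapper::Wrapped(inner) = &*guard;
        inner.name.clone()
    }

    fn main() {
        let w = RwLock::new(Wrapper::Wrapped(Inner { name: "nfc".into() }));
        assert_eq!(getter(&w), "nfc");
    }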

@@ -538,7 +537,7 @@ impl PyReplace {
     }
 }
 
-#[derive(Debug, Clone)]
+#[derive(Debug)]
 pub(crate) struct CustomNormalizer {
     inner: PyObject,
 }

@@ -581,7 +580,7 @@ impl<'de> Deserialize<'de> for CustomNormalizer {
     }
 }
 
-#[derive(Debug, Clone, Deserialize)]
+#[derive(Debug, Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyNormalizerWrapper {
     Custom(CustomNormalizer),

@@ -618,7 +618,6 @@ impl PyUnicodeScripts {
     }
 }
 
-#[derive(Clone)]
 pub(crate) struct CustomPreTokenizer {
     inner: PyObject,
 }

@@ -662,7 +661,7 @@ impl<'de> Deserialize<'de> for CustomPreTokenizer {
     }
 }
 
-#[derive(Clone, Deserialize)]
+#[derive(Deserialize)]
 #[serde(untagged)]
 pub(crate) enum PyPreTokenizerWrapper {
     Custom(CustomPreTokenizer),

@@ -313,7 +313,7 @@ impl From<PyTemplate> for Template {
 
 impl FromPyObject<'_> for PyTemplate {
     fn extract_bound(ob: &Bound<'_, PyAny>) -> PyResult<Self> {
-        if let Ok(s) = ob.extract::<&str>() {
+        if let Ok(s) = ob.extract::<String>() {
             Ok(Self(
                 s.try_into().map_err(exceptions::PyValueError::new_err)?,
             ))

@@ -136,8 +136,8 @@ impl PyAddedToken {
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "single_word" => token.single_word = Some(value.extract()?),
                     "lstrip" => token.lstrip = Some(value.extract()?),
                     "rstrip" => token.rstrip = Some(value.extract()?),

@@ -159,8 +159,8 @@ impl PyAddedToken {
         match state.downcast_bound::<PyDict>(py) {
             Ok(state) => {
                 for (key, value) in state {
-                    let key: &str = key.extract()?;
-                    match key {
+                    let key: String = key.extract()?;
+                    match key.as_ref() {
                         "content" => self.content = value.extract()?,
                         "single_word" => self.single_word = Some(value.extract()?),
                         "lstrip" => self.lstrip = Some(value.extract()?),

@@ -287,7 +287,7 @@ impl FromPyObject<'_> for PyArrayUnicode {
         }
         let arr = ob.as_ptr() as *mut npyffi::PyArrayObject;
         // SAFETY Getting all the metadata about the numpy array to check its sanity
-        let (type_num, elsize, alignment, data, nd, flags) = unsafe {
+        let (type_num, elsize, _alignment, data, nd, flags) = unsafe {
            let desc = (*arr).descr;
            (
                (*desc).type_num,

@@ -323,15 +323,16 @@ impl FromPyObject<'_> for PyArrayUnicode {
         let seq = (0..n_elem)
             .map(|i| {
                 let bytes = &all_bytes[i * elsize..(i + 1) * elsize];
-                let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
-                    pyo3::ffi::PyUnicode_4BYTE_KIND as _,
-                    bytes.as_ptr() as *const _,
-                    elsize as isize / alignment as isize,
-                );
-                let py = ob.py();
-                let obj = PyObject::from_owned_ptr(py, unicode);
-                let s = obj.downcast_bound::<PyString>(py)?;
-                Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
+                Ok(std::str::from_utf8(bytes)?.to_owned())
+                // let unicode = pyo3::ffi::PyUnicode_FromKindAndData(
+                //     pyo3::ffi::PyUnicode_4BYTE_KIND as _,
+                //     bytes.as_ptr() as *const _,
+                //     elsize as isize / alignment as isize,
+                // );
+                // let py = ob.py();
+                // let obj = PyObject::from_owned_ptr(py, unicode);
+                // let s = obj.downcast_bound::<PyString>(py)?;
+                // Ok(s.to_string_lossy().trim_matches(char::from(0)).to_owned())
             })
             .collect::<PyResult<Vec<_>>>()?;
 
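
`PyUnicode_FromKindAndData` is not part of the stable ABI, which is presumably why the element decoding above now goes through plain UTF-8 instead of a CPython FFI call. A dependency-free sketch of the new approach (in the real code, `elsize` and the byte buffer come from the numpy array descriptor):

    fn decode_elements(
        all_bytes: &[u8],
        elsize: usize,
    ) -> Result<Vec<String>, std::str::Utf8Error> {
        all_bytes
            .chunks_exact(elsize)
            // Each element sits in a fixed-width slot of `elsize` bytes;
            // decode it as UTF-8 and take an owned copy, as the diff does.
            .map(|bytes| std::str::from_utf8(bytes).map(str::to_owned))
            .collect()
    }

    fn main() {
        let raw = b"ab\0\0cd\0\0";
        println!("{:?}", decode_elements(raw, 4));
    }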

@@ -736,12 +737,12 @@ impl PyTokenizer {
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "stride" => params.stride = value.extract()?,
                     "strategy" => {
-                        let value: &str = value.extract()?;
-                        params.strategy = match value {
+                        let value: String = value.extract()?;
+                        params.strategy = match value.as_ref() {
                             "longest_first" => Ok(TruncationStrategy::LongestFirst),
                             "only_first" => Ok(TruncationStrategy::OnlyFirst),
                             "only_second" => Ok(TruncationStrategy::OnlySecond),

@@ -754,8 +755,8 @@ impl PyTokenizer {
                         }?
                     }
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        params.direction = match value {
+                        let value: String = value.extract()?;
+                        params.direction = match value.as_ref() {
                             "left" => Ok(TruncationDirection::Left),
                             "right" => Ok(TruncationDirection::Right),
                             _ => Err(PyError(format!(

@@ -838,11 +839,11 @@ impl PyTokenizer {
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "direction" => {
-                        let value: &str = value.extract()?;
-                        params.direction = match value {
+                        let value: String = value.extract()?;
+                        params.direction = match value.as_ref() {
                             "left" => Ok(PaddingDirection::Left),
                             "right" => Ok(PaddingDirection::Right),
                             other => Err(PyError(format!(

@@ -1341,7 +1342,7 @@ impl PyTokenizer {
             // - An iterator, to allow batching
             // - A string
             if let Ok(s) = element.downcast::<PyString>() {
-                itertools::Either::Right(std::iter::once(s.to_str().map(|s| s.to_owned())))
+                itertools::Either::Right(std::iter::once(s.to_cow().map(|s| s.into_owned())))
             } else {
                 match element.iter() {
                     Ok(iter) => itertools::Either::Left(
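
`PyString::to_str` borrows the UTF-8 buffer in place through a call the limited API only provides from Python 3.10, so under `abi3-py39` the code switches to `to_cow()`, which borrows when it can and copies when it must. A minimal sketch (function name hypothetical):

    use pyo3::prelude::*;
    use pyo3::types::PyString;

    fn owned_string(s: &Bound<'_, PyString>) -> PyResult<String> {
        // to_cow() works under abi3 targeting < 3.10; to_str() does not.
        Ok(s.to_cow()?.into_owned())
    }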

@@ -313,8 +313,8 @@ impl PyBpeTrainer {
         let mut builder = tk::models::bpe::BpeTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder = builder.vocab_size(val.extract()?),
                     "min_frequency" => builder = builder.min_frequency(val.extract()?),
                     "show_progress" => builder = builder.show_progress(val.extract()?),

@@ -520,8 +520,8 @@ impl PyWordPieceTrainer {
         let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder = builder.vocab_size(val.extract()?),
                     "min_frequency" => builder = builder.min_frequency(val.extract()?),
                     "show_progress" => builder = builder.show_progress(val.extract()?),

@@ -661,8 +661,8 @@ impl PyWordLevelTrainer {
 
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => {
                         builder.vocab_size(val.extract()?);
                     }

@@ -828,8 +828,8 @@ impl PyUnigramTrainer {
         let mut builder = tk::models::unigram::UnigramTrainer::builder();
         if let Some(kwargs) = kwargs {
             for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
+                let key: String = key.extract()?;
+                match key.as_ref() {
                     "vocab_size" => builder.vocab_size(val.extract()?),
                     "show_progress" => builder.show_progress(val.extract()?),
                     "n_sub_iterations" => builder.n_sub_iterations(val.extract()?),

@@ -8,7 +8,7 @@ use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehav
 use tk::pattern::Pattern;
 
 /// Represents a Pattern as used by `NormalizedString`
-#[derive(Clone, FromPyObject)]
+#[derive(FromPyObject)]
 pub enum PyPattern {
     #[pyo3(annotation = "str")]
     Str(String),

@@ -95,9 +95,9 @@ pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);
 
 impl FromPyObject<'_> for PySplitDelimiterBehavior {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
+        let s = obj.extract::<String>()?;
 
-        Ok(Self(match s {
+        Ok(Self(match s.as_ref() {
             "removed" => Ok(SplitDelimiterBehavior::Removed),
             "isolated" => Ok(SplitDelimiterBehavior::Isolated),
             "merged_with_previous" => Ok(SplitDelimiterBehavior::MergedWithPrevious),

@@ -70,9 +70,9 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &Bound<'_, PyAny>) -> PyResul
 pub struct PyOffsetReferential(OffsetReferential);
 impl FromPyObject<'_> for PyOffsetReferential {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
+        let s = obj.extract::<String>()?;
 
-        Ok(Self(match s {
+        Ok(Self(match s.as_ref() {
             "original" => Ok(OffsetReferential::Original),
             "normalized" => Ok(OffsetReferential::Normalized),
             _ => Err(exceptions::PyValueError::new_err(

@@ -86,9 +86,9 @@ impl FromPyObject<'_> for PyOffsetReferential {
 pub struct PyOffsetType(OffsetType);
 impl FromPyObject<'_> for PyOffsetType {
     fn extract_bound(obj: &Bound<'_, PyAny>) -> PyResult<Self> {
-        let s = obj.extract::<&str>()?;
+        let s = obj.extract::<String>()?;
 
-        Ok(Self(match s {
+        Ok(Self(match s.as_ref() {
             "byte" => Ok(OffsetType::Byte),
             "char" => Ok(OffsetType::Char),
             _ => Err(exceptions::PyValueError::new_err(