mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Add python 3.11 to manylinux buildwheels (#1096)
* Add python 3.11 to manylinux buildwheels
* Fixing clippy.
* Node clippy.
* Python clippy.
* Changelog + version number update.

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
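The clippy items in this commit message boil down to two recurring patterns visible in the hunks below: replacing `if cond { 1 } else { 0 }` with `isize::from(cond)`, and passing array literals by value instead of by reference (`extend([...])` rather than `extend(&[...])`). A minimal, self-contained sketch of both idioms; the `char_offsets` helper is purely illustrative and not part of the repository:

```rust
// Hedged sketch only: `char_offsets` is an illustrative helper, not code from
// this repository. It demonstrates the two clippy idioms applied in the diff.
fn char_offsets(chars: &[char]) -> Vec<(char, isize)> {
    chars
        .iter()
        .enumerate()
        // before: .map(|(i, &c)| (c, if i > 0 { 1 } else { 0 }))
        .map(|(i, &c)| (c, isize::from(i > 0)))
        .collect()
}

fn main() {
    let mut new_chars: Vec<(char, isize)> = vec![(' ', 0)];
    // before: new_chars.extend(&[('a', 1), (' ', 1)]);  // needless borrow
    new_chars.extend([('a', 1), (' ', 1)]);
    new_chars.extend(char_offsets(&['b', 'c']));
    assert_eq!(new_chars.last(), Some(&('c', 1)));
    println!("{new_chars:?}");
}
```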
7 .github/workflows/python-release-conda.yml (vendored)
@@ -14,10 +14,9 @@ jobs:
     strategy:
       matrix:
         os: [windows-latest, macos-latest]
-        python: ["3.7", "3.8", "3.9"]
-        # 3.10 Not yet available on Conda.
-        # python: ["3.7", "3.8", "3.9", "3.10"]
-
+        python: ["3.7", "3.8", "3.9", "3.10"]
+        # 3.11 Not yet available on Conda.
+        # python: ["3.7", "3.8", "3.9", "3.10", "3.11"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
2 .github/workflows/python-release.yml (vendored)
@@ -34,7 +34,7 @@ jobs:
     runs-on: windows-latest
     strategy:
       matrix:
-        python: ["3.7", "3.8", "3.9", "3.10"]
+        python: ["3.7", "3.8", "3.9", "3.10", "3.11"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
@@ -1,3 +1,7 @@
+## [0.13.2]
+
+- Python only changes.
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
4 bindings/node/native/Cargo.lock (generated)
@@ -995,7 +995,7 @@ dependencies = [

 [[package]]
 name = "node"
-version = "0.8.0"
+version = "0.13.2"
 dependencies = [
  "neon",
  "neon-build",
@@ -1668,7 +1668,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"

 [[package]]
 name = "tokenizers"
-version = "0.12.1"
+version = "0.13.2"
 dependencies = [
  "aho-corasick",
  "cached-path",
@@ -227,7 +227,7 @@ fn sequence(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
         match pretokenizer.downcast::<JsPreTokenizer>().or_throw(&mut cx) {
             Ok(pretokenizer) => {
                 let guard = cx.lock();
-                let pretok = (*pretokenizer.borrow(&guard)).pretok.clone();
+                let pretok = pretokenizer.borrow(&guard).pretok.clone();
                 if let Some(pretokenizer) = pretok {
                     match pretokenizer {
                         JsPreTokenizerWrapper::Sequence(seq) => sequence.extend(seq),
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.13.2]
+
+- [#1096] Python 3.11 support
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
@@ -389,7 +393,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug with the IDs associated with added tokens.
 - Fix a bug that was causing crashes in Python 3.5

-
+[#1096]: https://github.com/huggingface/tokenizers/pull/1096
 [#1072]: https://github.com/huggingface/tokenizers/pull/1072
 [#956]: https://github.com/huggingface/tokenizers/pull/956
 [#1008]: https://github.com/huggingface/tokenizers/pull/1008
2 bindings/python/Cargo.lock (generated)
@@ -1720,7 +1720,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"

 [[package]]
 name = "tokenizers"
-version = "0.13.1"
+version = "0.13.2"
 dependencies = [
  "aho-corasick",
  "cached-path",
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.13.1"
+version = "0.13.2"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2021"

@@ -8,7 +8,7 @@ fi

 export PATH="$HOME/.cargo/bin:$PATH"

-for PYBIN in /opt/python/cp{37,38,39,310}*/bin; do
+for PYBIN in /opt/python/cp{37,38,39,310,311}*/bin; do
     export PYTHON_SYS_EXECUTABLE="$PYBIN/python"

     "${PYBIN}/pip" install -U setuptools-rust setuptools wheel
@@ -1,4 +1,4 @@
-__version__ = "0.13.2.dev0"
+__version__ = "0.13.2"

 from enum import Enum
 from typing import List, Tuple, Union
@@ -9,7 +9,7 @@ extras["dev"] = extras["testing"]

 setup(
     name="tokenizers",
-    version="0.13.2.dev0",
+    version="0.13.2",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
@@ -353,7 +353,7 @@ impl PySequenceDecoder {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

@@ -360,7 +360,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }

     fn __len__(&self) -> usize {
@@ -355,7 +355,7 @@ impl PySplit {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[" ", "removed"])
+        PyTuple::new(py, [" ", "removed"])
     }
 }

@@ -387,7 +387,7 @@ impl PyCharDelimiterSplit {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[" "])
+        PyTuple::new(py, [" "])
     }
 }

@@ -450,7 +450,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

@@ -167,7 +167,7 @@ impl PyBertProcessing {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[("", 0), ("", 0)])
+        PyTuple::new(py, [("", 0), ("", 0)])
     }
 }

@@ -218,7 +218,7 @@ impl PyRobertaProcessing {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[("", 0), ("", 0)])
+        PyTuple::new(py, [("", 0), ("", 0)])
     }
 }

@@ -441,7 +441,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.13.2]
+
+- Python only changes
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
@@ -99,7 +99,7 @@ impl BertNormalizer {
         let mut new_chars: Vec<(char, isize)> = vec![];
         normalized.for_each(|c| {
             if is_chinese_char(c) {
-                new_chars.extend(&[(' ', 0), (c, 1), (' ', 1)]);
+                new_chars.extend([(' ', 0), (c, 1), (' ', 1)]);
             } else {
                 new_chars.push((c, 0));
             }
@@ -135,7 +135,7 @@ impl PreTokenizer for ByteLevel {
                    bytes
                        .iter()
                        .enumerate()
-                        .map(|(i, b)| (BYTES_CHAR[b], if i > 0 { 1 } else { 0 })),
+                        .map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))),
                );
            }
            normalized.transform(transformations.into_iter(), 0);
@@ -167,10 +167,10 @@ impl AddedVocabulary {
     pub fn new() -> Self {
         let trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build::<_, &&[u8]>(&[]);
+            .build::<_, &&[u8]>([]);
         let normalized_trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build::<_, &&[u8]>(&[]);
+            .build::<_, &&[u8]>([]);
         Self {
             added_tokens_map: HashMap::new(),
             added_tokens_map_r: HashMap::new(),
@@ -546,7 +546,7 @@ impl NormalizedString {
         let mut new_chars: Vec<(char, isize)> = vec![];
         self.for_each(|c| {
             c.to_lowercase().enumerate().for_each(|(index, c)| {
-                new_chars.push((c, if index > 0 { 1 } else { 0 }));
+                new_chars.push((c, isize::from(index > 0)));
             })
         });
         self.transform(new_chars.into_iter(), 0);
@@ -558,7 +558,7 @@ impl NormalizedString {
         let mut new_chars: Vec<(char, isize)> = vec![];
         self.for_each(|c| {
             c.to_uppercase().enumerate().for_each(|(index, c)| {
-                new_chars.push((c, if index > 0 { 1 } else { 0 }));
+                new_chars.push((c, isize::from(index > 0)));
             })
         });
         self.transform(new_chars.into_iter(), 0);