diff --git a/.github/workflows/python-release-conda.yml b/.github/workflows/python-release-conda.yml
index c6ef7449..81a8e369 100644
--- a/.github/workflows/python-release-conda.yml
+++ b/.github/workflows/python-release-conda.yml
@@ -14,10 +14,9 @@ jobs:
     strategy:
       matrix:
         os: [windows-latest, macos-latest]
-        python: ["3.7", "3.8", "3.9"]
-        # 3.10 Not yet available on Conda.
-        # python: ["3.7", "3.8", "3.9", "3.10"]
-
+        python: ["3.7", "3.8", "3.9", "3.10"]
+        # 3.11 Not yet available on Conda.
+        # python: ["3.7", "3.8", "3.9", "3.10", "3.11"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
diff --git a/.github/workflows/python-release.yml b/.github/workflows/python-release.yml
index cff9da5d..b1024f4f 100644
--- a/.github/workflows/python-release.yml
+++ b/.github/workflows/python-release.yml
@@ -34,7 +34,7 @@ jobs:
     runs-on: windows-latest
     strategy:
       matrix:
-        python: ["3.7", "3.8", "3.9", "3.10"]
+        python: ["3.7", "3.8", "3.9", "3.10", "3.11"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
diff --git a/bindings/node/CHANGELOG.md b/bindings/node/CHANGELOG.md
index 08f7f545..4afdcb0a 100644
--- a/bindings/node/CHANGELOG.md
+++ b/bindings/node/CHANGELOG.md
@@ -1,3 +1,7 @@
+## [0.13.2]
+
+- Python only changes.
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
diff --git a/bindings/node/native/Cargo.lock b/bindings/node/native/Cargo.lock
index f68951ca..3f84935e 100644
--- a/bindings/node/native/Cargo.lock
+++ b/bindings/node/native/Cargo.lock
@@ -995,7 +995,7 @@ dependencies = [

 [[package]]
 name = "node"
-version = "0.8.0"
+version = "0.13.2"
 dependencies = [
  "neon",
  "neon-build",
@@ -1668,7 +1668,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"

 [[package]]
 name = "tokenizers"
-version = "0.12.1"
+version = "0.13.2"
 dependencies = [
  "aho-corasick",
  "cached-path",
diff --git a/bindings/node/native/src/pre_tokenizers.rs b/bindings/node/native/src/pre_tokenizers.rs
index 8b22078a..151e0de6 100644
--- a/bindings/node/native/src/pre_tokenizers.rs
+++ b/bindings/node/native/src/pre_tokenizers.rs
@@ -227,7 +227,7 @@ fn sequence(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
         match pretokenizer.downcast::<JsPreTokenizer>().or_throw(&mut cx) {
             Ok(pretokenizer) => {
                 let guard = cx.lock();
-                let pretok = (*pretokenizer.borrow(&guard)).pretok.clone();
+                let pretok = pretokenizer.borrow(&guard).pretok.clone();
                 if let Some(pretokenizer) = pretok {
                     match pretokenizer {
                         JsPreTokenizerWrapper::Sequence(seq) => sequence.extend(seq),
diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md
index 31099f0c..56c8af84 100644
--- a/bindings/python/CHANGELOG.md
+++ b/bindings/python/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.13.2]
+
+- [#1096] Python 3.11 support
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
@@ -389,7 +393,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug with the IDs associated with added tokens.
 - Fix a bug that was causing crashes in Python 3.5

-
+[#1096]: https://github.com/huggingface/tokenizers/pull/1096
 [#1072]: https://github.com/huggingface/tokenizers/pull/1072
 [#956]: https://github.com/huggingface/tokenizers/pull/956
 [#1008]: https://github.com/huggingface/tokenizers/pull/1008
diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
index 9c4754d5..b58220af 100644
--- a/bindings/python/Cargo.lock
+++ b/bindings/python/Cargo.lock
@@ -1720,7 +1720,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"

 [[package]]
 name = "tokenizers"
-version = "0.13.1"
+version = "0.13.2"
 dependencies = [
  "aho-corasick",
  "cached-path",
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index d6b98bd7..86b1b036 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.13.1"
+version = "0.13.2"
 authors = ["Anthony MOI "]
 edition = "2021"

diff --git a/bindings/python/build-wheels.sh b/bindings/python/build-wheels.sh
index 3efe9107..dd615060 100755
--- a/bindings/python/build-wheels.sh
+++ b/bindings/python/build-wheels.sh
@@ -8,7 +8,7 @@ fi

 export PATH="$HOME/.cargo/bin:$PATH"

-for PYBIN in /opt/python/cp{37,38,39,310}*/bin; do
+for PYBIN in /opt/python/cp{37,38,39,310,311}*/bin; do
     export PYTHON_SYS_EXECUTABLE="$PYBIN/python"

     "${PYBIN}/pip" install -U setuptools-rust setuptools wheel
diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py
index bb721775..a19d2acb 100644
--- a/bindings/python/py_src/tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.13.2.dev0"
+__version__ = "0.13.2"

 from enum import Enum
 from typing import List, Tuple, Union
diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index 2d8b9566..b2df4600 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -9,7 +9,7 @@ extras["dev"] = extras["testing"]

 setup(
     name="tokenizers",
-    version="0.13.2.dev0",
+    version="0.13.2",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 1a7102c9..b6f8c031 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -353,7 +353,7 @@ impl PySequenceDecoder {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index 34f47486..956f865b 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -360,7 +360,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }

     fn __len__(&self) -> usize {
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 30c74c67..71f05d7e 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -355,7 +355,7 @@ impl PySplit {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[" ", "removed"])
+        PyTuple::new(py, [" ", "removed"])
     }
 }

@@ -387,7 +387,7 @@ impl PyCharDelimiterSplit {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[" "])
+        PyTuple::new(py, [" "])
     }
 }

@@ -450,7 +450,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 61335240..641fe229 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -167,7 +167,7 @@ impl PyBertProcessing {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[("", 0), ("", 0)])
+        PyTuple::new(py, [("", 0), ("", 0)])
     }
 }

@@ -218,7 +218,7 @@ impl PyRobertaProcessing {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[("", 0), ("", 0)])
+        PyTuple::new(py, [("", 0), ("", 0)])
     }
 }

@@ -441,7 +441,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

diff --git a/tokenizers/CHANGELOG.md b/tokenizers/CHANGELOG.md
index d07dada4..a9329bc9 100644
--- a/tokenizers/CHANGELOG.md
+++ b/tokenizers/CHANGELOG.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.13.2]
+
+- Python only changes
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
diff --git a/tokenizers/src/normalizers/bert.rs b/tokenizers/src/normalizers/bert.rs
index 75ce1639..a6b014d2 100644
--- a/tokenizers/src/normalizers/bert.rs
+++ b/tokenizers/src/normalizers/bert.rs
@@ -99,7 +99,7 @@ impl BertNormalizer {
         let mut new_chars: Vec<(char, isize)> = vec![];
         normalized.for_each(|c| {
             if is_chinese_char(c) {
-                new_chars.extend(&[(' ', 0), (c, 1), (' ', 1)]);
+                new_chars.extend([(' ', 0), (c, 1), (' ', 1)]);
             } else {
                 new_chars.push((c, 0));
             }
diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index b8468afc..2f21f161 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -135,7 +135,7 @@ impl PreTokenizer for ByteLevel {
                 bytes
                     .iter()
                     .enumerate()
-                    .map(|(i, b)| (BYTES_CHAR[b], if i > 0 { 1 } else { 0 })),
+                    .map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))),
             );
         }
         normalized.transform(transformations.into_iter(), 0);
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 7ce412e1..bfbb4e0f 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -167,10 +167,10 @@ impl AddedVocabulary {
     pub fn new() -> Self {
         let trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build::<_, &&[u8]>(&[]);
+            .build::<_, &&[u8]>([]);
         let normalized_trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build::<_, &&[u8]>(&[]);
+            .build::<_, &&[u8]>([]);
         Self {
             added_tokens_map: HashMap::new(),
             added_tokens_map_r: HashMap::new(),
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 10089c24..ac16ce92 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -546,7 +546,7 @@ impl NormalizedString {
         let mut new_chars: Vec<(char, isize)> = vec![];
         self.for_each(|c| {
             c.to_lowercase().enumerate().for_each(|(index, c)| {
-                new_chars.push((c, if index > 0 { 1 } else { 0 }));
+                new_chars.push((c, isize::from(index > 0)));
             })
         });
         self.transform(new_chars.into_iter(), 0);
@@ -558,7 +558,7 @@
         let mut new_chars: Vec<(char, isize)> = vec![];
         self.for_each(|c| {
             c.to_uppercase().enumerate().for_each(|(index, c)| {
-                new_chars.push((c, if index > 0 { 1 } else { 0 }));
+                new_chars.push((c, isize::from(index > 0)));
             })
         });
         self.transform(new_chars.into_iter(), 0);
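A note on the recurring `isize::from(i > 0)` change in `byte_level.rs` and `normalizer.rs`: Rust implements `From<bool>` for the integer types, so the conversion yields 0 for `false` and 1 for `true`, exactly like the old `if i > 0 { 1 } else { 0 }`. A minimal standalone sketch, not part of the patch:

```rust
fn main() {
    for (i, c) in "abc".chars().enumerate() {
        // Old form: explicit branch. New form: `From<bool>` gives 0 or 1 directly.
        let old_offset: isize = if i > 0 { 1 } else { 0 };
        let new_offset = isize::from(i > 0);
        assert_eq!(old_offset, new_offset);
        println!("{c}: {new_offset}");
    }
}
```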
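The `&[...]` to `[...]` changes (`Vec::extend` in `bert.rs`, the `PyTuple::new` calls, `AhoCorasickBuilder::build`) all rely on the same language feature: since Rust 1.53, arrays implement `IntoIterator` by value, so APIs that take `impl IntoIterator` accept an array literal without the extra borrow. A small sketch under that assumption, with no PyO3 or aho-corasick dependency:

```rust
fn main() {
    let mut new_chars: Vec<(char, isize)> = Vec::new();

    // Old style: borrow a slice; `Vec` can extend from `&T` items here
    // because `(char, isize)` is `Copy`.
    new_chars.extend(&[(' ', 0), ('a', 1)]);

    // New style (Rust 1.53+): the array is moved in and iterated by value.
    new_chars.extend([(' ', 1), ('b', 1)]);

    assert_eq!(new_chars.len(), 4);
}
```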
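The Node-binding change from `(*pretokenizer.borrow(&guard)).pretok` to `pretokenizer.borrow(&guard).pretok` works because field access auto-dereferences through `Deref` implementations, making the explicit `(*...)` redundant. An illustrative sketch with a hypothetical `Guarded` type standing in for the neon borrow guard (not a real neon API):

```rust
use std::ops::Deref;

struct Inner {
    pretok: Option<String>,
}

// Hypothetical stand-in for the guard returned by `borrow(&guard)`.
struct Guarded(Inner);

impl Deref for Guarded {
    type Target = Inner;
    fn deref(&self) -> &Inner {
        &self.0
    }
}

fn main() {
    let guarded = Guarded(Inner {
        pretok: Some("whitespace".to_string()),
    });
    // Explicit deref and auto-deref field access are equivalent.
    let explicit = (*guarded).pretok.clone();
    let auto = guarded.pretok.clone();
    assert_eq!(explicit, auto);
}
```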