mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-31 04:29:21 +00:00
Add python 3.11 to manylinux buildwheels (#1096)
* Add python 3.11 to manylinux buildwheels * Fixing clippy. * Node clippy. * Python clippy. * Changelog + version number update. Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
7
.github/workflows/python-release-conda.yml
vendored
7
.github/workflows/python-release-conda.yml
vendored
@ -14,10 +14,9 @@ jobs:
|
|||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
os: [windows-latest, macos-latest]
|
os: [windows-latest, macos-latest]
|
||||||
python: ["3.7", "3.8", "3.9"]
|
python: ["3.7", "3.8", "3.9", "3.10"]
|
||||||
# 3.10 Not yet available on Conda.
|
# 3.11 Not yet available on Conda.
|
||||||
# python: ["3.7", "3.8", "3.9", "3.10"]
|
# python: ["3.7", "3.8", "3.9", "3.10", "3.11"]
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v2
|
uses: actions/checkout@v2
|
||||||
|
2
.github/workflows/python-release.yml
vendored
2
.github/workflows/python-release.yml
vendored
@ -34,7 +34,7 @@ jobs:
|
|||||||
runs-on: windows-latest
|
runs-on: windows-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python: ["3.7", "3.8", "3.9", "3.10"]
|
python: ["3.7", "3.8", "3.9", "3.10", "3.11"]
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
uses: actions/checkout@v2
|
uses: actions/checkout@v2
|
||||||
|
@ -1,3 +1,7 @@
|
|||||||
|
## [0.13.2]
|
||||||
|
|
||||||
|
- Python only changes.
|
||||||
|
|
||||||
## [0.13.1]
|
## [0.13.1]
|
||||||
|
|
||||||
- [#1072] Fixing Roberta type ids.
|
- [#1072] Fixing Roberta type ids.
|
||||||
|
4
bindings/node/native/Cargo.lock
generated
4
bindings/node/native/Cargo.lock
generated
@ -995,7 +995,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "node"
|
name = "node"
|
||||||
version = "0.8.0"
|
version = "0.13.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"neon",
|
"neon",
|
||||||
"neon-build",
|
"neon-build",
|
||||||
@ -1668,7 +1668,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokenizers"
|
name = "tokenizers"
|
||||||
version = "0.12.1"
|
version = "0.13.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick",
|
"aho-corasick",
|
||||||
"cached-path",
|
"cached-path",
|
||||||
|
@ -227,7 +227,7 @@ fn sequence(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
|||||||
match pretokenizer.downcast::<JsPreTokenizer>().or_throw(&mut cx) {
|
match pretokenizer.downcast::<JsPreTokenizer>().or_throw(&mut cx) {
|
||||||
Ok(pretokenizer) => {
|
Ok(pretokenizer) => {
|
||||||
let guard = cx.lock();
|
let guard = cx.lock();
|
||||||
let pretok = (*pretokenizer.borrow(&guard)).pretok.clone();
|
let pretok = pretokenizer.borrow(&guard).pretok.clone();
|
||||||
if let Some(pretokenizer) = pretok {
|
if let Some(pretokenizer) = pretok {
|
||||||
match pretokenizer {
|
match pretokenizer {
|
||||||
JsPreTokenizerWrapper::Sequence(seq) => sequence.extend(seq),
|
JsPreTokenizerWrapper::Sequence(seq) => sequence.extend(seq),
|
||||||
|
@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [0.13.2]
|
||||||
|
|
||||||
|
- [#1096] Python 3.11 support
|
||||||
|
|
||||||
## [0.13.1]
|
## [0.13.1]
|
||||||
|
|
||||||
- [#1072] Fixing Roberta type ids.
|
- [#1072] Fixing Roberta type ids.
|
||||||
@ -389,7 +393,7 @@ delimiter (Works like `.split(delimiter)`)
|
|||||||
- Fix a bug with the IDs associated with added tokens.
|
- Fix a bug with the IDs associated with added tokens.
|
||||||
- Fix a bug that was causing crashes in Python 3.5
|
- Fix a bug that was causing crashes in Python 3.5
|
||||||
|
|
||||||
|
[#1096]: https://github.com/huggingface/tokenizers/pull/1096
|
||||||
[#1072]: https://github.com/huggingface/tokenizers/pull/1072
|
[#1072]: https://github.com/huggingface/tokenizers/pull/1072
|
||||||
[#956]: https://github.com/huggingface/tokenizers/pull/956
|
[#956]: https://github.com/huggingface/tokenizers/pull/956
|
||||||
[#1008]: https://github.com/huggingface/tokenizers/pull/1008
|
[#1008]: https://github.com/huggingface/tokenizers/pull/1008
|
||||||
|
2
bindings/python/Cargo.lock
generated
2
bindings/python/Cargo.lock
generated
@ -1720,7 +1720,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tokenizers"
|
name = "tokenizers"
|
||||||
version = "0.13.1"
|
version = "0.13.2"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"aho-corasick",
|
"aho-corasick",
|
||||||
"cached-path",
|
"cached-path",
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "tokenizers-python"
|
name = "tokenizers-python"
|
||||||
version = "0.13.1"
|
version = "0.13.2"
|
||||||
authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
|
authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
|
@ -8,7 +8,7 @@ fi
|
|||||||
|
|
||||||
export PATH="$HOME/.cargo/bin:$PATH"
|
export PATH="$HOME/.cargo/bin:$PATH"
|
||||||
|
|
||||||
for PYBIN in /opt/python/cp{37,38,39,310}*/bin; do
|
for PYBIN in /opt/python/cp{37,38,39,310,311}*/bin; do
|
||||||
export PYTHON_SYS_EXECUTABLE="$PYBIN/python"
|
export PYTHON_SYS_EXECUTABLE="$PYBIN/python"
|
||||||
|
|
||||||
"${PYBIN}/pip" install -U setuptools-rust setuptools wheel
|
"${PYBIN}/pip" install -U setuptools-rust setuptools wheel
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
__version__ = "0.13.2.dev0"
|
__version__ = "0.13.2"
|
||||||
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import List, Tuple, Union
|
from typing import List, Tuple, Union
|
||||||
|
@ -9,7 +9,7 @@ extras["dev"] = extras["testing"]
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="tokenizers",
|
name="tokenizers",
|
||||||
version="0.13.2.dev0",
|
version="0.13.2",
|
||||||
description="Fast and Customizable Tokenizers",
|
description="Fast and Customizable Tokenizers",
|
||||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
|
@ -353,7 +353,7 @@ impl PySequenceDecoder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
||||||
PyTuple::new(py, &[PyList::empty(py)])
|
PyTuple::new(py, [PyList::empty(py)])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -360,7 +360,7 @@ impl PySequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
||||||
PyTuple::new(py, &[PyList::empty(py)])
|
PyTuple::new(py, [PyList::empty(py)])
|
||||||
}
|
}
|
||||||
|
|
||||||
fn __len__(&self) -> usize {
|
fn __len__(&self) -> usize {
|
||||||
|
@ -355,7 +355,7 @@ impl PySplit {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
||||||
PyTuple::new(py, &[" ", "removed"])
|
PyTuple::new(py, [" ", "removed"])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -387,7 +387,7 @@ impl PyCharDelimiterSplit {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
||||||
PyTuple::new(py, &[" "])
|
PyTuple::new(py, [" "])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -450,7 +450,7 @@ impl PySequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
||||||
PyTuple::new(py, &[PyList::empty(py)])
|
PyTuple::new(py, [PyList::empty(py)])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -167,7 +167,7 @@ impl PyBertProcessing {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
||||||
PyTuple::new(py, &[("", 0), ("", 0)])
|
PyTuple::new(py, [("", 0), ("", 0)])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -218,7 +218,7 @@ impl PyRobertaProcessing {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
||||||
PyTuple::new(py, &[("", 0), ("", 0)])
|
PyTuple::new(py, [("", 0), ("", 0)])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -441,7 +441,7 @@ impl PySequence {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
|
||||||
PyTuple::new(py, &[PyList::empty(py)])
|
PyTuple::new(py, [PyList::empty(py)])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [0.13.2]
|
||||||
|
|
||||||
|
- Python only changes
|
||||||
|
|
||||||
## [0.13.1]
|
## [0.13.1]
|
||||||
|
|
||||||
- [#1072] Fixing Roberta type ids.
|
- [#1072] Fixing Roberta type ids.
|
||||||
|
@ -99,7 +99,7 @@ impl BertNormalizer {
|
|||||||
let mut new_chars: Vec<(char, isize)> = vec![];
|
let mut new_chars: Vec<(char, isize)> = vec![];
|
||||||
normalized.for_each(|c| {
|
normalized.for_each(|c| {
|
||||||
if is_chinese_char(c) {
|
if is_chinese_char(c) {
|
||||||
new_chars.extend(&[(' ', 0), (c, 1), (' ', 1)]);
|
new_chars.extend([(' ', 0), (c, 1), (' ', 1)]);
|
||||||
} else {
|
} else {
|
||||||
new_chars.push((c, 0));
|
new_chars.push((c, 0));
|
||||||
}
|
}
|
||||||
|
@ -135,7 +135,7 @@ impl PreTokenizer for ByteLevel {
|
|||||||
bytes
|
bytes
|
||||||
.iter()
|
.iter()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(i, b)| (BYTES_CHAR[b], if i > 0 { 1 } else { 0 })),
|
.map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
normalized.transform(transformations.into_iter(), 0);
|
normalized.transform(transformations.into_iter(), 0);
|
||||||
|
@ -167,10 +167,10 @@ impl AddedVocabulary {
|
|||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
let trie = AhoCorasickBuilder::new()
|
let trie = AhoCorasickBuilder::new()
|
||||||
.match_kind(MatchKind::LeftmostLongest)
|
.match_kind(MatchKind::LeftmostLongest)
|
||||||
.build::<_, &&[u8]>(&[]);
|
.build::<_, &&[u8]>([]);
|
||||||
let normalized_trie = AhoCorasickBuilder::new()
|
let normalized_trie = AhoCorasickBuilder::new()
|
||||||
.match_kind(MatchKind::LeftmostLongest)
|
.match_kind(MatchKind::LeftmostLongest)
|
||||||
.build::<_, &&[u8]>(&[]);
|
.build::<_, &&[u8]>([]);
|
||||||
Self {
|
Self {
|
||||||
added_tokens_map: HashMap::new(),
|
added_tokens_map: HashMap::new(),
|
||||||
added_tokens_map_r: HashMap::new(),
|
added_tokens_map_r: HashMap::new(),
|
||||||
|
@ -546,7 +546,7 @@ impl NormalizedString {
|
|||||||
let mut new_chars: Vec<(char, isize)> = vec![];
|
let mut new_chars: Vec<(char, isize)> = vec![];
|
||||||
self.for_each(|c| {
|
self.for_each(|c| {
|
||||||
c.to_lowercase().enumerate().for_each(|(index, c)| {
|
c.to_lowercase().enumerate().for_each(|(index, c)| {
|
||||||
new_chars.push((c, if index > 0 { 1 } else { 0 }));
|
new_chars.push((c, isize::from(index > 0)));
|
||||||
})
|
})
|
||||||
});
|
});
|
||||||
self.transform(new_chars.into_iter(), 0);
|
self.transform(new_chars.into_iter(), 0);
|
||||||
@ -558,7 +558,7 @@ impl NormalizedString {
|
|||||||
let mut new_chars: Vec<(char, isize)> = vec![];
|
let mut new_chars: Vec<(char, isize)> = vec![];
|
||||||
self.for_each(|c| {
|
self.for_each(|c| {
|
||||||
c.to_uppercase().enumerate().for_each(|(index, c)| {
|
c.to_uppercase().enumerate().for_each(|(index, c)| {
|
||||||
new_chars.push((c, if index > 0 { 1 } else { 0 }));
|
new_chars.push((c, isize::from(index > 0)));
|
||||||
})
|
})
|
||||||
});
|
});
|
||||||
self.transform(new_chars.into_iter(), 0);
|
self.transform(new_chars.into_iter(), 0);
|
||||||
|
Reference in New Issue
Block a user