mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Add python 3.11 to manylinux buildwheels (#1096)
* Add python 3.11 to manylinux buildwheels
* Fixing clippy.
* Node clippy.
* Python clippy.
* Changelog + version number update.

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
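The clippy items in this commit message boil down to two recurring patterns visible in the hunks below: replacing `if cond { 1 } else { 0 }` with `isize::from(cond)`, and passing array literals by value instead of by reference (`extend([...])` rather than `extend(&[...])`). A minimal, self-contained sketch of both idioms; the `char_offsets` helper is purely illustrative and not part of the repository:

```rust
// Hedged sketch only: `char_offsets` is an illustrative helper, not code from
// this repository. It demonstrates the two clippy idioms applied in the diff.
fn char_offsets(chars: &[char]) -> Vec<(char, isize)> {
    chars
        .iter()
        .enumerate()
        // before: .map(|(i, &c)| (c, if i > 0 { 1 } else { 0 }))
        .map(|(i, &c)| (c, isize::from(i > 0)))
        .collect()
}

fn main() {
    let mut new_chars: Vec<(char, isize)> = vec![(' ', 0)];
    // before: new_chars.extend(&[('a', 1), (' ', 1)]);  // needless borrow
    new_chars.extend([('a', 1), (' ', 1)]);
    new_chars.extend(char_offsets(&['b', 'c']));
    assert_eq!(new_chars.last(), Some(&('c', 1)));
    println!("{new_chars:?}");
}
```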
7 .github/workflows/python-release-conda.yml (vendored)
@@ -14,10 +14,9 @@ jobs:
     strategy:
       matrix:
         os: [windows-latest, macos-latest]
-        python: ["3.7", "3.8", "3.9"]
-        # 3.10 Not yet available on Conda.
-        # python: ["3.7", "3.8", "3.9", "3.10"]
-
+        python: ["3.7", "3.8", "3.9", "3.10"]
+        # 3.11 Not yet available on Conda.
+        # python: ["3.7", "3.8", "3.9", "3.10", "3.11"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
2 .github/workflows/python-release.yml (vendored)
@@ -34,7 +34,7 @@ jobs:
     runs-on: windows-latest
     strategy:
       matrix:
-        python: ["3.7", "3.8", "3.9", "3.10"]
+        python: ["3.7", "3.8", "3.9", "3.10", "3.11"]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v2
@@ -1,3 +1,7 @@
+## [0.13.2]
+
+- Python only changes.
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
4 bindings/node/native/Cargo.lock (generated)
@@ -995,7 +995,7 @@ dependencies = [

 [[package]]
 name = "node"
-version = "0.8.0"
+version = "0.13.2"
 dependencies = [
  "neon",
  "neon-build",
@@ -1668,7 +1668,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"

 [[package]]
 name = "tokenizers"
-version = "0.12.1"
+version = "0.13.2"
 dependencies = [
  "aho-corasick",
  "cached-path",
@@ -227,7 +227,7 @@ fn sequence(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
         match pretokenizer.downcast::<JsPreTokenizer>().or_throw(&mut cx) {
             Ok(pretokenizer) => {
                 let guard = cx.lock();
-                let pretok = (*pretokenizer.borrow(&guard)).pretok.clone();
+                let pretok = pretokenizer.borrow(&guard).pretok.clone();
                 if let Some(pretokenizer) = pretok {
                     match pretokenizer {
                         JsPreTokenizerWrapper::Sequence(seq) => sequence.extend(seq),
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.13.2]
+
+- [#1096] Python 3.11 support
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
@@ -389,7 +393,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug with the IDs associated with added tokens.
 - Fix a bug that was causing crashes in Python 3.5

-
+[#1096]: https://github.com/huggingface/tokenizers/pull/1096
 [#1072]: https://github.com/huggingface/tokenizers/pull/1072
 [#956]: https://github.com/huggingface/tokenizers/pull/956
 [#1008]: https://github.com/huggingface/tokenizers/pull/1008
2 bindings/python/Cargo.lock (generated)
@@ -1720,7 +1720,7 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c"

 [[package]]
 name = "tokenizers"
-version = "0.13.1"
+version = "0.13.2"
 dependencies = [
  "aho-corasick",
  "cached-path",
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.13.1"
+version = "0.13.2"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2021"

@@ -8,7 +8,7 @@ fi

 export PATH="$HOME/.cargo/bin:$PATH"

-for PYBIN in /opt/python/cp{37,38,39,310}*/bin; do
+for PYBIN in /opt/python/cp{37,38,39,310,311}*/bin; do
     export PYTHON_SYS_EXECUTABLE="$PYBIN/python"

     "${PYBIN}/pip" install -U setuptools-rust setuptools wheel
@@ -1,4 +1,4 @@
-__version__ = "0.13.2.dev0"
+__version__ = "0.13.2"

 from enum import Enum
 from typing import List, Tuple, Union
@@ -9,7 +9,7 @@ extras["dev"] = extras["testing"]

 setup(
     name="tokenizers",
-    version="0.13.2.dev0",
+    version="0.13.2",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
@@ -353,7 +353,7 @@ impl PySequenceDecoder {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

@@ -360,7 +360,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }

     fn __len__(&self) -> usize {
@@ -355,7 +355,7 @@ impl PySplit {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[" ", "removed"])
+        PyTuple::new(py, [" ", "removed"])
     }
 }

@@ -387,7 +387,7 @@ impl PyCharDelimiterSplit {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[" "])
+        PyTuple::new(py, [" "])
     }
 }

@@ -450,7 +450,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

@@ -167,7 +167,7 @@ impl PyBertProcessing {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[("", 0), ("", 0)])
+        PyTuple::new(py, [("", 0), ("", 0)])
     }
 }

@@ -218,7 +218,7 @@ impl PyRobertaProcessing {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[("", 0), ("", 0)])
+        PyTuple::new(py, [("", 0), ("", 0)])
     }
 }

@@ -441,7 +441,7 @@ impl PySequence {
     }

     fn __getnewargs__<'p>(&self, py: Python<'p>) -> &'p PyTuple {
-        PyTuple::new(py, &[PyList::empty(py)])
+        PyTuple::new(py, [PyList::empty(py)])
     }
 }

@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [0.13.2]
+
+- Python only changes
+
 ## [0.13.1]

 - [#1072] Fixing Roberta type ids.
@@ -99,7 +99,7 @@ impl BertNormalizer {
         let mut new_chars: Vec<(char, isize)> = vec![];
         normalized.for_each(|c| {
             if is_chinese_char(c) {
-                new_chars.extend(&[(' ', 0), (c, 1), (' ', 1)]);
+                new_chars.extend([(' ', 0), (c, 1), (' ', 1)]);
             } else {
                 new_chars.push((c, 0));
             }
@@ -135,7 +135,7 @@ impl PreTokenizer for ByteLevel {
                    bytes
                        .iter()
                        .enumerate()
-                        .map(|(i, b)| (BYTES_CHAR[b], if i > 0 { 1 } else { 0 })),
+                        .map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))),
                );
            }
            normalized.transform(transformations.into_iter(), 0);
@@ -167,10 +167,10 @@ impl AddedVocabulary {
     pub fn new() -> Self {
         let trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build::<_, &&[u8]>(&[]);
+            .build::<_, &&[u8]>([]);
         let normalized_trie = AhoCorasickBuilder::new()
             .match_kind(MatchKind::LeftmostLongest)
-            .build::<_, &&[u8]>(&[]);
+            .build::<_, &&[u8]>([]);
         Self {
             added_tokens_map: HashMap::new(),
             added_tokens_map_r: HashMap::new(),
@@ -546,7 +546,7 @@ impl NormalizedString {
         let mut new_chars: Vec<(char, isize)> = vec![];
         self.for_each(|c| {
             c.to_lowercase().enumerate().for_each(|(index, c)| {
-                new_chars.push((c, if index > 0 { 1 } else { 0 }));
+                new_chars.push((c, isize::from(index > 0)));
             })
         });
         self.transform(new_chars.into_iter(), 0);
@@ -558,7 +558,7 @@ impl NormalizedString {
         let mut new_chars: Vec<(char, isize)> = vec![];
         self.for_each(|c| {
             c.to_uppercase().enumerate().for_each(|(index, c)| {
-                new_chars.push((c, if index > 0 { 1 } else { 0 }));
+                new_chars.push((c, isize::from(index > 0)));
             })
         });
         self.transform(new_chars.into_iter(), 0);