Move to maturin, mimicking the move for safetensors. + Rewritten node bindings. (#1331)

* Move to maturin, mimicking the move for `safetensors`.

* Tmp.

* Fix sdist.

* Wat?

* Clippy 1.72

* Remove if.

* Conda sed.

* Fix doc check workflow.

* Moving to maturin AND removing the http + openssl mess (smoothing the transition to `huggingface_hub`).

* Fix dep

* Black.

* New node bindings.

* Fix docs + node cache ?

* Yarn.

* Working dir.

* Extension module.

* Put back interpreter.

* Remove cache.

* New attempt

* Multi python.

* Remove FromPretrained.

* Remove traces of `fromPretrained`.

* Drop 3.12 for windows?

* Typo.

* Put back the default feature for ignoring links during simple test.

* Fix ?

* x86_64 -> x64.

* Remove warning for windows bindings.

* Exclude aarch.

* Include/exclude.

* Put back workflows in correct states.
Author: Nicolas Patry
Date: 2023-08-28 16:24:14 +02:00
Committed by: GitHub
Parent: f2952020d5
Commit: d2010d5165
155 changed files with 12988 additions and 16409 deletions

File: bindings/python/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.13.4"
+version = "0.14.0-dev.0"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2021"
@@ -14,19 +14,21 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.7.1"
-pyo3 = "0.19"
+pyo3 = { version = "0.19" }
 numpy = "0.19.0"
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
 itertools = "0.9"
 [dependencies.tokenizers]
-version = "*"
+version = "0.14.0-dev.0"
 path = "../../tokenizers"
+default-features = false
+features = ["onig"]
 [dev-dependencies]
 tempfile = "3.1"
 pyo3 = { version = "0.19", features = ["auto-initialize"] }
 [features]
-default = ["pyo3/extension-module"]
+defaut = ["pyo3/extension-module"]

File: sdist build script under bindings/python (deleted)

@@ -1,9 +0,0 @@
-#!/bin/bash
-set -ex
-# Create a symlink for tokenizers-lib
-ln -sf ../../tokenizers tokenizers-lib
-# Modify cargo.toml to include this symlink
-sed -i 's/\.\.\/\.\.\/tokenizers/\.\/tokenizers-lib/' Cargo.toml
-# Build the source distribution
-python setup.py sdist

File: bindings/python/py_src/tokenizers/__init__.py

@@ -1,5 +1,3 @@
-__version__ = "0.13.4.rc2"
 from enum import Enum
 from typing import List, Tuple, Union
@@ -91,6 +89,7 @@ from .tokenizers import (
     pre_tokenizers,
     processors,
     trainers,
+    __version__,
 )
 from .implementations import (
     BertWordPieceTokenizer,

File: bindings/python/pyproject.toml

@@ -1,7 +1,54 @@
+[project]
+name = 'tokenizers'
+requires-python = '>=3.7'
+authors = [
+{name = 'Nicolas Patry', email = 'patry.nicolas@protonmail.com'},
+{name = 'Anthony Moi', email = 'anthony@huggingface.co'}
+]
+classifiers = [
+"Development Status :: 5 - Production/Stable",
+"Intended Audience :: Developers",
+"Intended Audience :: Education",
+"Intended Audience :: Science/Research",
+"License :: OSI Approved :: Apache Software License",
+"Operating System :: OS Independent",
+"Programming Language :: Python :: 3",
+"Programming Language :: Python :: 3.7",
+"Programming Language :: Python :: 3.8",
+"Programming Language :: Python :: 3.9",
+"Programming Language :: Python :: 3.10",
+"Programming Language :: Python :: 3.11",
+"Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+keywords = ["NLP", "tokenizer", "BPE", "transformer", "deep learning"]
+dynamic = [
+'description',
+'license',
+'readme',
+]
+dependencies = ["huggingface_hub>=0.16.4,<0.17"]
+[project.urls]
+Homepage = 'https://github.com/huggingface/tokenizers'
+Source = 'https://github.com/huggingface/tokenizers'
+[project.optional-dependencies]
+testing = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
+docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
+dev = ["tokenizers[testing]"]
 [build-system]
-requires = ["setuptools", "wheel", "setuptools-rust"]
-build-backend = "setuptools.build_meta"
+requires = ["maturin>=1.0,<2.0"]
+build-backend = "maturin"
+[tool.maturin]
+python-source = "py_src"
+module-name = "tokenizers.tokenizers"
+bindings = 'pyo3'
+features = ["pyo3/extension-module"]
 [tool.black]
-target-version = ['py35']
 line-length = 119
+target-version = ['py35']
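Note on the `[tool.maturin]` keys above: `python-source = "py_src"` ships the pure-Python package from `py_src/`, while `module-name = "tokenizers.tokenizers"` builds the pyo3 extension as a submodule inside it, which the `__init__.py` hunk above then re-exports. A minimal sketch of the resulting import layout (assumes an installed build of the bindings; the `RustTokenizer` alias is only illustrative):

# The compiled pyo3 module lives at tokenizers.tokenizers; the pure-Python
# package from py_src/ re-exports its contents in its __init__.py.
from tokenizers.tokenizers import Tokenizer as RustTokenizer  # compiled extension
from tokenizers import Tokenizer                              # re-export from py_src

assert Tokenizer is RustTokenizer  # same class object, just re-exported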

File: bindings/python/setup.py (deleted)

@@ -1,66 +0,0 @@
-from setuptools import setup
-from setuptools_rust import Binding, RustExtension
-extras = {}
-extras["testing"] = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
-extras["docs"] = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
-extras["dev"] = extras["testing"]
-with open("py_src/tokenizers/__init__.py", "r") as f:
-    version = f.readline().split("=")[-1].strip().strip('"')
-setup(
-    name="tokenizers",
-    version=version,
-    description="Fast and Customizable Tokenizers",
-    long_description=open("README.md", "r", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    keywords="NLP tokenizer BPE transformer deep learning",
-    author="Anthony MOI",
-    author_email="anthony@huggingface.co",
-    url="https://github.com/huggingface/tokenizers",
-    license="Apache License 2.0",
-    rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3, debug=False)],
-    extras_require=extras,
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Education",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    ],
-    package_dir={"": "py_src"},
-    packages=[
-        "tokenizers",
-        "tokenizers.models",
-        "tokenizers.decoders",
-        "tokenizers.normalizers",
-        "tokenizers.pre_tokenizers",
-        "tokenizers.processors",
-        "tokenizers.trainers",
-        "tokenizers.implementations",
-        "tokenizers.tools",
-    ],
-    package_data={
-        "tokenizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.models": ["py.typed", "__init__.pyi"],
-        "tokenizers.decoders": ["py.typed", "__init__.pyi"],
-        "tokenizers.normalizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.pre_tokenizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.processors": ["py.typed", "__init__.pyi"],
-        "tokenizers.trainers": ["py.typed", "__init__.pyi"],
-        "tokenizers.implementations": ["py.typed"],
-        "tokenizers.tools": ["py.typed", "visualizer-styles.css"],
-    },
-    zip_safe=False,
-)

File: bindings/python/src/lib.rs

@@ -25,7 +25,9 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 // For users using multiprocessing in python, it is quite easy to fork the process running
 // tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
 // we register a callback to be called in the event of a fork so that we can warn the user.
+#[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
+#[cfg(target_family = "unix")]
 extern "C" fn child_after_fork() {
     use tk::parallelism::*;
     if has_parallelism_been_used() && !is_parallelism_configured() {
@@ -70,5 +72,6 @@ pub fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_wrapped(wrap_pymodule!(processors::processors))?;
     m.add_wrapped(wrap_pymodule!(normalizers::normalizers))?;
     m.add_wrapped(wrap_pymodule!(trainers::trainers))?;
+    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
     Ok(())
 }
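With the second hunk above, the version string comes from `env!("CARGO_PKG_VERSION")` in the compiled module instead of being hard-coded in `py_src/tokenizers/__init__.py`. A quick check from Python (assumes the bindings are installed; the value shown is this commit's Cargo version):

import tokenizers
from tokenizers.tokenizers import __version__ as rust_version

# __version__ is re-exported by the Python package (see the __init__.py hunk above),
# so both names resolve to the version declared in bindings/python/Cargo.toml.
print(tokenizers.__version__)                  # "0.14.0-dev.0" at this commit
print(rust_version == tokenizers.__version__)  # True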

File: bindings/python/src/tokenizer.rs

@@ -4,6 +4,7 @@ use std::hash::{Hash, Hasher};
 use numpy::{npyffi, PyArray1};
 use pyo3::class::basic::CompareOp;
 use pyo3::exceptions;
+use pyo3::intern;
 use pyo3::prelude::*;
 use pyo3::types::*;
 use pyo3::AsPyPointer;
@@ -566,17 +567,23 @@ impl PyTokenizer {
         revision: String,
         auth_token: Option<String>,
     ) -> PyResult<Self> {
-        let params = tk::FromPretrainedParameters {
-            revision,
-            auth_token,
-            user_agent: [("bindings", "Python"), ("version", crate::VERSION)]
-                .iter()
-                .map(|(k, v)| (k.to_string(), v.to_string()))
-                .collect(),
-        };
+        let path = Python::with_gil(|py| -> PyResult<String> {
+            let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?;
+            let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?;
+            let kwargs = [
+                (intern!(py, "repo_id"), identifier),
+                (intern!(py, "filename"), "tokenizer.json"),
+                (intern!(py, "revision"), &revision),
+            ]
+            .into_py_dict(py);
+            if let Some(auth_token) = auth_token {
+                kwargs.set_item(intern!(py, "token"), auth_token)?;
+            }
+            let path: String = hf_hub_download.call((), Some(kwargs))?.extract()?;
+            Ok(path)
+        })?;
-        let tokenizer: PyResult<_> =
-            ToPyResult(Tokenizer::from_pretrained(identifier, Some(params))).into();
+        let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
         Ok(Self::new(tokenizer?))
     }
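The rewritten `from_pretrained` above drops the Rust-side HTTP path (`tk::FromPretrainedParameters` / `Tokenizer::from_pretrained`) and instead calls into the `huggingface_hub` Python package before loading the file locally. A rough Python equivalent of what the binding now does, using a hypothetical repo id for illustration; the `token` kwarg is only set when an auth token is supplied, matching the `if let Some(auth_token)` branch:

from huggingface_hub import hf_hub_download
from tokenizers import Tokenizer

# Mirror the kwargs built in the Rust code above: repo_id, filename, revision.
path = hf_hub_download(
    repo_id="bert-base-uncased",  # example repo, not taken from the diff
    filename="tokenizer.json",
    revision="main",
)
tokenizer = Tokenizer.from_file(path)  # replaces the Rust-side Tokenizer::from_pretrained
print(tokenizer.encode("Hello world").tokens)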