Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-03 11:18:29 +00:00)
Move to maturin, mimicking the move for safetensors. + Rewritten node bindings. (#1331)
* Move to maturin, mimicking the move for `safetensors`.
* Tmp.
* Fix sdist.
* Wat?
* Clippy 1.72.
* Remove if.
* Conda sed.
* Fix doc check workflow.
* Moving to maturin AND removing the http + openssl mess (smoothing the transition to `huggingface_hub`).
* Fix dep.
* Black.
* New node bindings.
* Fix docs + node cache?
* Yarn.
* Working dir.
* Extension module.
* Put back interpreter.
* Remove cache.
* New attempt.
* Multi python.
* Remove FromPretrained.
* Remove traces of `fromPretrained`.
* Drop 3.12 for windows?
* Typo.
* Put back the default feature for ignoring links during the simple test.
* Fix?
* x86_64 -> x64.
* Remove warning for windows bindings.
* Exclude aarch.
* Include/exclude.
* Put back workflows in correct states.
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.13.4"
+version = "0.14.0-dev.0"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2021"
 
@@ -14,19 +14,21 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.7.1"
-pyo3 = "0.19"
+pyo3 = { version = "0.19" }
 numpy = "0.19.0"
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
 itertools = "0.9"
 
 [dependencies.tokenizers]
-version = "*"
+version = "0.14.0-dev.0"
 path = "../../tokenizers"
+default-features = false
+features = ["onig"]
 
 [dev-dependencies]
 tempfile = "3.1"
 pyo3 = { version = "0.19", features = ["auto-initialize"] }
 
 [features]
-default = ["pyo3/extension-module"]
+defaut = ["pyo3/extension-module"]

@@ -1,9 +0,0 @@
-#!/bin/bash
-set -ex
-
-# Create a symlink for tokenizers-lib
-ln -sf ../../tokenizers tokenizers-lib
-# Modify cargo.toml to include this symlink
-sed -i 's/\.\.\/\.\.\/tokenizers/\.\/tokenizers-lib/' Cargo.toml
-# Build the source distribution
-python setup.py sdist

@@ -1,5 +1,3 @@
-__version__ = "0.13.4.rc2"
-
 from enum import Enum
 from typing import List, Tuple, Union
 
@@ -91,6 +89,7 @@ from .tokenizers import (
     pre_tokenizers,
     processors,
     trainers,
+    __version__,
 )
 from .implementations import (
     BertWordPieceTokenizer,

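Note: the hard-coded `__version__` string is gone from `__init__.py`; the version is now re-exported from the compiled `tokenizers.tokenizers` extension (populated from `CARGO_PKG_VERSION` in the `lib.rs` hunk further down). A minimal sketch of what this looks like from Python, assuming a build of these bindings is installed:

import tokenizers

# Both values now come from the Rust crate's version, not a Python string literal.
print(tokenizers.__version__)
assert tokenizers.__version__ == tokenizers.tokenizers.__version__
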
@@ -1,7 +1,54 @@
+[project]
+name = 'tokenizers'
+requires-python = '>=3.7'
+authors = [
+    {name = 'Nicolas Patry', email = 'patry.nicolas@protonmail.com'},
+    {name = 'Anthony Moi', email = 'anthony@huggingface.co'}
+]
+classifiers = [
+    "Development Status :: 5 - Production/Stable",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Education",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+keywords = ["NLP", "tokenizer", "BPE", "transformer", "deep learning"]
+dynamic = [
+    'description',
+    'license',
+    'readme',
+]
+dependencies = ["huggingface_hub>=0.16.4,<0.17"]
+
+[project.urls]
+Homepage = 'https://github.com/huggingface/tokenizers'
+Source = 'https://github.com/huggingface/tokenizers'
+
+
+[project.optional-dependencies]
+testing = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
+docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
+dev = ["tokenizers[testing]"]
+
+
 [build-system]
-requires = ["setuptools", "wheel", "setuptools-rust"]
-build-backend = "setuptools.build_meta"
+requires = ["maturin>=1.0,<2.0"]
+build-backend = "maturin"
+
+[tool.maturin]
+python-source = "py_src"
+module-name = "tokenizers.tokenizers"
+bindings = 'pyo3'
+features = ["pyo3/extension-module"]
 
 [tool.black]
-target-version = ['py35']
 line-length = 119
+target-version = ['py35']

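Note: packaging metadata moves from `setup.py` into the `[project]` table, and the build backend switches from setuptools/setuptools-rust to maturin, which compiles the PyO3 extension as `tokenizers.tokenizers` and picks up the pure-Python layer from `py_src`. A rough sketch of inspecting the resulting metadata, assuming a wheel built from this branch is installed (`importlib.metadata` is standard library, not part of this repo):

from importlib.metadata import metadata, requires, version

meta = metadata("tokenizers")
print(version("tokenizers"))      # with maturin this is typically taken from the Rust crate version
print(meta["Requires-Python"])    # ">=3.7" per the [project] table above
print(requires("tokenizers"))     # includes "huggingface_hub>=0.16.4,<0.17"
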
@@ -1,66 +0,0 @@
-from setuptools import setup
-from setuptools_rust import Binding, RustExtension
-
-
-extras = {}
-extras["testing"] = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
-extras["docs"] = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
-extras["dev"] = extras["testing"]
-
-with open("py_src/tokenizers/__init__.py", "r") as f:
-    version = f.readline().split("=")[-1].strip().strip('"')
-
-setup(
-    name="tokenizers",
-    version=version,
-    description="Fast and Customizable Tokenizers",
-    long_description=open("README.md", "r", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    keywords="NLP tokenizer BPE transformer deep learning",
-    author="Anthony MOI",
-    author_email="anthony@huggingface.co",
-    url="https://github.com/huggingface/tokenizers",
-    license="Apache License 2.0",
-    rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3, debug=False)],
-    extras_require=extras,
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Education",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    ],
-    package_dir={"": "py_src"},
-    packages=[
-        "tokenizers",
-        "tokenizers.models",
-        "tokenizers.decoders",
-        "tokenizers.normalizers",
-        "tokenizers.pre_tokenizers",
-        "tokenizers.processors",
-        "tokenizers.trainers",
-        "tokenizers.implementations",
-        "tokenizers.tools",
-    ],
-    package_data={
-        "tokenizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.models": ["py.typed", "__init__.pyi"],
-        "tokenizers.decoders": ["py.typed", "__init__.pyi"],
-        "tokenizers.normalizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.pre_tokenizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.processors": ["py.typed", "__init__.pyi"],
-        "tokenizers.trainers": ["py.typed", "__init__.pyi"],
-        "tokenizers.implementations": ["py.typed"],
-        "tokenizers.tools": ["py.typed", "visualizer-styles.css"],
-    },
-    zip_safe=False,
-)

@@ -25,7 +25,9 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 // For users using multiprocessing in python, it is quite easy to fork the process running
 // tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
 // we register a callback to be called in the event of a fork so that we can warn the user.
+#[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
+#[cfg(target_family = "unix")]
 extern "C" fn child_after_fork() {
     use tk::parallelism::*;
     if has_parallelism_been_used() && !is_parallelism_configured() {

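Note: the fork callback above only exists to warn when a Python process forks (e.g. via `multiprocessing`) after the Rust thread pool has been used; with this change it is registered on Unix targets only. A small sketch of how users typically configure that behaviour explicitly, assuming the standard `TOKENIZERS_PARALLELISM` environment variable is what `is_parallelism_configured()` checks:

import os

# Declare the parallelism choice up front so the fork warning is not emitted.
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # or "true"

import tokenizers  # noqa: E402  (imported after setting the environment variable)
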
@@ -70,5 +72,6 @@ pub fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_wrapped(wrap_pymodule!(processors::processors))?;
     m.add_wrapped(wrap_pymodule!(normalizers::normalizers))?;
     m.add_wrapped(wrap_pymodule!(trainers::trainers))?;
+    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
     Ok(())
 }

@@ -4,6 +4,7 @@ use std::hash::{Hash, Hasher};
 use numpy::{npyffi, PyArray1};
 use pyo3::class::basic::CompareOp;
 use pyo3::exceptions;
+use pyo3::intern;
 use pyo3::prelude::*;
 use pyo3::types::*;
 use pyo3::AsPyPointer;

@@ -566,17 +567,23 @@ impl PyTokenizer {
         revision: String,
         auth_token: Option<String>,
     ) -> PyResult<Self> {
-        let params = tk::FromPretrainedParameters {
-            revision,
-            auth_token,
-            user_agent: [("bindings", "Python"), ("version", crate::VERSION)]
-                .iter()
-                .map(|(k, v)| (k.to_string(), v.to_string()))
-                .collect(),
-        };
+        let path = Python::with_gil(|py| -> PyResult<String> {
+            let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?;
+            let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?;
+            let kwargs = [
+                (intern!(py, "repo_id"), identifier),
+                (intern!(py, "filename"), "tokenizer.json"),
+                (intern!(py, "revision"), &revision),
+            ]
+            .into_py_dict(py);
+            if let Some(auth_token) = auth_token {
+                kwargs.set_item(intern!(py, "token"), auth_token)?;
+            }
+            let path: String = hf_hub_download.call((), Some(kwargs))?.extract()?;
+            Ok(path)
+        })?;
 
-        let tokenizer: PyResult<_> =
-            ToPyResult(Tokenizer::from_pretrained(identifier, Some(params))).into();
+        let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
         Ok(Self::new(tokenizer?))
     }
 
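Note: `Tokenizer.from_pretrained` in the bindings no longer relies on the crate's built-in HTTP client (the removed `tk::FromPretrainedParameters` path); it now calls back into `huggingface_hub.hf_hub_download` and loads the downloaded `tokenizer.json` with `Tokenizer::from_file`. A rough Python-side equivalent of what the binding does, with an example repo id:

from huggingface_hub import hf_hub_download
from tokenizers import Tokenizer

# Download tokenizer.json from the Hub, then load it from the local path.
path = hf_hub_download(
    repo_id="bert-base-uncased",  # example identifier
    filename="tokenizer.json",
    revision="main",
    token=None,                   # set a token string for private repositories
)
tokenizer = Tokenizer.from_file(path)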