Move to maturin, mimicking the move for safetensors. + Rewritten node bindings. (#1331)

* Move to maturin, mimicking the move for `safetensors`.

* Tmp.

* Fix sdist.

* Wat?

* Clippy 1.72

* Remove if.

* Conda sed.

* Fix doc check workflow.

* Moving to maturin AND removing the http + openssl mess (smoothing the transition to `huggingface_hub`).

* Fix dep

* Black.

* New node bindings.

* Fix docs + node cache ?

* Yarn.

* Working dir.

* Extension module.

* Put back interpreter.

* Remove cache.

* New attempt

* Multi python.

* Remove FromPretrained.

* Remove traces of `fromPretrained`.

* Drop 3.12 for windows?

* Typo.

* Put back the default feature for ignoring links during simple test.

* Fix ?

* x86_64 -> x64.

* Remove warning for windows bindings.

* Exclude aarch.

* Include/exclude.

* Put back workflows in correct states.
Author: Nicolas Patry
Date: 2023-08-28 16:24:14 +02:00
Committed by: GitHub
Parent: f2952020d5
Commit: d2010d5165
155 changed files with 12988 additions and 16409 deletions

File: bindings/python/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.13.4"
+version = "0.14.0-dev.0"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2021"
@@ -14,19 +14,21 @@ serde = { version = "1.0", features = [ "rc", "derive" ]}
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.7.1"
-pyo3 = "0.19"
+pyo3 = { version = "0.19" }
 numpy = "0.19.0"
 ndarray = "0.13"
 onig = { version = "6.0", default-features = false }
 itertools = "0.9"
 [dependencies.tokenizers]
-version = "*"
+version = "0.14.0-dev.0"
 path = "../../tokenizers"
+default-features = false
+features = ["onig"]
 [dev-dependencies]
 tempfile = "3.1"
 pyo3 = { version = "0.19", features = ["auto-initialize"] }
 [features]
-default = ["pyo3/extension-module"]
+defaut = ["pyo3/extension-module"]

File: sdist build script under bindings/python (deleted)

@@ -1,9 +0,0 @@
-#!/bin/bash
-set -ex
-# Create a symlink for tokenizers-lib
-ln -sf ../../tokenizers tokenizers-lib
-# Modify cargo.toml to include this symlink
-sed -i 's/\.\.\/\.\.\/tokenizers/\.\/tokenizers-lib/' Cargo.toml
-# Build the source distribution
-python setup.py sdist

File: bindings/python/py_src/tokenizers/__init__.py

@@ -1,5 +1,3 @@
-__version__ = "0.13.4.rc2"
 from enum import Enum
 from typing import List, Tuple, Union
@@ -91,6 +89,7 @@ from .tokenizers import (
     pre_tokenizers,
     processors,
     trainers,
+    __version__,
 )
 from .implementations import (
     BertWordPieceTokenizer,

File: bindings/python/pyproject.toml

@@ -1,7 +1,54 @@
+[project]
+name = 'tokenizers'
+requires-python = '>=3.7'
+authors = [
+{name = 'Nicolas Patry', email = 'patry.nicolas@protonmail.com'},
+{name = 'Anthony Moi', email = 'anthony@huggingface.co'}
+]
+classifiers = [
+"Development Status :: 5 - Production/Stable",
+"Intended Audience :: Developers",
+"Intended Audience :: Education",
+"Intended Audience :: Science/Research",
+"License :: OSI Approved :: Apache Software License",
+"Operating System :: OS Independent",
+"Programming Language :: Python :: 3",
+"Programming Language :: Python :: 3.7",
+"Programming Language :: Python :: 3.8",
+"Programming Language :: Python :: 3.9",
+"Programming Language :: Python :: 3.10",
+"Programming Language :: Python :: 3.11",
+"Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+keywords = ["NLP", "tokenizer", "BPE", "transformer", "deep learning"]
+dynamic = [
+'description',
+'license',
+'readme',
+]
+dependencies = ["huggingface_hub>=0.16.4,<0.17"]
+[project.urls]
+Homepage = 'https://github.com/huggingface/tokenizers'
+Source = 'https://github.com/huggingface/tokenizers'
+[project.optional-dependencies]
+testing = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
+docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
+dev = ["tokenizers[testing]"]
 [build-system]
-requires = ["setuptools", "wheel", "setuptools-rust"]
-build-backend = "setuptools.build_meta"
+requires = ["maturin>=1.0,<2.0"]
+build-backend = "maturin"
+[tool.maturin]
+python-source = "py_src"
+module-name = "tokenizers.tokenizers"
+bindings = 'pyo3'
+features = ["pyo3/extension-module"]
 [tool.black]
-target-version = ['py35']
 line-length = 119
+target-version = ['py35']
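Note on the `[tool.maturin]` keys above: `python-source = "py_src"` ships the pure-Python package from `py_src/`, while `module-name = "tokenizers.tokenizers"` builds the pyo3 extension as a submodule inside it, which the `__init__.py` hunk above then re-exports. A minimal sketch of the resulting import layout (assumes an installed build of the bindings; the `RustTokenizer` alias is only illustrative):

# The compiled pyo3 module lives at tokenizers.tokenizers; the pure-Python
# package from py_src/ re-exports its contents in its __init__.py.
from tokenizers.tokenizers import Tokenizer as RustTokenizer  # compiled extension
from tokenizers import Tokenizer                              # re-export from py_src

assert Tokenizer is RustTokenizer  # same class object, just re-exported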

File: bindings/python/setup.py (deleted)

@@ -1,66 +0,0 @@
-from setuptools import setup
-from setuptools_rust import Binding, RustExtension
-extras = {}
-extras["testing"] = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
-extras["docs"] = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
-extras["dev"] = extras["testing"]
-with open("py_src/tokenizers/__init__.py", "r") as f:
-    version = f.readline().split("=")[-1].strip().strip('"')
-setup(
-    name="tokenizers",
-    version=version,
-    description="Fast and Customizable Tokenizers",
-    long_description=open("README.md", "r", encoding="utf-8").read(),
-    long_description_content_type="text/markdown",
-    keywords="NLP tokenizer BPE transformer deep learning",
-    author="Anthony MOI",
-    author_email="anthony@huggingface.co",
-    url="https://github.com/huggingface/tokenizers",
-    license="Apache License 2.0",
-    rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3, debug=False)],
-    extras_require=extras,
-    classifiers=[
-        "Development Status :: 5 - Production/Stable",
-        "Intended Audience :: Developers",
-        "Intended Audience :: Education",
-        "Intended Audience :: Science/Research",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: OS Independent",
-        "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.5",
-        "Programming Language :: Python :: 3.6",
-        "Programming Language :: Python :: 3.7",
-        "Programming Language :: Python :: 3.8",
-        "Programming Language :: Python :: 3.9",
-        "Programming Language :: Python :: 3.10",
-        "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    ],
-    package_dir={"": "py_src"},
-    packages=[
-        "tokenizers",
-        "tokenizers.models",
-        "tokenizers.decoders",
-        "tokenizers.normalizers",
-        "tokenizers.pre_tokenizers",
-        "tokenizers.processors",
-        "tokenizers.trainers",
-        "tokenizers.implementations",
-        "tokenizers.tools",
-    ],
-    package_data={
-        "tokenizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.models": ["py.typed", "__init__.pyi"],
-        "tokenizers.decoders": ["py.typed", "__init__.pyi"],
-        "tokenizers.normalizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.pre_tokenizers": ["py.typed", "__init__.pyi"],
-        "tokenizers.processors": ["py.typed", "__init__.pyi"],
-        "tokenizers.trainers": ["py.typed", "__init__.pyi"],
-        "tokenizers.implementations": ["py.typed"],
-        "tokenizers.tools": ["py.typed", "visualizer-styles.css"],
-    },
-    zip_safe=False,
-)

File: bindings/python/src/lib.rs

@@ -25,7 +25,9 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 // For users using multiprocessing in python, it is quite easy to fork the process running
 // tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
 // we register a callback to be called in the event of a fork so that we can warn the user.
+#[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
+#[cfg(target_family = "unix")]
 extern "C" fn child_after_fork() {
     use tk::parallelism::*;
     if has_parallelism_been_used() && !is_parallelism_configured() {
@@ -70,5 +72,6 @@ pub fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_wrapped(wrap_pymodule!(processors::processors))?;
     m.add_wrapped(wrap_pymodule!(normalizers::normalizers))?;
     m.add_wrapped(wrap_pymodule!(trainers::trainers))?;
+    m.add("__version__", env!("CARGO_PKG_VERSION"))?;
     Ok(())
 }
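With the second hunk above, the version string comes from `env!("CARGO_PKG_VERSION")` in the compiled module instead of being hard-coded in `py_src/tokenizers/__init__.py`. A quick check from Python (assumes the bindings are installed; the value shown is this commit's Cargo version):

import tokenizers
from tokenizers.tokenizers import __version__ as rust_version

# __version__ is re-exported by the Python package (see the __init__.py hunk above),
# so both names resolve to the version declared in bindings/python/Cargo.toml.
print(tokenizers.__version__)                  # "0.14.0-dev.0" at this commit
print(rust_version == tokenizers.__version__)  # True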

File: bindings/python/src/tokenizer.rs

@@ -4,6 +4,7 @@ use std::hash::{Hash, Hasher};
 use numpy::{npyffi, PyArray1};
 use pyo3::class::basic::CompareOp;
 use pyo3::exceptions;
+use pyo3::intern;
 use pyo3::prelude::*;
 use pyo3::types::*;
 use pyo3::AsPyPointer;
@@ -566,17 +567,23 @@ impl PyTokenizer {
         revision: String,
         auth_token: Option<String>,
     ) -> PyResult<Self> {
-        let params = tk::FromPretrainedParameters {
-            revision,
-            auth_token,
-            user_agent: [("bindings", "Python"), ("version", crate::VERSION)]
-                .iter()
-                .map(|(k, v)| (k.to_string(), v.to_string()))
-                .collect(),
-        };
+        let path = Python::with_gil(|py| -> PyResult<String> {
+            let huggingface_hub = PyModule::import(py, intern!(py, "huggingface_hub"))?;
+            let hf_hub_download = huggingface_hub.getattr(intern!(py, "hf_hub_download"))?;
+            let kwargs = [
+                (intern!(py, "repo_id"), identifier),
+                (intern!(py, "filename"), "tokenizer.json"),
+                (intern!(py, "revision"), &revision),
+            ]
+            .into_py_dict(py);
+            if let Some(auth_token) = auth_token {
+                kwargs.set_item(intern!(py, "token"), auth_token)?;
+            }
+            let path: String = hf_hub_download.call((), Some(kwargs))?.extract()?;
+            Ok(path)
+        })?;
-        let tokenizer: PyResult<_> =
-            ToPyResult(Tokenizer::from_pretrained(identifier, Some(params))).into();
+        let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
         Ok(Self::new(tokenizer?))
     }
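The rewritten `from_pretrained` above drops the Rust-side HTTP path (`tk::FromPretrainedParameters` / `Tokenizer::from_pretrained`) and instead calls into the `huggingface_hub` Python package before loading the file locally. A rough Python equivalent of what the binding now does, using a hypothetical repo id for illustration; the `token` kwarg is only set when an auth token is supplied, matching the `if let Some(auth_token)` branch:

from huggingface_hub import hf_hub_download
from tokenizers import Tokenizer

# Mirror the kwargs built in the Rust code above: repo_id, filename, revision.
path = hf_hub_download(
    repo_id="bert-base-uncased",  # example repo, not taken from the diff
    filename="tokenizer.json",
    revision="main",
)
tokenizer = Tokenizer.from_file(path)  # replaces the Rust-side Tokenizer::from_pretrained
print(tokenizer.encode("Hello world").tokens)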