Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
* add a way to specify the unknown token in `SentencePieceUnigramTokenizer`
* add a test that verifies an exception is raised when the unknown token is missing
* style
* add test tokens
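For context, the change summarized above exposes the unknown token when training a `SentencePieceUnigramTokenizer`. Below is a minimal sketch of how that option can be used; the `unk_token` keyword on `train_from_iterator`, the other keyword arguments, and the tiny in-memory corpus are assumptions for illustration and do not appear in this file:

from tokenizers import SentencePieceUnigramTokenizer

tokenizer = SentencePieceUnigramTokenizer()

# Train on a small in-memory corpus and declare the unknown token explicitly.
# According to the commit message, leaving the unknown token unset and then
# encountering out-of-vocabulary input is expected to raise an exception.
tokenizer.train_from_iterator(
    ["a first sentence", "another sentence", "and a last one"],
    vocab_size=100,
    special_tokens=["<unk>"],
    unk_token="<unk>",
)

print(tokenizer.encode("a sentence").tokens)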
59 lines
2.2 KiB
Python
from setuptools import setup
from setuptools_rust import Binding, RustExtension

extras = {}
extras["testing"] = ["pytest", "requests", "numpy", "datasets"]

setup(
    name="tokenizers",
    version="0.10.3",
    description="Fast and Customizable Tokenizers",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    keywords="NLP tokenizer BPE transformer deep learning",
    author="Anthony MOI",
    author_email="anthony@huggingface.co",
    url="https://github.com/huggingface/tokenizers",
    license="Apache License 2.0",
    rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3, debug=False)],
    extras_require=extras,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    package_dir={"": "py_src"},
    packages=[
        "tokenizers",
        "tokenizers.models",
        "tokenizers.decoders",
        "tokenizers.normalizers",
        "tokenizers.pre_tokenizers",
        "tokenizers.processors",
        "tokenizers.trainers",
        "tokenizers.implementations",
        "tokenizers.tools",
    ],
    package_data={
        "tokenizers": ["py.typed", "__init__.pyi"],
        "tokenizers.models": ["py.typed", "__init__.pyi"],
        "tokenizers.decoders": ["py.typed", "__init__.pyi"],
        "tokenizers.normalizers": ["py.typed", "__init__.pyi"],
        "tokenizers.pre_tokenizers": ["py.typed", "__init__.pyi"],
        "tokenizers.processors": ["py.typed", "__init__.pyi"],
        "tokenizers.trainers": ["py.typed", "__init__.pyi"],
        "tokenizers.implementations": ["py.typed"],
        "tokenizers.tools": ["py.typed", "visualizer-styles.css"],
    },
    zip_safe=False,
)
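The `rust_extensions` entry ties the Python package to the compiled PyO3 module `tokenizers.tokenizers`. A quick sanity check after building and installing the package (for example with `pip install -e .` next to this setup.py; the install command is an assumption, not stated in the file):

# Minimal post-install check: the compiled extension must import cleanly.
import tokenizers

print(tokenizers.__version__)     # expected to match the version above: 0.10.3
from tokenizers import Tokenizer  # re-exported from the PyO3 module tokenizers.tokenizers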