Mirror of https://github.com/mii443/tokenizers.git
* Removing all pre_tokenizer logic from the Unigram algorithm.
* Improving the parity check *a lot*.
  - We can now detect a lot more errors.
  - Special cases have been added temporarily.
* Adding 2 new normalizers that mimic spm's default behavior.
* Adding an `encoding_optimized` version of the `encode` algorithm.
  - Removes the Lattice allocation.
  - Changes the trie `common_prefix_search` to return an iterator, avoiding allocation of the full results.
* Trie<char> -> Trie<u8>: another speed improvement.
* [WIP] Attempt to create a Precompiled normalizer from SPM to be 100% compliant with arbitrary models.
* Adding a new `Precompiled` normalizer that replaces `SpmNmtNfkc`.
  - It is used for direct compatibility with `Spm`, replacing all their custom rules by using the normalizer spec embedded within spm files directly, removing any need for hand-written rules on our side.
  - We need the `nom` dependency to parse the binary format of `spm`.
  - We need to add the `sentencepiece_model_pb2.py` file to be able to read the proto file.
  - We reimplemented their `Darts::DoubleArray` compact trie format.
* Fixing a bug in the Precompiled normalizer.
* Fixing some edge cases (now in tests) with this weird precompiled normalizer. It seems even a very hand-crafted trie does not prevent one from shooting oneself in the foot. Sorry, future reader.
* Keeping the API stable for this PR (changes to the API should come later, #409).
  - Removed sentencepiece_model_pb2 from the binding and added instructions to make `from_spm` work (see the usage sketch below).
* Adding a model check in `from_spm`.
* Addressing @n1t0's comments.
* Adding a check to make sure alignments stay correct, plus a bit more documentation on how Precompiled works.
* Extracting `Precompiled` into its own `spm_precompiled` crate.
* Using ranges in `do_nmt`.
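As a reference for the `from_spm` instructions mentioned above, here is a hedged sketch of how the conversion might be called from the Python binding. It assumes a generated `sentencepiece_model_pb2.py` (and `protobuf`) is available, as the commit notes, and `unigram.model` is a placeholder path; the exact signature may differ from the released API.

from tokenizers.implementations import SentencePieceUnigramTokenizer

# Convert an existing SentencePiece model. The `Precompiled` normalizer embedded
# in the .model file replaces the previous hand-written `SpmNmtNfkc` rules.
tokenizer = SentencePieceUnigramTokenizer.from_spm("unigram.model")  # placeholder path

encoding = tokenizer.encode("Hello, world!")
print(encoding.tokens)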
57 lines · 2.1 KiB · Python
from setuptools import setup
from setuptools_rust import Binding, RustExtension

extras = {}
extras["testing"] = ["pytest"]

setup(
    name="tokenizers",
    version="0.9.0.dev1",
    description="Fast and Customizable Tokenizers",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    keywords="NLP tokenizer BPE transformer deep learning",
    author="Anthony MOI",
    author_email="anthony@huggingface.co",
    url="https://github.com/huggingface/tokenizers",
    license="Apache License 2.0",
    rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3, debug=False)],
    extras_require=extras,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    package_dir={"": "py_src"},
    packages=[
        "tokenizers",
        "tokenizers.models",
        "tokenizers.decoders",
        "tokenizers.normalizers",
        "tokenizers.pre_tokenizers",
        "tokenizers.processors",
        "tokenizers.trainers",
        "tokenizers.implementations",
    ],
    package_data={
        "tokenizers": ["py.typed", "__init__.pyi"],
        "tokenizers.models": ["py.typed", "__init__.pyi"],
        "tokenizers.decoders": ["py.typed", "__init__.pyi"],
        "tokenizers.normalizers": ["py.typed", "__init__.pyi"],
        "tokenizers.pre_tokenizers": ["py.typed", "__init__.pyi"],
        "tokenizers.processors": ["py.typed", "__init__.pyi"],
        "tokenizers.trainers": ["py.typed", "__init__.pyi"],
        "tokenizers.implementations": ["py.typed"],
    },
    zip_safe=False,
)
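After building the extension in place (for example with `pip install -e .`, which is ordinary setuptools-rust usage rather than anything mandated by this file), a quick sanity check might look like the sketch below: it only verifies that the compiled `tokenizers.tokenizers` module and the typed subpackages import correctly.

import tokenizers

# Importing succeeds only if the PyO3 extension declared in `rust_extensions`
# was compiled and installed correctly.
print(tokenizers.__version__)  # expected to match the `version` field above

# The subpackages listed in `packages` re-export types from the Rust extension.
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())  # an empty BPE model, just to exercise the binding
print(type(tokenizer))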