🚨 Support updating template processors (#1652)

* current updates

* simplify

* set_item works, but `tokenizer._tokenizer.post_processor[1].single = ["$0", "</s>"]` does not!

* fix: `normalizers` deserialization and other refactoring

* fix: `pre_tokenizer` deserialization

* feat: add `__len__` implementation for `normalizer::PySequence`

* feat: add `__setitem__` impl for `normalizers::PySequence`

* feat: add `__setitem__` impl to `pre_tokenizer::PySequence`

* feat: add `__setitem__` impl to `post_processor::PySequence`

* test: add normalizer sequence setter check

* refactor: allow unused `processors::setter` macro

* test: add `__setitem__` test for processors & pretok

* refactor: `unwrap` -> `PyException::new_err()?`

* refactor: fmt

* refactor: remove unnecessary `pub`

* feat(bindings): add missing getters & setters for pretoks

* feat(bindings): add missing getters & setters for processors

* refactor(bindings): rewrite RwLock poison error msg

* refactor: remove debug print

* feat(bindings): add description as to why custom deser is needed

* feat: make post proc sequence elements mutable

* fix(binding): serialization

---------

Co-authored-by: Luc Georges <luc.sydney.georges@gmail.com>
This commit is contained in:
Arthur
2025-01-28 14:58:35 +01:00
committed by GitHub
parent e7ed39de3c
commit c45aebd102
22 changed files with 1013 additions and 181 deletions

View File

@@ -2,8 +2,8 @@
name = 'tokenizers'
requires-python = '>=3.9'
authors = [
{name = 'Nicolas Patry', email = 'patry.nicolas@protonmail.com'},
{name = 'Anthony Moi', email = 'anthony@huggingface.co'}
{ name = 'Nicolas Patry', email = 'patry.nicolas@protonmail.com' },
{ name = 'Anthony Moi', email = 'anthony@huggingface.co' },
]
classifiers = [
"Development Status :: 5 - Production/Stable",
@@ -21,12 +21,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
keywords = ["NLP", "tokenizer", "BPE", "transformer", "deep learning"]
dynamic = [
'description',
'license',
'readme',
'version',
]
dynamic = ['description', 'license', 'readme', 'version']
dependencies = ["huggingface_hub>=0.16.4,<1.0"]
[project.urls]
@@ -58,16 +53,16 @@ target-version = ['py35']
line-length = 119
target-version = "py311"
lint.ignore = [
# a == None in tests vs is None.
"E711",
# a == False in tests vs is False.
"E712",
# try.. import except.. pattern without using the lib.
"F401",
# Raw type equality is required in asserts
"E721",
# Import order
"E402",
# Fixtures unused import
"F811",
# a == None in tests vs is None.
"E711",
# a == False in tests vs is False.
"E712",
# try.. import except.. pattern without using the lib.
"F401",
# Raw type equality is required in asserts
"E721",
# Import order
"E402",
# Fixtures unused import
"F811",
]