Makefile
.PHONY: style check-style test

# Directory where the test fixtures are downloaded
DATA_DIR = data

# Create the target's parent directory if it does not exist
dir_guard=@mkdir -p $(@D)

# Format source code automatically
style:
	npm run lint

# Check the source code is formatted correctly
check-style:
	npm run lint-check

# Fixtures required by the test suite
TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json

# Launch the test suite
test: $(TESTS_RESOURCES)
	npm run test

# Corpus used to build the small test file
$(DATA_DIR)/big.txt :
	$(dir_guard)
	wget https://norvig.com/big.txt -O $@

# First 100 lines of the corpus, used as a small test file
$(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
	head -100 $(DATA_DIR)/big.txt > $@

# Serialized roberta-large tokenizer
$(DATA_DIR)/roberta.json :
	$(dir_guard)
	wget https://huggingface.co/roberta-large/raw/main/tokenizer.json -O $@

# Tokenizer used by the documentation quicktour
$(DATA_DIR)/tokenizer-wiki.json :
	$(dir_guard)
	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-quicktour/tokenizer.json -O $@

# Tokenizer used by the documentation pipeline page
$(DATA_DIR)/bert-wiki.json :
	$(dir_guard)
	wget https://s3.amazonaws.com/models.huggingface.co/bert/anthony/doc-pipeline/tokenizer.json -O $@
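Running `make test` first downloads the four fixtures listed in `TESTS_RESOURCES`, then runs `npm run test`. As a rough sketch of how one of those fixtures could be consumed from the Node bindings (not taken from this repository's test suite): it assumes the `tokenizers` npm package exposes `Tokenizer.fromFile`, an async `encode`, and `Encoding.getTokens` (exact signatures vary between versions of the bindings), and that `make test` has already placed `data/tokenizer-wiki.json` on disk.

// Minimal, hypothetical sketch: the exact API of the `tokenizers`
// package is an assumption and may differ between versions.
import { Tokenizer } from "tokenizers";

async function main() {
  // Load the serialized tokenizer fetched by `make test`
  const tokenizer = await Tokenizer.fromFile("data/tokenizer-wiki.json");

  // Encode a sample sentence and print the produced tokens
  const encoding = await tokenizer.encode("Hello, how are you?");
  console.log(encoding.getTokens());
}

main().catch(console.error);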