tokenizers/bindings/python/Makefile

.PHONY: style check-style test

DATA_DIR = data

dir_guard=@mkdir -p $(@D)
check_dirs := examples py_src/tokenizers tests

# Format source code automatically
style:
	python stub.py
	ruff check  $(check_dirs) --fix
	ruff format $(check_dirs)

# Check the source code is formatted correctly
check-style:
	python stub.py --check
	ruff check examples py_src/tokenizers tests
	ruff format --check examples py_src/tokenizers tests

TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json

# Launch the test suite
test: $(TESTS_RESOURCES)
	pip install pytest requests setuptools_rust numpy pyarrow datasets
	python -m pytest -s -v tests
	cargo test --no-default-features

$(DATA_DIR)/big.txt :
	$(dir_guard)
	wget https://norvig.com/big.txt -O $@

$(DATA_DIR)/small.txt : $(DATA_DIR)/big.txt
	head -100 $(DATA_DIR)/big.txt > $@

$(DATA_DIR)/roberta.json :
	$(dir_guard)
	wget https://huggingface.co/roberta-large/raw/main/tokenizer.json -O $@