From 171a042ee00f9e63f04aa1482ef46235c1e4131b Mon Sep 17 00:00:00 2001
From: Anthony MOI 
Date: Thu, 24 Sep 2020 10:16:18 -0400
Subject: [PATCH] Python - Bump version for dev4 release

---
 bindings/python/CHANGELOG.md                  |  2 +-
 bindings/python/Cargo.lock                    |  2 +-
 bindings/python/Cargo.toml                    |  2 +-
 bindings/python/examples/test.py              | 38 +++++++++++++++++++
 bindings/python/py_src/tokenizers/__init__.py |  2 +-
 bindings/python/setup.py                      |  2 +-
 6 files changed, 43 insertions(+), 5 deletions(-)
 create mode 100644 bindings/python/examples/test.py

diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md
index ec66c02c..6117c465 100644
--- a/bindings/python/CHANGELOG.md
+++ b/bindings/python/CHANGELOG.md
@@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [0.9.0-dev1]
+## [0.9.0-dev4]
 
 ### Fixed
 - [#362]: Fix training deadlock with Python components.
diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
index 3e7ed0ab..175a2346 100644
--- a/bindings/python/Cargo.lock
+++ b/bindings/python/Cargo.lock
@@ -890,7 +890,7 @@ dependencies = [
 
 [[package]]
 name = "tokenizers-python"
-version = "0.9.0-dev1"
+version = "0.9.0-dev4"
 dependencies = [
  "env_logger 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "libc 0.2.77 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml
index afa499b6..86b26bbb 100644
--- a/bindings/python/Cargo.toml
+++ b/bindings/python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.9.0-dev1"
+version = "0.9.0-dev4"
 authors = ["Anthony MOI "]
 edition = "2018"
 
diff --git a/bindings/python/examples/test.py b/bindings/python/examples/test.py
new file mode 100644
index 00000000..194a3751
--- /dev/null
+++ b/bindings/python/examples/test.py
@@ -0,0 +1,38 @@
+# from tokenizers import Tokenizer
+# from tokenizers.models import BPE
+# from tokenizers.pre_tokenizers import ByteLevel
+# from tokenizers.normalizers import NFKC, NFC, Lowercase, Sequence
+#
+# tok = Tokenizer(BPE("../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt"))
+# tok.pre_tokenizer = ByteLevel()
+# tok.normalizer = Sequence([NFC(), NFKC()])
+#
+# tok.save("THE_TEST.tokenizer.json", pretty=True)
+# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)
+#
+# tok = Tokenizer.from_file("THE_TEST.tokenizer.json")
+# # with open("THE_TEST.tokenizer.json", "r") as f:
+# #     t = f.read()
+# # tok = Tokenizer.from_str(t)
+# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)
+
+from tokenizers import Tokenizer
+from tokenizers.implementations import BaseTokenizer
+from transformers import PreTrainedTokenizerFast, LineByLineTextDataset
+
+# tokenizer = Tokenizer(
+#     BPE("../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt")
+# )
+tokenizer = Tokenizer.from_file("../../data/roberta-tok.tokenizer")
+print(tokenizer.encode("Hello there!").tokens)
+
+tok_transformers = PreTrainedTokenizerFast(BaseTokenizer(tokenizer))
+print(tok_transformers.tokenize("Hello there!"))
+
+dataset = LineByLineTextDataset(tokenizer=tok_transformers, file_path="../../data/botchan.txt", block_size=12)
+
+
+# tokenizer = ByteLevelBPETokenizer.from_files(
+#     "../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt"
+# )
+# print(tokenizer.encode("Hello there!").tokens)
diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py
index d43e3a35..5776263d 100644
--- a/bindings/python/py_src/tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.9.0.dev1"
+__version__ = "0.9.0.dev4"
 
 from typing import Tuple, Union, Tuple, List
 from enum import Enum
diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index aca7e6ed..c852d7d8 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -6,7 +6,7 @@ extras["testing"] = ["pytest"]
 
 setup(
     name="tokenizers",
-    version="0.9.0.dev3",
+    version="0.9.0.dev4",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",