mirror of https://github.com/mii443/tokenizers.git
synced 2025-09-02 15:29:21 +00:00

Python - Bump version for dev4 release
bindings/python/CHANGELOG.md

@@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
-## [0.9.0-dev1]
+## [0.9.0-dev4]
 
 ### Fixed
 - [#362]: Fix training deadlock with Python components.
bindings/python/Cargo.lock (generated, 2 lines changed)

@@ -890,7 +890,7 @@ dependencies = [
 
 [[package]]
 name = "tokenizers-python"
-version = "0.9.0-dev1"
+version = "0.9.0-dev4"
 dependencies = [
  "env_logger 0.7.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "libc 0.2.77 (registry+https://github.com/rust-lang/crates.io-index)",
bindings/python/Cargo.toml

@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers-python"
-version = "0.9.0-dev1"
+version = "0.9.0-dev4"
 authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
 edition = "2018"
bindings/python/examples/test.py (new file, 38 lines)

@@ -0,0 +1,38 @@
+# from tokenizers import Tokenizer
+# from tokenizers.models import BPE
+# from tokenizers.pre_tokenizers import ByteLevel
+# from tokenizers.normalizers import NFKC, NFC, Lowercase, Sequence
+#
+# tok = Tokenizer(BPE("../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt"))
+# tok.pre_tokenizer = ByteLevel()
+# tok.normalizer = Sequence([NFC(), NFKC()])
+#
+# tok.save("THE_TEST.tokenizer.json", pretty=True)
+# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)
+#
+# tok = Tokenizer.from_file("THE_TEST.tokenizer.json")
+# # with open("THE_TEST.tokenizer.json", "r") as f:
+# #     t = f.read()
+# # tok = Tokenizer.from_str(t)
+# print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens)
+
+from tokenizers import Tokenizer
+from tokenizers.implementations import BaseTokenizer
+from transformers import PreTrainedTokenizerFast, LineByLineTextDataset
+
+# tokenizer = Tokenizer(
+#     BPE("../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt")
+# )
+tokenizer = Tokenizer.from_file("../../data/roberta-tok.tokenizer")
+print(tokenizer.encode("Hello there!").tokens)
+
+tok_transformers = PreTrainedTokenizerFast(BaseTokenizer(tokenizer))
+print(tok_transformers.tokenize("Hello there!"))
+
+dataset = LineByLineTextDataset(tokenizer=tok_transformers, file_path="../../data/botchan.txt", block_size=12)
+
+
+# tokenizer = ByteLevelBPETokenizer.from_files(
+#     "../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt"
+# )
+# print(tokenizer.encode("Hello there!").tokens)
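Aside: the commented-out block at the top of the new example exercises the save/load
round-trip for tokenizer serialization. A minimal standalone sketch of that round-trip,
using only calls that appear in the example itself (Tokenizer.save, Tokenizer.from_file,
Tokenizer.from_str); the empty BPE() constructor and the output file name are
illustrative assumptions, not part of this commit:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    # A trivial, untrained tokenizer is enough to exercise serialization
    # (assumes BPE() is constructible without vocab/merges in this build).
    tok = Tokenizer(BPE())

    # Serialize to pretty-printed JSON, then load it back two ways.
    tok.save("example.tokenizer.json", pretty=True)
    reloaded = Tokenizer.from_file("example.tokenizer.json")
    with open("example.tokenizer.json", "r") as f:
        also_reloaded = Tokenizer.from_str(f.read())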
@@ -1,4 +1,4 @@
-__version__ = "0.9.0.dev1"
+__version__ = "0.9.0.dev4"
 
 from typing import Tuple, Union, Tuple, List
 from enum import Enum
bindings/python/setup.py

@@ -6,7 +6,7 @@ extras["testing"] = ["pytest"]
 
 setup(
     name="tokenizers",
-    version="0.9.0.dev3",
+    version="0.9.0.dev4",
     description="Fast and Customizable Tokenizers",
     long_description=open("README.md", "r", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
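Aside: the Python-facing version strings had drifted apart before this commit
(Cargo.toml and __init__.py at dev1, setup.py at dev3); the bump aligns all of them
to dev4. A quick sanity check after installing the dev build, as a hypothetical
snippet rather than part of the commit:

    import tokenizers

    # PEP 440 normalizes the version string as "0.9.0.dev4".
    assert tokenizers.__version__ == "0.9.0.dev4", tokenizers.__version__
    print(tokenizers.__version__)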