.github/workflows/python.yml (22 changed lines)

@@ -59,16 +59,24 @@ jobs:
           path: ./bindings/python/target
           key: ${{ runner.os }}-cargo-python-build-${{ hashFiles('**/Cargo.toml') }}

-      - name: Build
-        uses: actions-rs/cargo@v1
-        with:
-          toolchain: nightly
-          command: build
-          args: --verbose --manifest-path ./bindings/python/Cargo.toml
-
       - name: Lint with RustFmt
         uses: actions-rs/cargo@v1
         with:
           toolchain: nightly
           command: fmt
           args: --manifest-path ./bindings/python/Cargo.toml -- --check
+
+      - name: Install Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.6
+          architecture: "x64"
+
+      - name: Run tests
+        working-directory: ./bindings/python
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          pip install pytest requests maturin
+          maturin develop --release
+          make test
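Note: dropping the standalone cargo `Build` step does not leave the extension unbuilt for the new test job; `maturin develop --release` compiles the Rust crate and installs it into the virtualenv before `make test` runs.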
.gitignore (1 changed line)

@@ -8,6 +8,7 @@ Cargo.lock

 /data
 tokenizers/data
+bindings/python/tests/data
 /docs

 __pycache__
bindings/python/Makefile

@@ -1,9 +1,13 @@
-.PHONY: style check-style
+.PHONY: style check-style test

 # Format source code automatically
 style:
-	black --line-length 100 --target-version py35 examples tokenizers
+	black --line-length 100 --target-version py35 examples tokenizers tests

+# Check the source code is formatted correctly
 check-style:
-	black --check --line-length 100 --target-version py35 examples tokenizers
+	black --check --line-length 100 --target-version py35 examples tokenizers tests
+
+# Launch the test suite
+test:
+	python -m pytest -s -v tests
bindings/python/setup.py

@@ -1,6 +1,9 @@
 from setuptools import setup
 from setuptools_rust import Binding, RustExtension

+extras = {}
+extras["testing"] = ["pytest"]
+
 setup(
     name="tokenizers",
     version="0.7.0-rc3",

@@ -13,6 +16,7 @@ setup(
     url="https://github.com/huggingface/tokenizers",
     license="Apache License 2.0",
     rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3)],
+    extras_require=extras,
     classifiers=[
         "Development Status :: 5 - Production/Stable",
         "Intended Audience :: Developers",
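With the `testing` extra in place, the test dependencies can also be pulled in through the standard extras mechanism (e.g. `pip install -e ".[testing]"` from `bindings/python`); the CI job above simply installs `pytest` directly.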
@@ -107,7 +107,7 @@ impl BPEDecoder {
     #[new]
     #[args(kwargs = "**")]
     fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
-        let mut suffix = String::from("</w");
+        let mut suffix = String::from("</w>");

         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
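The one-character fix above corrects the default suffix of `BPEDecoder` from the truncated `"</w"` to the usual end-of-word marker `"</w>"`. A minimal sketch of the resulting Python-side behaviour, mirroring `TestBPEDecoder.test_decoding` further down in this diff:

    from tokenizers.decoders import BPEDecoder

    # With the corrected default suffix "</w>", end-of-word markers are
    # recognised and turned back into spaces when decoding.
    decoder = BPEDecoder()
    assert decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"]) == "My name is John"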
@@ -288,4 +288,11 @@ impl WordLevel {
             }),
         }
     }
+
+    #[staticmethod]
+    fn empty() -> Model {
+        Model {
+            model: Container::Owned(Box::new(tk::models::wordlevel::WordLevel::default())),
+        }
+    }
 }
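This gives `WordLevel` the same `empty()` constructor already exposed by the other models. A small usage sketch (the call is exercised by `test_models.py` below):

    from tokenizers import Tokenizer
    from tokenizers.models import Model, WordLevel

    model = WordLevel.empty()      # a WordLevel model with an empty vocabulary
    assert isinstance(model, Model)
    tokenizer = Tokenizer(model)   # usable wherever a Model is expected, like BPE.empty()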
@@ -46,6 +46,26 @@ impl AddedToken {
         obj.init({ AddedToken { token } });
         Ok(())
     }
+
+    #[getter]
+    fn get_content(&self) -> &str {
+        &self.token.content
+    }
+
+    #[getter]
+    fn get_rstrip(&self) -> bool {
+        self.token.rstrip
+    }
+
+    #[getter]
+    fn get_lstrip(&self) -> bool {
+        self.token.lstrip
+    }
+
+    #[getter]
+    fn get_single_word(&self) -> bool {
+        self.token.single_word
+    }
 }
 #[pyproto]
 impl PyObjectProtocol for AddedToken {

@@ -54,9 +74,17 @@ impl PyObjectProtocol for AddedToken {
     }

     fn __repr__(&self) -> PyResult<String> {
+        let bool_to_python = |p| match p {
+            true => "True",
+            false => "False",
+        };
+
         Ok(format!(
             "AddedToken(\"{}\", rstrip={}, lstrip={}, single_word={})",
-            self.token.content, self.token.rstrip, self.token.lstrip, self.token.single_word
+            self.token.content,
+            bool_to_python(self.token.rstrip),
+            bool_to_python(self.token.lstrip),
+            bool_to_python(self.token.single_word)
         ))
     }
 }
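Together, the new getters and the `True`/`False` formatting in `__repr__` make `AddedToken` introspectable from Python. A short sketch of the expected behaviour, matching `TestAddedToken` in `test_tokenizer.py` below:

    from tokenizers import AddedToken

    token = AddedToken("<mask>", rstrip=True)
    assert token.content == "<mask>"    # exposed by get_content
    assert token.rstrip and not token.lstrip and not token.single_word
    # Booleans now render Python-style rather than Rust's lowercase true/false
    assert repr(token) == 'AddedToken("<mask>", rstrip=True, lstrip=False, single_word=False)'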
bindings/python/tests/__init__.py (new, empty file)
bindings/python/tests/bindings/__init__.py (new, empty file)
bindings/python/tests/bindings/test_decoders.py (new file, 61 lines)

import pytest

from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder


class TestByteLevel:
    def test_instantiate(self):
        assert ByteLevel() is not None
        assert isinstance(ByteLevel(), Decoder)

    def test_decoding(self):
        decoder = ByteLevel()
        assert decoder.decode(["My", "Ġname", "Ġis", "ĠJohn"]) == "My name is John"


class TestWordPiece:
    def test_instantiate(self):
        assert WordPiece() is not None
        assert WordPiece(prefix="__") is not None
        assert WordPiece(cleanup=True) is not None
        assert isinstance(WordPiece(), Decoder)

    def test_decoding(self):
        decoder = WordPiece()
        assert decoder.decode(["My", "na", "##me", "is", "Jo", "##hn"]) == "My name is John"
        assert decoder.decode(["I", "'m", "Jo", "##hn"]) == "I'm John"
        decoder = WordPiece(prefix="__", cleanup=False)
        assert decoder.decode(["My", "na", "__me", "is", "Jo", "__hn"]) == "My name is John"
        assert decoder.decode(["I", "'m", "Jo", "__hn"]) == "I 'm John"


class TestMetaspace:
    def test_instantiate(self):
        assert Metaspace() is not None
        assert Metaspace(replacement="-") is not None
        with pytest.raises(Exception, match="replacement must be a character"):
            Metaspace(replacement="")
        assert Metaspace(add_prefix_space=True) is not None
        assert isinstance(Metaspace(), Decoder)

    def test_decoding(self):
        decoder = Metaspace()
        assert decoder.decode(["▁My", "▁name", "▁is", "▁John"]) == "My name is John"
        decoder = Metaspace(replacement="-", add_prefix_space=False)
        assert decoder.decode(["-My", "-name", "-is", "-John"]) == " My name is John"


class TestBPEDecoder:
    def test_instantiate(self):
        assert BPEDecoder() is not None
        assert BPEDecoder(suffix="_") is not None
        assert isinstance(BPEDecoder(), Decoder)

    def test_decoding(self):
        decoder = BPEDecoder()
        assert (
            decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"])
            == "My name is John"
        )
        decoder = BPEDecoder(suffix="_")
        assert decoder.decode(["My_", "na", "me_", "is_", "Jo", "hn_"]) == "My name is John"
bindings/python/tests/bindings/test_models.py (new file, 23 lines)

from ..utils import data_dir, roberta_files, bert_files

from tokenizers.models import Model, BPE, WordPiece, WordLevel


class TestBPE:
    def test_instantiate(self, roberta_files):
        assert isinstance(BPE.empty(), Model)
        assert isinstance(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]), Model)


class TestWordPiece:
    def test_instantiate(self, bert_files):
        assert isinstance(WordPiece.empty(), Model)
        assert isinstance(WordPiece.from_files(bert_files["vocab"]), Model)


class TestWordLevel:
    def test_instantiate(self, roberta_files):
        assert isinstance(WordLevel.empty(), Model)
        # The WordLevel model expects a vocab.json using the same format as roberta
        # so we can just try to load with this file
        assert isinstance(WordLevel.from_files(roberta_files["vocab"]), Model)
bindings/python/tests/bindings/test_normalizers.py (new file, 82 lines)

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer, Sequence, Lowercase, Strip


class TestBertNormalizer:
    def test_strip_accents(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = BertNormalizer(
            strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
        )

        output = tokenizer.normalize("Héllò")
        assert output == "Hello"

    def test_handle_chinese_chars(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = BertNormalizer(
            strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False
        )

        output = tokenizer.normalize("你好")
        assert output == " 你 好 "

    def test_clean_text(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = BertNormalizer(
            strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True
        )

        output = tokenizer.normalize("\ufeffHello")
        assert output == "Hello"

    def test_lowercase(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = BertNormalizer(
            strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False
        )

        output = tokenizer.normalize("Héllò")
        assert output == "héllò"


class TestSequence:
    def test_can_make_sequences(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Sequence([Lowercase(), Strip()])

        output = tokenizer.normalize(" HELLO ")
        assert output == "hello"


class TestLowercase:
    def test_lowercase(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Lowercase()

        output = tokenizer.normalize("HELLO")
        assert output == "hello"


class TestStrip:
    def test_left_strip(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Strip(left=True, right=False)

        output = tokenizer.normalize(" hello ")
        assert output == "hello "

    def test_right_strip(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Strip(left=False, right=True)

        output = tokenizer.normalize(" hello ")
        assert output == " hello"

    def test_full_strip(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Strip(left=True, right=True)

        output = tokenizer.normalize(" hello ")
        assert output == "hello"
bindings/python/tests/bindings/test_pre_tokenizers.py (new file, 59 lines)

import pytest

from tokenizers.pre_tokenizers import (
    PreTokenizer,
    ByteLevel,
    Whitespace,
    WhitespaceSplit,
    BertPreTokenizer,
    Metaspace,
    CharDelimiterSplit,
)


class TestByteLevel:
    def test_instantiate(self):
        assert ByteLevel() is not None
        assert ByteLevel(add_prefix_space=True) is not None
        assert ByteLevel(add_prefix_space=False) is not None
        assert isinstance(ByteLevel(), PreTokenizer)

    def test_has_alphabet(self):
        assert isinstance(ByteLevel.alphabet(), list)
        assert len(ByteLevel.alphabet()) == 256


class TestWhitespace:
    def test_instantiate(self):
        assert Whitespace() is not None
        assert isinstance(Whitespace(), PreTokenizer)


class TestWhitespaceSplit:
    def test_instantiate(self):
        assert WhitespaceSplit() is not None
        assert isinstance(WhitespaceSplit(), PreTokenizer)


class TestBertPreTokenizer:
    def test_instantiate(self):
        assert BertPreTokenizer() is not None
        assert isinstance(BertPreTokenizer(), PreTokenizer)


class TestMetaspace:
    def test_instantiate(self):
        assert Metaspace() is not None
        assert Metaspace(replacement="-") is not None
        with pytest.raises(Exception, match="replacement must be a character"):
            Metaspace(replacement="")
        assert Metaspace(add_prefix_space=True) is not None
        assert isinstance(Metaspace(), PreTokenizer)


class TestCharDelimiterSplit:
    def test_instantiate(self):
        assert CharDelimiterSplit("-") is not None
        with pytest.raises(Exception, match="delimiter must be a single character"):
            CharDelimiterSplit("")
        assert isinstance(CharDelimiterSplit(" "), PreTokenizer)
bindings/python/tests/bindings/test_processors.py (new file, 62 lines)

from ..utils import data_dir, roberta_files

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
from tokenizers.processors import PostProcessor, BertProcessing, RobertaProcessing, ByteLevel


class TestBertProcessing:
    def test_instantiate(self):
        processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
        assert processor is not None
        assert isinstance(processor, PostProcessor)

    def test_processing(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
        assert output.ids == [1, 2, 3, 0, 6, 0]


class TestRobertaProcessing:
    def test_instantiate(self):
        processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
        assert processor is not None
        assert isinstance(processor, PostProcessor)

    def test_processing(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_special_tokens(["<s>", "</s>"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))

        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
        assert output.ids == [0, 2, 3, 1, 1, 6, 1]


class TestByteLevelProcessing:
    def test_instantiate(self):
        assert ByteLevel() is not None
        assert ByteLevel(trim_offsets=True) is not None
        assert isinstance(ByteLevel(), PostProcessor)

    def test_processing(self, roberta_files):
        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

        # Keeps original offsets
        output = tokenizer.encode("My name is John")
        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

        # Trims offsets when activated
        tokenizer.post_processor = ByteLevel(trim_offsets=True)
        output = tokenizer.encode("My name is John")
        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
bindings/python/tests/bindings/test_tokenizer.py (new file, 221 lines)

from ..utils import data_dir, roberta_files

from tokenizers import AddedToken, Tokenizer
from tokenizers.models import Model, BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import RobertaProcessing
from tokenizers.normalizers import Lowercase


class TestAddedToken:
    def test_instantiate_with_content_only(self):
        added_token = AddedToken("<mask>")
        assert type(added_token) == AddedToken
        assert str(added_token) == "<mask>"
        assert (
            repr(added_token)
            == 'AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False)'
        )
        assert added_token.rstrip == False
        assert added_token.lstrip == False
        assert added_token.single_word == False

    def test_can_set_rstrip(self):
        added_token = AddedToken("<mask>", rstrip=True)
        assert added_token.rstrip == True
        assert added_token.lstrip == False
        assert added_token.single_word == False

    def test_can_set_lstrip(self):
        added_token = AddedToken("<mask>", lstrip=True)
        assert added_token.rstrip == False
        assert added_token.lstrip == True
        assert added_token.single_word == False

    def test_can_set_single_world(self):
        added_token = AddedToken("<mask>", single_word=True)
        assert added_token.rstrip == False
        assert added_token.lstrip == False
        assert added_token.single_word == True


class TestTokenizer:
    def test_has_expected_type_and_methods(self):
        tokenizer = Tokenizer(BPE.empty())
        assert type(tokenizer) == Tokenizer
        assert callable(tokenizer.num_special_tokens_to_add)
        assert callable(tokenizer.get_vocab)
        assert callable(tokenizer.get_vocab_size)
        assert callable(tokenizer.enable_truncation)
        assert callable(tokenizer.no_truncation)
        assert callable(tokenizer.enable_padding)
        assert callable(tokenizer.no_padding)
        assert callable(tokenizer.normalize)
        assert callable(tokenizer.encode)
        assert callable(tokenizer.encode_batch)
        assert callable(tokenizer.decode)
        assert callable(tokenizer.decode_batch)
        assert callable(tokenizer.token_to_id)
        assert callable(tokenizer.id_to_token)
        assert callable(tokenizer.add_tokens)
        assert callable(tokenizer.add_special_tokens)
        assert callable(tokenizer.train)
        assert callable(tokenizer.post_process)
        assert isinstance(tokenizer.model, Model)
        assert tokenizer.normalizer is None
        assert tokenizer.pre_tokenizer is None
        assert tokenizer.post_processor is None
        assert tokenizer.decoder is None

    def test_add_tokens(self):
        tokenizer = Tokenizer(BPE.empty())
        added = tokenizer.add_tokens(["my", "name", "is", "john"])
        assert added == 4

        added = tokenizer.add_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
        assert added == 2

    def test_add_special_tokens(self):
        tokenizer = Tokenizer(BPE.empty())

        # Can add special tokens as `str`
        added = tokenizer.add_special_tokens(["my", "name", "is", "john"])
        assert added == 4

        # Can add special tokens as `AddedToken`
        added = tokenizer.add_special_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
        assert added == 2

    def test_encode(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can encode single sequence
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name", "is", "john"]
        assert type(output.ids) == list
        assert type(output.type_ids) == list
        assert type(output.offsets) == list
        assert type(output.words) == list
        assert type(output.special_tokens_mask) == list
        assert type(output.attention_mask) == list
        assert type(output.overflowing) == list

        # Can encode a pair of sequences
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "name", "is", "john", "pair"]

        # Can encode a batch with both a single sequence and a pair of sequences
        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
        assert len(output) == 2

    def test_encode_add_special_tokens(self, roberta_files):
        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.add_special_tokens(["<s>", "</s>"])

        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
        tokenizer.post_processor = RobertaProcessing(
            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
        )

        # Can encode with special tokens
        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
        assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]

        # Can encode without special tokens
        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
        assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]

    def test_truncation(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)

        # Can truncate single sequences
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name"]

        # Can truncate pair sequences as well
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "pair"]

    def test_padding(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # By default it does nothing when encoding single sequence
        tokenizer.enable_padding()
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name"]

        # Can pad to the longest in a batch
        output = tokenizer.encode_batch(["my name", "my name is john"])
        assert all([len(encoding) == 4 for encoding in output])

        # Can pad to the specified max length otherwise
        tokenizer.enable_padding(max_length=4)
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["my", "name", "pair", "[PAD]"]

    def test_decode(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can decode single sequences
        output = tokenizer.decode([0, 1, 2, 3])
        assert output == "my name is john"

        # Can decode batch
        output = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
        assert output == ["my name is john", "pair"]

    def test_get_vocab(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab with added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=True)
        assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4}

        # Can retrieve vocab without added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=False)
        assert vocab == {}

    def test_get_vocab_size(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab's size with added tokens
        size = tokenizer.get_vocab_size(with_added_tokens=True)
        assert size == 5

        # Can retrieve vocab's size without added tokens
        size = tokenizer.get_vocab_size(with_added_tokens=False)
        assert size == 0

    def test_normalize(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.normalizer = Lowercase()

        output = tokenizer.normalize("My Name Is John")
        assert output == "my name is john"

    def test_post_process(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)
        tokenizer.enable_padding(max_length=4)

        encoding = tokenizer.encode("my name is john")
        pair_encoding = tokenizer.encode("pair")

        # Can post process a single encoding
        output = tokenizer.post_process(encoding)
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

        # Can post process a pair of encodings
        output = tokenizer.post_process(encoding, pair_encoding)
        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
bindings/python/tests/implementations/__init__.py (new, empty file)
bindings/python/tests/implementations/test_bert_wordpiece.py (new file, 21 lines)

from ..utils import data_dir, bert_files
from tokenizers import BertWordPieceTokenizer


class TestBertWordPieceBPE:
    def test_basic_encode(self, bert_files):
        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

        # Encode with special tokens by default
        output = tokenizer.encode("My name is John", "pair")
        assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
        assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]

        # Can encode without the special tokens
        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
        assert output.ids == [2026, 2171, 2003, 2198, 3940]
        assert output.tokens == ["my", "name", "is", "john", "pair"]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
        assert output.type_ids == [0, 0, 0, 0, 1]
bindings/python/tests/implementations/test_byte_level_bpe.py (new file, 81 lines)

from ..utils import data_dir, roberta_files
from tokenizers import ByteLevelBPETokenizer


class TestByteLevelBPE:
    def test_basic_encode(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")

        assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
        assert output.tokens == [
            "The",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
        assert output.offsets == [
            (0, 3),
            (3, 9),
            (9, 15),
            (15, 19),
            (19, 25),
            (25, 30),
            (30, 34),
            (34, 39),
            (39, 43),
        ]

    def test_add_prefix_space(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(
            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
        )
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")

        assert output.ids == [20, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
        assert output.tokens == [
            "ĠThe",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
        assert output.offsets == [
            (0, 3),
            (3, 9),
            (9, 15),
            (15, 19),
            (19, 25),
            (25, 30),
            (30, 34),
            (34, 39),
            (39, 43),
        ]

    def test_lowerspace(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(
            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
        )
        output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")

        assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
        assert output.tokens == [
            "Ġthe",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
bindings/python/tests/implementations/test_char_bpe.py (new file, 44 lines)

from ..utils import data_dir, openai_files
from tokenizers import CharBPETokenizer


class TestBertWordPieceBPE:
    def test_basic_encode(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])

        output = tokenizer.encode("My name is John", "pair")
        assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
        assert output.tokens == [
            "<unk>",
            "y</w>",
            "name</w>",
            "is</w>",
            "<unk>",
            "o",
            "hn</w>",
            "pair</w>",
        ]
        assert output.offsets == [
            (0, 1),
            (1, 2),
            (3, 7),
            (8, 10),
            (11, 12),
            (12, 13),
            (13, 15),
            (0, 4),
        ]
        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]

    def test_lowercase(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
        assert output.ids == [547, 1362, 544, 2476, 2688]
        assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
        assert output.type_ids == [0, 0, 0, 0, 1]

    def test_decoding(self, openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
        decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
        assert decoded == "my name is john"
bindings/python/tests/utils.py (new file, 58 lines)

import os
import requests
import pytest

DATA_PATH = os.path.join("tests", "data")


def download(url):
    filename = url.rsplit("/")[-1]
    filepath = os.path.join(DATA_PATH, filename)
    if not os.path.exists(filepath):
        with open(filepath, "wb") as f:
            response = requests.get(url, stream=True)
            response.raise_for_status()
            for chunk in response.iter_content(1024):
                f.write(chunk)
    return filepath


@pytest.fixture(scope="session")
def data_dir():
    assert os.getcwd().endswith("python")
    exist = os.path.exists(DATA_PATH) and os.path.isdir(DATA_PATH)
    if not exist:
        os.mkdir(DATA_PATH)


@pytest.fixture(scope="session")
def roberta_files(data_dir):
    return {
        "vocab": download(
            "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"
        ),
        "merges": download(
            "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"
        ),
    }


@pytest.fixture(scope="session")
def bert_files(data_dir):
    return {
        "vocab": download(
            "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
        ),
    }


@pytest.fixture(scope="session")
def openai_files(data_dir):
    return {
        "vocab": download(
            "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"
        ),
        "merges": download(
            "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
        ),
    }
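The fixtures are session-scoped and chained: `roberta_files`, `bert_files` and `openai_files` all request `data_dir`, so the download directory is created once before any file is fetched. Because they live outside a `conftest.py`, each test module imports them explicitly (including the seemingly unused `data_dir`) so pytest can resolve the dependency. A minimal sketch of a consuming test, patterned on `test_char_bpe.py` above:

    # data_dir must be imported even though it is only used indirectly
    from ..utils import data_dir, openai_files

    from tokenizers import CharBPETokenizer


    def test_round_trip(openai_files):
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
        assert tokenizer.decode(tokenizer.encode("my name is john").ids) == "my name is john"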
@@ -158,3 +158,7 @@ class WordLevel(Model):
             The unknown token to be used by the model.
         """
         pass
+    @staticmethod
+    def empty() -> Model:
+        """ Instantiate an empty WordLevel Model. """
+        pass