Updating python formatting. (#1079)

* Updating python formatting.

* Forgot gh action.

* Skipping isort to prevent circular imports.

* Updating stub.

* Removing `isort` (it contradicts `stub.py`).

* Fixing weird stub black/isort disagreement.
Author: Nicolas Patry (committed via GitHub)
Date: 2022-10-05 15:29:33 +02:00
Parent: 5f6e978452
Commit: 6113666624
43 changed files with 280 additions and 306 deletions
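The last two commit-message bullets concern the generated stubs: stub.py discovers the compiled submodules with inspect.getmembers and writes the import blocks itself (formatted with black), so running isort on top of that output can produce a different ordering and make the style check fail. A minimal sketch of that discovery step; everything except inspect.getmembers and the tokenizers package name is illustrative:

import inspect

import tokenizers

# inspect.getmembers returns (name, value) pairs sorted by name, so the
# submodules stub.py iterates over already come out in a fixed order.
submodules = [name for name, member in inspect.getmembers(tokenizers) if inspect.ismodule(member)]
print(submodules)  # e.g. ['decoders', 'implementations', 'models', 'normalizers', ...]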

View File

@ -102,13 +102,12 @@ jobs:
source .env/bin/activate
pip install -U pip
pip install pytest requests setuptools_rust numpy pyarrow datasets
python setup.py develop
pip install -e .[dev]
- name: Check style
working-directory: ./bindings/python
run: |
source .env/bin/activate
pip install black==20.8b1 click==8.0.4
make check-style
- name: Run tests

View File

@ -3,16 +3,17 @@
DATA_DIR = data
dir_guard=@mkdir -p $(@D)
check_dirs := examples py_src/tokenizers tests
# Format source code automatically
style:
python stub.py
black --line-length 100 --target-version py36 examples py_src/tokenizers tests
black --line-length 119 --target-version py35 $(check_dirs)
# Check the source code is formatted correctly
check-style:
python stub.py --check
black --check --line-length 100 --target-version py36 examples py_src/tokenizers tests
black --check --line-length 119 --target-version py35 examples py_src/tokenizers tests
TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
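For reference, the new check-style target amounts to the stub check plus a black check at 119 columns; a rough Python equivalent, assuming it is run from the bindings/python directory:

import subprocess

# Rough equivalent of `make check-style`; the working directory is an assumption.
cwd = "bindings/python"
subprocess.run(["python", "stub.py", "--check"], cwd=cwd, check=True)
subprocess.run(
    ["black", "--check", "--line-length", "119", "--target-version", "py35", "examples", "py_src/tokenizers", "tests"],
    cwd=cwd,
    check=True,
)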

View File

@ -1,12 +1,11 @@
import jieba
from typing import List
from tokenizers import Tokenizer, Regex, NormalizedString, PreTokenizedString
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer
import jieba
from tokenizers import NormalizedString, PreTokenizedString, Regex, Tokenizer
from tokenizers.decoders import Decoder
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import PreTokenizer
class JiebaPreTokenizer:
@ -21,9 +20,7 @@ class JiebaPreTokenizer:
# We can also easily do it in one line:
# return [normalized_string[w[1] : w[2]] for w in jieba.tokenize(str(normalized_string))]
def odd_number_split(
self, i: int, normalized_string: NormalizedString
) -> List[NormalizedString]:
def odd_number_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
# Just an odd example...
splits = []
last = 0

View File

@ -1,18 +1,19 @@
import time
import argparse
import logging
import time
from tqdm import tqdm
import logging
logging.getLogger("transformers").disabled = True
logging.getLogger("transformers.tokenization_utils").disabled = True
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE, WordPiece
from tokenizers.processors import BertProcessing
from tokenizers.normalizers import BertNormalizer
from tokenizers.processors import BertProcessing
from transformers import BertTokenizer, GPT2Tokenizer
from transformers import GPT2Tokenizer, BertTokenizer
parser = argparse.ArgumentParser()
parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")

View File

@ -3,6 +3,7 @@ import glob
from tokenizers import BertWordPieceTokenizer
parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
@ -19,9 +20,7 @@ parser.add_argument(
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument(
"--name", default="bert-wordpiece", type=str, help="The name of the output vocab files"
)
parser.add_argument("--name", default="bert-wordpiece", type=str, help="The name of the output vocab files")
args = parser.parse_args()
files = glob.glob(args.files)

View File

@ -4,6 +4,7 @@ from os.path import join
from tokenizers import ByteLevelBPETokenizer
parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
@ -20,9 +21,7 @@ parser.add_argument(
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument(
"--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files"
)
parser.add_argument("--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files")
args = parser.parse_args()
files = glob.glob(args.files)

View File

@ -1,5 +1,7 @@
import datasets
from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
# Build a tokenizer
bpe_tokenizer = Tokenizer(models.BPE())

View File

@ -1,7 +1,8 @@
__version__ = "0.13.1.dev0"
from typing import Tuple, Union, Tuple, List
from enum import Enum
from typing import List, Tuple, Union
Offsets = Tuple[int, int]
@ -77,25 +78,24 @@ class SplitDelimiterBehavior(Enum):
from .tokenizers import (
Tokenizer,
Encoding,
AddedToken,
Regex,
Encoding,
NormalizedString,
PreTokenizedString,
Regex,
Token,
Tokenizer,
decoders,
models,
normalizers,
pre_tokenizers,
processors,
trainers,
)
from .tokenizers import decoders
from .tokenizers import models
from .tokenizers import normalizers
from .tokenizers import pre_tokenizers
from .tokenizers import processors
from .tokenizers import trainers
from .implementations import (
BertWordPieceTokenizer,
ByteLevelBPETokenizer,
CharBPETokenizer,
SentencePieceBPETokenizer,
SentencePieceUnigramTokenizer,
BertWordPieceTokenizer,
)

View File

@ -709,13 +709,7 @@ class Tokenizer:
"""
pass
def enable_padding(
self,
direction="right",
pad_id=0,
pad_type_id=0,
pad_token="[PAD]",
length=None,
pad_to_multiple_of=None,
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
):
"""
Enable the padding
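The collapsed signature above is Tokenizer.enable_padding; a minimal usage sketch (the tiny word-level vocabulary is made up for the example):

from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

# Tiny made-up vocabulary so the sketch is self-contained.
vocab = {"[UNK]": 0, "[PAD]": 1, "hello": 2, "world": 3}
tokenizer = Tokenizer(WordLevel(vocab, unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Pad every encoding to length 8 with the [PAD] token (id 1).
tokenizer.enable_padding(pad_id=1, pad_token="[PAD]", length=8)

encoding = tokenizer.encode("hello world")
print(encoding.tokens)          # ['hello', 'world', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
print(encoding.attention_mask)  # [1, 1, 0, 0, 0, 0, 0, 0]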

View File

@ -1,5 +1,6 @@
from .. import decoders
Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece

View File

@ -1,6 +1,6 @@
from .base_tokenizer import BaseTokenizer
from .bert_wordpiece import BertWordPieceTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .char_level_bpe import CharBPETokenizer
from .sentencepiece_bpe import SentencePieceBPETokenizer
from .sentencepiece_unigram import SentencePieceUnigramTokenizer
from .bert_wordpiece import BertWordPieceTokenizer

View File

@ -1,11 +1,12 @@
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
from typing import Dict, List, Optional, Tuple, Union
from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer
from tokenizers.decoders import Decoder
from tokenizers.models import Model
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.processors import PostProcessor
from tokenizers.decoders import Decoder
from typing import List, Union, Tuple, Optional, Dict
Offsets = Tuple[int, int]
@ -109,9 +110,7 @@ class BaseTokenizer:
"""
return self._tokenizer.padding
def enable_truncation(
self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
):
def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
"""Change the truncation options
Args:
@ -270,9 +269,7 @@ class BaseTokenizer:
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> str:
def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> str:
"""Decode the list of sequences to a list of string sequences
Args:

View File

@ -1,11 +1,12 @@
from tokenizers import Tokenizer, AddedToken, decoders, trainers
from typing import Dict, Iterator, List, Optional, Union
from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union, Dict, Iterator
from .base_tokenizer import BaseTokenizer
class BertWordPieceTokenizer(BaseTokenizer):
@ -59,9 +60,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
if cls_token_id is None:
raise TypeError("cls_token not found in the vocabulary")
tokenizer.post_processor = BertProcessing(
(str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
)
tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
parameters = {

View File

@ -1,16 +1,10 @@
from tokenizers import (
Tokenizer,
AddedToken,
pre_tokenizers,
decoders,
trainers,
processors,
)
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer
from typing import Dict, Iterator, List, Optional, Tuple, Union
from typing import Optional, List, Union, Dict, Tuple, Iterator
from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer
class ByteLevelBPETokenizer(BaseTokenizer):

View File

@ -1,14 +1,9 @@
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from ..models import BPE
from ..normalizers import (
Sequence,
Lowercase,
unicode_normalizer_from_str,
BertNormalizer,
)
from .base_tokenizer import BaseTokenizer
from typing import Dict, Iterator, List, Optional, Tuple, Union
from typing import Optional, List, Union, Dict, Tuple, Iterator
from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from ..models import BPE
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
from .base_tokenizer import BaseTokenizer
class CharBPETokenizer(BaseTokenizer):

View File

@ -1,9 +1,10 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from typing import Dict, Iterator, List, Optional, Tuple, Union
from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union, Dict, Tuple, Iterator
from .base_tokenizer import BaseTokenizer
class SentencePieceBPETokenizer(BaseTokenizer):
@ -23,9 +24,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
fuse_unk: Optional[bool] = False,
):
if vocab is not None and merges is not None:
tokenizer = Tokenizer(
BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk)
)
tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
else:
tokenizer = Tokenizer(BPE())
@ -33,12 +32,8 @@ class SentencePieceBPETokenizer(BaseTokenizer):
tokenizer.add_special_tokens([str(unk_token)])
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
parameters = {
"model": "SentencePieceBPE",

View File

@ -1,10 +1,11 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, normalizers, Regex
import os
from tokenizers.models import Unigram
import json
from .base_tokenizer import BaseTokenizer
import os
from typing import Iterator, List, Optional, Union
from typing import Optional, List, Union, Iterator
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
from tokenizers.models import Unigram
from .base_tokenizer import BaseTokenizer
class SentencePieceUnigramTokenizer(BaseTokenizer):
@ -28,12 +29,8 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
tokenizer.normalizer = normalizers.Sequence(
[normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
parameters = {
"model": "SentencePieceUnigram",
@ -181,12 +178,8 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
normalizers.Replace(Regex(" {2,}"), " "),
]
)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
)
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
parameters = {
"model": "SentencePieceUnigram",

View File

@ -1,5 +1,6 @@
from .. import normalizers
Normalizer = normalizers.Normalizer
BertNormalizer = normalizers.BertNormalizer
NFD = normalizers.NFD
@ -21,9 +22,7 @@ NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
if normalizer not in NORMALIZERS:
raise ValueError(
"{} is not a known unicode normalizer. Available are {}".format(
normalizer, NORMALIZERS.keys()
)
"{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
)
return NORMALIZERS[normalizer]()

View File

@ -63,9 +63,7 @@ class BertNormalizer(Normalizer):
Whether to lowercase.
"""
def __init__(
self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True
):
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
pass
def normalize(self, normalized):
"""

View File

@ -1 +1 @@
from .visualizer import EncodingVisualizer, Annotation
from .visualizer import Annotation, EncodingVisualizer

View File

@ -1,11 +1,11 @@
import os
import itertools
import os
import re
from typing import List, Optional, Tuple, Dict, Callable, Any, NamedTuple
from string import Template
from typing import List
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
from tokenizers import Encoding, Tokenizer
from tokenizers import Tokenizer, Encoding
dirname = os.path.dirname(__file__)
css_filename = os.path.join(dirname, "visualizer-styles.css")
@ -91,7 +91,7 @@ class EncodingVisualizer:
):
if default_to_notebook:
try:
from IPython.core.display import display, HTML
from IPython.core.display import HTML, display
except ImportError as e:
raise Exception(
"""We couldn't import IPython utils for html display.
@ -135,7 +135,7 @@ class EncodingVisualizer:
final_default_to_notebook = default_to_notebook
if final_default_to_notebook:
try:
from IPython.core.display import display, HTML
from IPython.core.display import HTML, display
except ImportError as e:
raise Exception(
"""We couldn't import IPython utils for html display.
@ -174,9 +174,7 @@ class EncodingVisualizer:
h = 10
colors = {}
for label in sorted(
labels
): # sort so we always get the same colors for a given set of labels
for label in sorted(labels): # sort so we always get the same colors for a given set of labels
colors[label] = f"hsl({h},{s}%,{l}%"
h += h_step
return colors
@ -234,10 +232,7 @@ class EncodingVisualizer:
else:
# Like above, but a different color so we can see the tokens alternate
css_classes.append("even-token")
if (
EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix])
is not None
):
if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
# This is a special token that is in the text. probably UNK
css_classes.append("special-token")
# TODO is this the right name for the data attribute ?
@ -289,9 +284,7 @@ class EncodingVisualizer:
anno = annotations[cur_anno_ix]
label = anno.label
color = label_colors_dict[label]
spans.append(
f'<span class="annotation" style="color:{color}" data-label="{label}">'
)
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
prev_anno_ix = cur_anno_ix
if cs.partition_key() == current_consecutive_chars[0].partition_key():
@ -342,9 +335,7 @@ class EncodingVisualizer:
return annotation_map
@staticmethod
def __make_char_states(
text: str, encoding: Encoding, annotations: AnnotationList
) -> List[CharState]:
def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
"""
For each character in the original text, we emit a tuple representing its "state":

View File

@ -3,5 +3,5 @@ requires = ["setuptools", "wheel", "setuptools-rust"]
build-backend = "setuptools.build_meta"
[tool.black]
target-version = ['py36']
line-length = 100
target-version = ['py35']
line-length = 119

bindings/python/setup.cfg (new file, 54 lines)
View File

@ -0,0 +1,54 @@
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = transformers
known_third_party =
absl
conllu
datasets
elasticsearch
fairseq
faiss-cpu
fastprogress
fire
fugashi
git
h5py
matplotlib
nltk
numpy
packaging
pandas
PIL
psutil
pytest
pytorch_lightning
rouge_score
sacrebleu
seqeval
sklearn
streamlit
tensorboardX
tensorflow
tensorflow_datasets
timeout_decorator
torch
torchaudio
torchtext
torchvision
torch_xla
tqdm
line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True
[flake8]
ignore = E203, E501, E741, W503, W605
max-line-length = 119
[tool:pytest]
doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
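isort ends up skipped (per the commit message it disagrees with stub.py), but the section above documents the intended layout: standard library, then the listed third-party packages, then first-party code, 119-character lines, and two blank lines after the import block. A hypothetical module header laid out under those settings:

# Hypothetical example of the import layout this isort configuration describes:
# stdlib first, then declared third-party packages, then first-party code,
# alphabetized within each group, with two blank lines after the imports.
import json
import os

import numpy
from tqdm import tqdm

from tokenizers import Tokenizer, decoders, pre_tokenizers


def main():
    print(json, os, numpy, tqdm, Tokenizer, decoders, pre_tokenizers)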

View File

@ -1,8 +1,9 @@
from setuptools import setup
from setuptools_rust import Binding, RustExtension
extras = {}
extras["testing"] = ["pytest", "requests", "numpy", "datasets"]
extras["testing"] = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
extras["docs"] = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
extras["dev"] = extras["testing"]

View File

@ -1,9 +1,11 @@
import argparse
import inspect
import os
import argparse
import black
from pathlib import Path
import black
INDENT = " " * 4
GENERATED_COMMENT = "# Generated content DO NOT EDIT\n"
@ -122,8 +124,8 @@ def py_file(module, origin):
def do_black(content, is_pyi):
mode = black.Mode(
target_versions={black.TargetVersion.PY36},
line_length=100,
target_versions={black.TargetVersion.PY35},
line_length=119,
is_pyi=is_pyi,
string_normalization=True,
experimental_string_processing=False,
@ -135,9 +137,7 @@ def do_black(content, is_pyi):
def write(module, directory, origin, check=False):
submodules = [
(name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member)
]
submodules = [(name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member)]
filename = os.path.join(directory, "__init__.pyi")
pyi_content = pyi_file(module)
@ -146,9 +146,7 @@ def write(module, directory, origin, check=False):
if check:
with open(filename, "r") as f:
data = f.read()
assert (
data == pyi_content
), f"The content of {filename} seems outdated, please run `python stub.py`"
assert data == pyi_content, f"The content of {filename} seems outdated, please run `python stub.py`"
else:
with open(filename, "w") as f:
f.write(pyi_content)
@ -171,9 +169,7 @@ def write(module, directory, origin, check=False):
if check:
with open(filename, "r") as f:
data = f.read()
assert (
data == py_content
), f"The content of {filename} seems outdated, please run `python stub.py`"
assert data == py_content, f"The content of {filename} seems outdated, please run `python stub.py`"
else:
with open(filename, "w") as f:
f.write(py_content)
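The hunk above only shows the black.Mode construction inside do_black; presumably the generated source is then passed through black.format_str. A minimal sketch of that call with the new 119-column / py35 settings (the sample content string is made up):

import black

# Mirror the Mode built in stub.py after this commit.
mode = black.Mode(
    target_versions={black.TargetVersion.PY35},
    line_length=119,
    is_pyi=False,
)

content = "def enable_padding(direction='right',pad_id=0,pad_type_id=0,pad_token='[PAD]',length=None):\n    pass\n"
print(black.format_str(content, mode=mode))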

View File

@ -1,8 +1,9 @@
import pytest
import pickle
import json
import pickle
from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder, CTC, Sequence
import pytest
from tokenizers.decoders import CTC, BPEDecoder, ByteLevel, Decoder, Metaspace, Sequence, WordPiece
class TestByteLevel:
@ -93,10 +94,7 @@ class TestBPEDecoder:
def test_decoding(self):
decoder = BPEDecoder()
assert (
decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"])
== "My name is John"
)
assert decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"]) == "My name is John"
decoder = BPEDecoder(suffix="_")
assert decoder.decode(["My_", "na", "me_", "is_", "Jo", "hn_"]) == "My name is John"
@ -121,16 +119,12 @@ class TestCTCDecoder:
def test_decoding(self):
decoder = CTC()
assert (
decoder.decode(
["<pad>", "<pad>", "h", "e", "e", "l", "l", "<pad>", "l", "o", "o", "o", "<pad>"]
)
decoder.decode(["<pad>", "<pad>", "h", "e", "e", "l", "l", "<pad>", "l", "o", "o", "o", "<pad>"])
== "hello"
)
decoder = CTC(pad_token="[PAD]")
assert (
decoder.decode(
["[PAD]", "[PAD]", "h", "e", "e", "l", "l", "[PAD]", "l", "o", "o", "o", "[PAD]"]
)
decoder.decode(["[PAD]", "[PAD]", "h", "e", "e", "l", "l", "[PAD]", "l", "o", "o", "o", "[PAD]"])
== "hello"
)

View File

@ -1,8 +1,9 @@
import pytest
from ..utils import data_dir, bert_files
from tokenizers import BertWordPieceTokenizer
from ..utils import bert_files, data_dir
class TestEncoding:
@pytest.fixture(scope="class")

View File

@ -1,9 +1,10 @@
import pytest
import pickle
from ..utils import data_dir, roberta_files, bert_files
import pytest
from tokenizers.models import Model, BPE, WordPiece, WordLevel
from tokenizers.models import BPE, Model, WordLevel, WordPiece
from ..utils import bert_files, data_dir, roberta_files
class TestBPE:

View File

@ -1,9 +1,10 @@
import pickle
import pytest
from tokenizers import Tokenizer, NormalizedString
from tokenizers import NormalizedString, Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer, BertNormalizer, Sequence, Lowercase, Strip
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip
class TestBertNormalizer:
@ -13,41 +14,31 @@ class TestBertNormalizer:
assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)
def test_strip_accents(self):
normalizer = BertNormalizer(
strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
)
normalizer = BertNormalizer(strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False)
output = normalizer.normalize_str("Héllò")
assert output == "Hello"
def test_handle_chinese_chars(self):
normalizer = BertNormalizer(
strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False
)
normalizer = BertNormalizer(strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False)
output = normalizer.normalize_str("你好")
assert output == " 你 好 "
def test_clean_text(self):
normalizer = BertNormalizer(
strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True
)
normalizer = BertNormalizer(strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True)
output = normalizer.normalize_str("\ufeffHello")
assert output == "Hello"
def test_lowercase(self):
normalizer = BertNormalizer(
strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False
)
normalizer = BertNormalizer(strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False)
output = normalizer.normalize_str("Héllò")
assert output == "héllò"
def test_can_modify(self):
normalizer = BertNormalizer(
clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True
)
normalizer = BertNormalizer(clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True)
assert normalizer.clean_text == True
assert normalizer.handle_chinese_chars == True
@ -151,9 +142,7 @@ class TestCustomNormalizer:
with pytest.raises(Exception, match="TypeError:.*normalize()"):
bad.normalize_str("Hey there!")
assert good.normalize_str("Hey there!") == "Hey you!"
with pytest.raises(
Exception, match="Cannot use a NormalizedStringRefMut outside `normalize`"
):
with pytest.raises(Exception, match="Cannot use a NormalizedStringRefMut outside `normalize`"):
good_custom.use_after_normalize()
def test_normalizer_interface(self):

View File

@ -1,20 +1,21 @@
import pytest
import pickle
import json
import pickle
import pytest
from tokenizers.pre_tokenizers import (
PreTokenizer,
ByteLevel,
Whitespace,
WhitespaceSplit,
BertPreTokenizer,
Metaspace,
ByteLevel,
CharDelimiterSplit,
Digits,
Metaspace,
PreTokenizer,
Punctuation,
Sequence,
Digits,
UnicodeScripts,
Split,
UnicodeScripts,
Whitespace,
WhitespaceSplit,
)

View File

@ -1,21 +1,22 @@
import pytest
import pickle
import json
import pickle
from ..utils import data_dir, roberta_files
import pytest
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
from tokenizers.processors import (
PostProcessor,
BertProcessing,
RobertaProcessing,
ByteLevel,
TemplateProcessing,
PostProcessor,
RobertaProcessing,
Sequence,
TemplateProcessing,
)
from ..utils import data_dir, roberta_files
class TestBertProcessing:
def test_instantiate(self):

View File

@ -1,19 +1,16 @@
import numpy as np
import pickle
import pytest
from ..utils import (
data_dir,
roberta_files,
bert_files,
multiprocessing_with_parallelism,
)
from tokenizers import AddedToken, Tokenizer, Encoding
from tokenizers.models import Model, BPE, WordPiece
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import RobertaProcessing, BertProcessing
from tokenizers.normalizers import Lowercase
import numpy as np
import pytest
from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, WordPiece
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import BertProcessing, RobertaProcessing
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files
class TestAddedToken:
@ -22,8 +19,7 @@ class TestAddedToken:
assert type(added_token) == AddedToken
assert str(added_token) == "<mask>"
assert (
repr(added_token)
== 'AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=True)'
repr(added_token) == 'AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=True)'
)
assert added_token.rstrip == False
assert added_token.lstrip == False

View File

@ -1,17 +1,19 @@
import os
import pytest
import copy
import os
import pickle
import pytest
from tokenizers import (
SentencePieceUnigramTokenizer,
AddedToken,
models,
pre_tokenizers,
normalizers,
SentencePieceUnigramTokenizer,
Tokenizer,
models,
normalizers,
pre_tokenizers,
trainers,
)
from ..utils import data_dir, train_files
@ -63,15 +65,13 @@ class TestBpeTrainer:
trainers.BpeTrainer(min_frequency=12).__getstate__()
== b"""{"BpeTrainer":{"min_frequency":12,"vocab_size":30000,"show_progress":true,"special_tokens":[],"limit_alphabet":null,"initial_alphabet":[],"continuing_subword_prefix":null,"end_of_word_suffix":null,"words":{}}}"""
)
assert isinstance(
pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12))), trainers.BpeTrainer
)
assert isinstance(pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12))), trainers.BpeTrainer)
assert isinstance(copy.deepcopy(trainers.BpeTrainer(min_frequency=12)), trainers.BpeTrainer)
# Make sure everything is correct
assert pickle.dumps(
pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12)))
) == pickle.dumps(trainers.BpeTrainer(min_frequency=12))
assert pickle.dumps(pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12)))) == pickle.dumps(
trainers.BpeTrainer(min_frequency=12)
)
class TestWordPieceTrainer:
@ -118,9 +118,7 @@ class TestWordPieceTrainer:
assert trainer.continuing_subword_prefix == None
def test_can_pickle(self):
assert isinstance(
pickle.loads(pickle.dumps(trainers.WordPieceTrainer())), trainers.WordPieceTrainer
)
assert isinstance(pickle.loads(pickle.dumps(trainers.WordPieceTrainer())), trainers.WordPieceTrainer)
class TestWordLevelTrainer:
@ -148,9 +146,7 @@ class TestWordLevelTrainer:
assert trainer.special_tokens == []
def test_can_pickle(self):
assert isinstance(
pickle.loads(pickle.dumps(trainers.WordLevelTrainer())), trainers.WordLevelTrainer
)
assert isinstance(pickle.loads(pickle.dumps(trainers.WordLevelTrainer())), trainers.WordLevelTrainer)
class TestUnigram:
@ -184,9 +180,7 @@ class TestUnigram:
bpe_tokenizer.train([train_files["small"]], trainer=trainer)
def test_can_pickle(self):
assert isinstance(
pickle.loads(pickle.dumps(trainers.UnigramTrainer())), trainers.UnigramTrainer
)
assert isinstance(pickle.loads(pickle.dumps(trainers.UnigramTrainer())), trainers.UnigramTrainer)
def test_train_with_special_tokens(self):
filename = "tests/data/dummy-unigram-special_tokens-train.txt"

View File

@ -1,6 +1,7 @@
from ..utils import data_dir, doc_wiki_tokenizer, doc_pipeline_bert_tokenizer
from tokenizers import Tokenizer
from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
disable_printing = True
original_print = print
@ -112,7 +113,7 @@ class TestPipeline:
# END bert_setup_tokenizer
# START bert_setup_normalizer
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.normalizers import NFD, Lowercase, StripAccents
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
# END bert_setup_normalizer
@ -136,9 +137,7 @@ class TestPipeline:
# START bert_train_tokenizer
from tokenizers.trainers import WordPieceTrainer
trainer = WordPieceTrainer(
vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)
trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
bert_tokenizer.train(files, trainer)
@ -171,9 +170,9 @@ class TestPipeline:
if __name__ == "__main__":
import os
from urllib import request
from zipfile import ZipFile
import os
disable_printing = False
if not os.path.isdir("data/wikitext-103-raw"):

View File

@ -1,8 +1,10 @@
from ..utils import data_dir, doc_wiki_tokenizer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from ..utils import data_dir, doc_wiki_tokenizer
disable_printing = True
original_print = print
@ -181,9 +183,9 @@ class TestQuicktour:
if __name__ == "__main__":
import os
from urllib import request
from zipfile import ZipFile
import os
disable_printing = False
if not os.path.isdir("data/wikitext-103-raw"):

View File

@ -1,15 +1,17 @@
from ..utils import data_dir, train_files
import os
import pytest
import datasets
import gzip
import os
import datasets
import pytest
from ..utils import data_dir, train_files
class TestTrainFromIterators:
@staticmethod
def get_tokenizer_trainer():
# START init_tokenizer_trainer
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
@ -31,9 +33,7 @@ class TestTrainFromIterators:
# START load_dataset
import datasets
dataset = datasets.load_dataset(
"wikitext", "wikitext-103-raw-v1", split="train+test+validation"
)
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
# END load_dataset
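After loading the dataset as above, the usual next step is to stream batches of text into Tokenizer.train_from_iterator; a hedged sketch of that loop (the batch size, vocabulary size, and the "text" column name are assumptions):

import datasets

from tokenizers import Tokenizer, models, pre_tokenizers, trainers

dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")

tokenizer = Tokenizer(models.Unigram())
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
trainer = trainers.UnigramTrainer(vocab_size=20000, special_tokens=["<unk>"], unk_token="<unk>")

def batch_iterator(batch_size=1000):
    # Yield lists of raw lines; slicing a datasets.Dataset returns a dict of columns.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i : i + batch_size]["text"]

tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))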
@pytest.fixture(scope="class")

View File

@ -1,7 +1,7 @@
import pytest
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
from tokenizers.implementations import BaseTokenizer
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, processors, decoders
class TestBaseTokenizer:

View File

@ -1,8 +1,9 @@
import pytest
from ..utils import data_dir, bert_files, multiprocessing_with_parallelism
from tokenizers import BertWordPieceTokenizer
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
class TestBertWordPieceTokenizer:
def test_basic_encode(self, bert_files):

View File

@ -1,8 +1,9 @@
import pytest
from ..utils import data_dir, roberta_files, multiprocessing_with_parallelism
from tokenizers import ByteLevelBPETokenizer
from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files
class TestByteLevelBPE:
def test_basic_encode(self, roberta_files):

View File

@ -1,8 +1,9 @@
import pytest
from ..utils import data_dir, openai_files, multiprocessing_with_parallelism
from tokenizers import CharBPETokenizer
from ..utils import data_dir, multiprocessing_with_parallelism, openai_files
class TestCharBPETokenizer:
def test_basic_encode(self, openai_files):
@ -33,9 +34,7 @@ class TestCharBPETokenizer:
assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
def test_lowercase(self, openai_files):
tokenizer = CharBPETokenizer.from_file(
openai_files["vocab"], openai_files["merges"], lowercase=True
)
tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
assert output.ids == [547, 1362, 544, 2476, 2688]
assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
@ -43,9 +42,7 @@ class TestCharBPETokenizer:
assert output.type_ids == [0, 0, 0, 0, 1]
def test_decoding(self, openai_files):
tokenizer = CharBPETokenizer.from_file(
openai_files["vocab"], openai_files["merges"], lowercase=True
)
tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
assert decoded == "my name is john"

View File

@ -1,6 +1,6 @@
import os
import pytest
import pytest
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
@ -35,9 +35,7 @@ class TestSentencePieceUnigram:
p.write("A first sentence\nAnother sentence\nAnd a last one")
tokenizer = SentencePieceUnigramTokenizer()
tokenizer.train(
files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
)
tokenizer.train(files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>")
output = tokenizer.encode("A sentence 🤗")
assert output.ids[-1] == 0
assert output.tokens == ["▁A", "", "s", "en", "t", "en", "c", "e", "", "🤗"]

View File

@ -1,11 +1,14 @@
from tokenizers import Tokenizer
import json
import os
import unittest
from .utils import data_dir, albert_base
import json
from huggingface_hub import HfApi, hf_hub_url, cached_download
import tqdm
from huggingface_hub import HfApi, cached_download, hf_hub_url
from tokenizers import Tokenizer
from .utils import albert_base, data_dir
class TestSerialization:
def test_full_serialization_albert(self, albert_base):

View File

@ -1,8 +1,11 @@
import multiprocessing as mp
import os
import requests
import pytest
import requests
DATA_PATH = os.path.join("tests", "data")
@ -29,33 +32,23 @@ def data_dir():
@pytest.fixture(scope="session")
def roberta_files(data_dir):
return {
"vocab": download(
"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"
),
"merges": download(
"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"
),
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"),
"merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"),
}
@pytest.fixture(scope="session")
def bert_files(data_dir):
return {
"vocab": download(
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
),
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"),
}
@pytest.fixture(scope="session")
def openai_files(data_dir):
return {
"vocab": download(
"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"
),
"merges": download(
"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
),
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"),
"merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"),
}
@ -77,9 +70,7 @@ def train_files(data_dir):
@pytest.fixture(scope="session")
def albert_base(data_dir):
return download(
"https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json"
)
return download("https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json")
@pytest.fixture(scope="session")