Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Updating python formatting. (#1079)

* Updating python formatting.
* Forgot gh action.
* Skipping isort to prevent circular imports.
* Updating stub.
* Removing `isort` (it contradicts `stub.py`).
* Fixing weird stub black/isort disagreement.
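For context: the change standardizes `black` across the Python bindings on a line length of 119 with target version py35, moves the pinned formatter into the package extras (`black==22.3`), and drops `isort` because its ordering conflicted with the output generated by `stub.py`. A minimal sketch of the equivalent programmatic call, using the same `black.Mode` fields that appear in the `stub.py` hunk below (the sample snippet and `is_pyi=False` are illustrative assumptions, not part of the commit):

    import black

    # Same knobs as stub.py's do_black() after this commit (assuming black==22.3).
    mode = black.Mode(
        target_versions={black.TargetVersion.PY35},
        line_length=119,
        is_pyi=False,  # stub.py passes the flag through; presumably True for the generated .pyi
        string_normalization=True,
    )
    print(black.format_str("x = {  'a':1 }", mode=mode))  # -> x = {"a": 1}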
3  .github/workflows/python.yml  (vendored)
@@ -102,13 +102,12 @@ jobs:
source .env/bin/activate
pip install -U pip
pip install pytest requests setuptools_rust numpy pyarrow datasets
python setup.py develop
pip install -e .[dev]

- name: Check style
working-directory: ./bindings/python
run: |
source .env/bin/activate
pip install black==20.8b1 click==8.0.4
make check-style

- name: Run tests
@@ -3,16 +3,17 @@
DATA_DIR = data

dir_guard=@mkdir -p $(@D)
check_dirs := examples py_src/tokenizers tests

# Format source code automatically
style:
python stub.py
black --line-length 100 --target-version py36 examples py_src/tokenizers tests
black --line-length 119 --target-version py35 $(check_dirs)

# Check the source code is formatted correctly
check-style:
python stub.py --check
black --check --line-length 100 --target-version py36 examples py_src/tokenizers tests
black --check --line-length 119 --target-version py35 examples py_src/tokenizers tests

TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
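Usage note (an inference from the targets above, run from bindings/python): `make style` regenerates the stub via `python stub.py` and rewrites files with `black`, while `make check-style` is the non-mutating variant that the workflow's "Check style" step invokes.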
@@ -1,12 +1,11 @@
import jieba

from typing import List

from tokenizers import Tokenizer, Regex, NormalizedString, PreTokenizedString
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer
import jieba
from tokenizers import NormalizedString, PreTokenizedString, Regex, Tokenizer
from tokenizers.decoders import Decoder
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import PreTokenizer


class JiebaPreTokenizer:
@@ -21,9 +20,7 @@ class JiebaPreTokenizer:
# We can also easily do it in one line:
# return [normalized_string[w[1] : w[2]] for w in jieba.tokenize(str(normalized_string))]

def odd_number_split(
self, i: int, normalized_string: NormalizedString
) -> List[NormalizedString]:
def odd_number_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
# Just an odd example...
splits = []
last = 0
@@ -1,18 +1,19 @@
import time
import argparse
import logging
import time

from tqdm import tqdm

import logging

logging.getLogger("transformers").disabled = True
logging.getLogger("transformers.tokenization_utils").disabled = True

from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE, WordPiece
from tokenizers.processors import BertProcessing
from tokenizers.normalizers import BertNormalizer
from tokenizers.processors import BertProcessing
from transformers import BertTokenizer, GPT2Tokenizer

from transformers import GPT2Tokenizer, BertTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")
@@ -3,6 +3,7 @@ import glob

from tokenizers import BertWordPieceTokenizer


parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
@@ -19,9 +20,7 @@ parser.add_argument(
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument(
"--name", default="bert-wordpiece", type=str, help="The name of the output vocab files"
)
parser.add_argument("--name", default="bert-wordpiece", type=str, help="The name of the output vocab files")
args = parser.parse_args()

files = glob.glob(args.files)
@@ -4,6 +4,7 @@ from os.path import join

from tokenizers import ByteLevelBPETokenizer


parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
@@ -20,9 +21,7 @@ parser.add_argument(
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument(
"--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files"
)
parser.add_argument("--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files")
args = parser.parse_args()

files = glob.glob(args.files)
@@ -1,5 +1,7 @@
import datasets
from tokenizers import normalizers, pre_tokenizers, Tokenizer, models, trainers

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers


# Build a tokenizer
bpe_tokenizer = Tokenizer(models.BPE())
@@ -1,7 +1,8 @@
__version__ = "0.13.1.dev0"

from typing import Tuple, Union, Tuple, List
from enum import Enum
from typing import List, Tuple, Union


Offsets = Tuple[int, int]

@@ -77,25 +78,24 @@ class SplitDelimiterBehavior(Enum):


from .tokenizers import (
Tokenizer,
Encoding,
AddedToken,
Regex,
Encoding,
NormalizedString,
PreTokenizedString,
Regex,
Token,
Tokenizer,
decoders,
models,
normalizers,
pre_tokenizers,
processors,
trainers,
)
from .tokenizers import decoders
from .tokenizers import models
from .tokenizers import normalizers
from .tokenizers import pre_tokenizers
from .tokenizers import processors
from .tokenizers import trainers

from .implementations import (
BertWordPieceTokenizer,
ByteLevelBPETokenizer,
CharBPETokenizer,
SentencePieceBPETokenizer,
SentencePieceUnigramTokenizer,
BertWordPieceTokenizer,
)
@@ -709,13 +709,7 @@ class Tokenizer:
"""
pass
def enable_padding(
self,
direction="right",
pad_id=0,
pad_type_id=0,
pad_token="[PAD]",
length=None,
pad_to_multiple_of=None,
self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None
):
"""
Enable the padding
@@ -1,5 +1,6 @@
from .. import decoders


Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
@@ -1,6 +1,6 @@
from .base_tokenizer import BaseTokenizer
from .bert_wordpiece import BertWordPieceTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .char_level_bpe import CharBPETokenizer
from .sentencepiece_bpe import SentencePieceBPETokenizer
from .sentencepiece_unigram import SentencePieceUnigramTokenizer
from .bert_wordpiece import BertWordPieceTokenizer
@ -1,11 +1,12 @@
|
||||
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from tokenizers import AddedToken, EncodeInput, Encoding, InputSequence, Tokenizer
|
||||
from tokenizers.decoders import Decoder
|
||||
from tokenizers.models import Model
|
||||
from tokenizers.normalizers import Normalizer
|
||||
from tokenizers.pre_tokenizers import PreTokenizer
|
||||
from tokenizers.processors import PostProcessor
|
||||
from tokenizers.decoders import Decoder
|
||||
|
||||
from typing import List, Union, Tuple, Optional, Dict
|
||||
|
||||
Offsets = Tuple[int, int]
|
||||
|
||||
@ -109,9 +110,7 @@ class BaseTokenizer:
|
||||
"""
|
||||
return self._tokenizer.padding
|
||||
|
||||
def enable_truncation(
|
||||
self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
|
||||
):
|
||||
def enable_truncation(self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"):
|
||||
"""Change the truncation options
|
||||
|
||||
Args:
|
||||
@ -270,9 +269,7 @@ class BaseTokenizer:
|
||||
|
||||
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
|
||||
|
||||
def decode_batch(
|
||||
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
|
||||
) -> str:
|
||||
def decode_batch(self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True) -> str:
|
||||
"""Decode the list of sequences to a list of string sequences
|
||||
|
||||
Args:
|
||||
|
@ -1,11 +1,12 @@
|
||||
from tokenizers import Tokenizer, AddedToken, decoders, trainers
|
||||
from typing import Dict, Iterator, List, Optional, Union
|
||||
|
||||
from tokenizers import AddedToken, Tokenizer, decoders, trainers
|
||||
from tokenizers.models import WordPiece
|
||||
from tokenizers.normalizers import BertNormalizer
|
||||
from tokenizers.pre_tokenizers import BertPreTokenizer
|
||||
from tokenizers.processors import BertProcessing
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
from typing import Optional, List, Union, Dict, Iterator
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class BertWordPieceTokenizer(BaseTokenizer):
|
||||
@ -59,9 +60,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
|
||||
if cls_token_id is None:
|
||||
raise TypeError("cls_token not found in the vocabulary")
|
||||
|
||||
tokenizer.post_processor = BertProcessing(
|
||||
(str(sep_token), sep_token_id), (str(cls_token), cls_token_id)
|
||||
)
|
||||
tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
|
||||
tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)
|
||||
|
||||
parameters = {
|
||||
|
@ -1,16 +1,10 @@
|
||||
from tokenizers import (
|
||||
Tokenizer,
|
||||
AddedToken,
|
||||
pre_tokenizers,
|
||||
decoders,
|
||||
trainers,
|
||||
processors,
|
||||
)
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
||||
|
||||
from typing import Optional, List, Union, Dict, Tuple, Iterator
|
||||
from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, processors, trainers
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str
|
||||
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class ByteLevelBPETokenizer(BaseTokenizer):
|
||||
|
@ -1,14 +1,9 @@
|
||||
from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
|
||||
from ..models import BPE
|
||||
from ..normalizers import (
|
||||
Sequence,
|
||||
Lowercase,
|
||||
unicode_normalizer_from_str,
|
||||
BertNormalizer,
|
||||
)
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
||||
|
||||
from typing import Optional, List, Union, Dict, Tuple, Iterator
|
||||
from .. import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
|
||||
from ..models import BPE
|
||||
from ..normalizers import BertNormalizer, Lowercase, Sequence, unicode_normalizer_from_str
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class CharBPETokenizer(BaseTokenizer):
|
||||
|
@ -1,9 +1,10 @@
|
||||
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
|
||||
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
||||
|
||||
from tokenizers import AddedToken, Tokenizer, decoders, pre_tokenizers, trainers
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import NFKC
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
from typing import Optional, List, Union, Dict, Tuple, Iterator
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class SentencePieceBPETokenizer(BaseTokenizer):
|
||||
@ -23,9 +24,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
|
||||
fuse_unk: Optional[bool] = False,
|
||||
):
|
||||
if vocab is not None and merges is not None:
|
||||
tokenizer = Tokenizer(
|
||||
BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk)
|
||||
)
|
||||
tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
|
||||
else:
|
||||
tokenizer = Tokenizer(BPE())
|
||||
|
||||
@ -33,12 +32,8 @@ class SentencePieceBPETokenizer(BaseTokenizer):
|
||||
tokenizer.add_special_tokens([str(unk_token)])
|
||||
|
||||
tokenizer.normalizer = NFKC()
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
tokenizer.decoder = decoders.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||
|
||||
parameters = {
|
||||
"model": "SentencePieceBPE",
|
||||
|
@ -1,10 +1,11 @@
|
||||
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, normalizers, Regex
|
||||
import os
|
||||
from tokenizers.models import Unigram
|
||||
import json
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
import os
|
||||
from typing import Iterator, List, Optional, Union
|
||||
|
||||
from typing import Optional, List, Union, Iterator
|
||||
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
|
||||
from tokenizers.models import Unigram
|
||||
|
||||
from .base_tokenizer import BaseTokenizer
|
||||
|
||||
|
||||
class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
@ -28,12 +29,8 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
tokenizer.normalizer = normalizers.Sequence(
|
||||
[normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
|
||||
)
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
tokenizer.decoder = decoders.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||
|
||||
parameters = {
|
||||
"model": "SentencePieceUnigram",
|
||||
@ -181,12 +178,8 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
normalizers.Replace(Regex(" {2,}"), " "),
|
||||
]
|
||||
)
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
tokenizer.decoder = decoders.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||
|
||||
parameters = {
|
||||
"model": "SentencePieceUnigram",
|
||||
|
@ -1,5 +1,6 @@
|
||||
from .. import normalizers
|
||||
|
||||
|
||||
Normalizer = normalizers.Normalizer
|
||||
BertNormalizer = normalizers.BertNormalizer
|
||||
NFD = normalizers.NFD
|
||||
@ -21,9 +22,7 @@ NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
|
||||
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
|
||||
if normalizer not in NORMALIZERS:
|
||||
raise ValueError(
|
||||
"{} is not a known unicode normalizer. Available are {}".format(
|
||||
normalizer, NORMALIZERS.keys()
|
||||
)
|
||||
"{} is not a known unicode normalizer. Available are {}".format(normalizer, NORMALIZERS.keys())
|
||||
)
|
||||
|
||||
return NORMALIZERS[normalizer]()
|
||||
|
@ -63,9 +63,7 @@ class BertNormalizer(Normalizer):
|
||||
Whether to lowercase.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True
|
||||
):
|
||||
def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True):
|
||||
pass
|
||||
def normalize(self, normalized):
|
||||
"""
|
||||
|
@ -1 +1 @@
|
||||
from .visualizer import EncodingVisualizer, Annotation
|
||||
from .visualizer import Annotation, EncodingVisualizer
|
||||
|
@ -1,11 +1,11 @@
|
||||
import os
|
||||
import itertools
|
||||
import os
|
||||
import re
|
||||
from typing import List, Optional, Tuple, Dict, Callable, Any, NamedTuple
|
||||
from string import Template
|
||||
from typing import List
|
||||
from typing import Any, Callable, Dict, List, NamedTuple, Optional, Tuple
|
||||
|
||||
from tokenizers import Encoding, Tokenizer
|
||||
|
||||
from tokenizers import Tokenizer, Encoding
|
||||
|
||||
dirname = os.path.dirname(__file__)
|
||||
css_filename = os.path.join(dirname, "visualizer-styles.css")
|
||||
@ -91,7 +91,7 @@ class EncodingVisualizer:
|
||||
):
|
||||
if default_to_notebook:
|
||||
try:
|
||||
from IPython.core.display import display, HTML
|
||||
from IPython.core.display import HTML, display
|
||||
except ImportError as e:
|
||||
raise Exception(
|
||||
"""We couldn't import IPython utils for html display.
|
||||
@ -135,7 +135,7 @@ class EncodingVisualizer:
|
||||
final_default_to_notebook = default_to_notebook
|
||||
if final_default_to_notebook:
|
||||
try:
|
||||
from IPython.core.display import display, HTML
|
||||
from IPython.core.display import HTML, display
|
||||
except ImportError as e:
|
||||
raise Exception(
|
||||
"""We couldn't import IPython utils for html display.
|
||||
@ -174,9 +174,7 @@ class EncodingVisualizer:
|
||||
h = 10
|
||||
colors = {}
|
||||
|
||||
for label in sorted(
|
||||
labels
|
||||
): # sort so we always get the same colors for a given set of labels
|
||||
for label in sorted(labels): # sort so we always get the same colors for a given set of labels
|
||||
colors[label] = f"hsl({h},{s}%,{l}%"
|
||||
h += h_step
|
||||
return colors
|
||||
@ -234,10 +232,7 @@ class EncodingVisualizer:
|
||||
else:
|
||||
# Like above, but a different color so we can see the tokens alternate
|
||||
css_classes.append("even-token")
|
||||
if (
|
||||
EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix])
|
||||
is not None
|
||||
):
|
||||
if EncodingVisualizer.unk_token_regex.search(encoding.tokens[first.token_ix]) is not None:
|
||||
# This is a special token that is in the text. probably UNK
|
||||
css_classes.append("special-token")
|
||||
# TODO is this the right name for the data attribute ?
|
||||
@ -289,9 +284,7 @@ class EncodingVisualizer:
|
||||
anno = annotations[cur_anno_ix]
|
||||
label = anno.label
|
||||
color = label_colors_dict[label]
|
||||
spans.append(
|
||||
f'<span class="annotation" style="color:{color}" data-label="{label}">'
|
||||
)
|
||||
spans.append(f'<span class="annotation" style="color:{color}" data-label="{label}">')
|
||||
prev_anno_ix = cur_anno_ix
|
||||
|
||||
if cs.partition_key() == current_consecutive_chars[0].partition_key():
|
||||
@ -342,9 +335,7 @@ class EncodingVisualizer:
|
||||
return annotation_map
|
||||
|
||||
@staticmethod
|
||||
def __make_char_states(
|
||||
text: str, encoding: Encoding, annotations: AnnotationList
|
||||
) -> List[CharState]:
|
||||
def __make_char_states(text: str, encoding: Encoding, annotations: AnnotationList) -> List[CharState]:
|
||||
"""
|
||||
For each character in the original text, we emit a tuple representing it's "state":
|
||||
|
||||
|
@@ -3,5 +3,5 @@ requires = ["setuptools", "wheel", "setuptools-rust"]
build-backend = "setuptools.build_meta"

[tool.black]
target-version = ['py36']
line-length = 100
target-version = ['py35']
line-length = 119
54  bindings/python/setup.cfg  (new file)
@@ -0,0 +1,54 @@
[isort]
default_section = FIRSTPARTY
ensure_newline_before_comments = True
force_grid_wrap = 0
include_trailing_comma = True
known_first_party = transformers
known_third_party =
absl
conllu
datasets
elasticsearch
fairseq
faiss-cpu
fastprogress
fire
fugashi
git
h5py
matplotlib
nltk
numpy
packaging
pandas
PIL
psutil
pytest
pytorch_lightning
rouge_score
sacrebleu
seqeval
sklearn
streamlit
tensorboardX
tensorflow
tensorflow_datasets
timeout_decorator
torch
torchaudio
torchtext
torchvision
torch_xla
tqdm

line_length = 119
lines_after_imports = 2
multi_line_output = 3
use_parentheses = True

[flake8]
ignore = E203, E501, E741, W503, W605
max-line-length = 119

[tool:pytest]
doctest_optionflags=NUMBER NORMALIZE_WHITESPACE ELLIPSIS
@@ -1,8 +1,9 @@
from setuptools import setup
from setuptools_rust import Binding, RustExtension


extras = {}
extras["testing"] = ["pytest", "requests", "numpy", "datasets"]
extras["testing"] = ["pytest", "requests", "numpy", "datasets", "black==22.3"]
extras["docs"] = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"]
extras["dev"] = extras["testing"]
@@ -1,9 +1,11 @@
import argparse
import inspect
import os
import argparse
import black
from pathlib import Path

import black


INDENT = " " * 4
GENERATED_COMMENT = "# Generated content DO NOT EDIT\n"

@@ -122,8 +124,8 @@ def py_file(module, origin):

def do_black(content, is_pyi):
mode = black.Mode(
target_versions={black.TargetVersion.PY36},
line_length=100,
target_versions={black.TargetVersion.PY35},
line_length=119,
is_pyi=is_pyi,
string_normalization=True,
experimental_string_processing=False,
@@ -135,9 +137,7 @@ def do_black(content, is_pyi):


def write(module, directory, origin, check=False):
submodules = [
(name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member)
]
submodules = [(name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member)]

filename = os.path.join(directory, "__init__.pyi")
pyi_content = pyi_file(module)
@@ -146,9 +146,7 @@ def write(module, directory, origin, check=False):
if check:
with open(filename, "r") as f:
data = f.read()
assert (
data == pyi_content
), f"The content of {filename} seems outdated, please run `python stub.py`"
assert data == pyi_content, f"The content of {filename} seems outdated, please run `python stub.py`"
else:
with open(filename, "w") as f:
f.write(pyi_content)
@@ -171,9 +169,7 @@ def write(module, directory, origin, check=False):
if check:
with open(filename, "r") as f:
data = f.read()
assert (
data == py_content
), f"The content of {filename} seems outdated, please run `python stub.py`"
assert data == py_content, f"The content of {filename} seems outdated, please run `python stub.py`"
else:
with open(filename, "w") as f:
f.write(py_content)
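Usage note (inferred from the assertion messages above): `python stub.py` regenerates the committed stubs with the `do_black` settings, while `python stub.py --check` only compares the files on disk against freshly generated content and fails with the "seems outdated" message when they drift; this is the mode `make check-style` calls.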
@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
import pickle
|
||||
import json
|
||||
import pickle
|
||||
|
||||
from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder, CTC, Sequence
|
||||
import pytest
|
||||
|
||||
from tokenizers.decoders import CTC, BPEDecoder, ByteLevel, Decoder, Metaspace, Sequence, WordPiece
|
||||
|
||||
|
||||
class TestByteLevel:
|
||||
@ -93,10 +94,7 @@ class TestBPEDecoder:
|
||||
|
||||
def test_decoding(self):
|
||||
decoder = BPEDecoder()
|
||||
assert (
|
||||
decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"])
|
||||
== "My name is John"
|
||||
)
|
||||
assert decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"]) == "My name is John"
|
||||
decoder = BPEDecoder(suffix="_")
|
||||
assert decoder.decode(["My_", "na", "me_", "is_", "Jo", "hn_"]) == "My name is John"
|
||||
|
||||
@ -121,16 +119,12 @@ class TestCTCDecoder:
|
||||
def test_decoding(self):
|
||||
decoder = CTC()
|
||||
assert (
|
||||
decoder.decode(
|
||||
["<pad>", "<pad>", "h", "e", "e", "l", "l", "<pad>", "l", "o", "o", "o", "<pad>"]
|
||||
)
|
||||
decoder.decode(["<pad>", "<pad>", "h", "e", "e", "l", "l", "<pad>", "l", "o", "o", "o", "<pad>"])
|
||||
== "hello"
|
||||
)
|
||||
decoder = CTC(pad_token="[PAD]")
|
||||
assert (
|
||||
decoder.decode(
|
||||
["[PAD]", "[PAD]", "h", "e", "e", "l", "l", "[PAD]", "l", "o", "o", "o", "[PAD]"]
|
||||
)
|
||||
decoder.decode(["[PAD]", "[PAD]", "h", "e", "e", "l", "l", "[PAD]", "l", "o", "o", "o", "[PAD]"])
|
||||
== "hello"
|
||||
)
|
||||
|
||||
|
@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
from ..utils import data_dir, bert_files
|
||||
|
||||
from tokenizers import BertWordPieceTokenizer
|
||||
|
||||
from ..utils import bert_files, data_dir
|
||||
|
||||
|
||||
class TestEncoding:
|
||||
@pytest.fixture(scope="class")
|
||||
|
@ -1,9 +1,10 @@
|
||||
import pytest
|
||||
import pickle
|
||||
|
||||
from ..utils import data_dir, roberta_files, bert_files
|
||||
import pytest
|
||||
|
||||
from tokenizers.models import Model, BPE, WordPiece, WordLevel
|
||||
from tokenizers.models import BPE, Model, WordLevel, WordPiece
|
||||
|
||||
from ..utils import bert_files, data_dir, roberta_files
|
||||
|
||||
|
||||
class TestBPE:
|
||||
|
@ -1,9 +1,10 @@
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
|
||||
from tokenizers import Tokenizer, NormalizedString
|
||||
from tokenizers import NormalizedString, Tokenizer
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.normalizers import Normalizer, BertNormalizer, Sequence, Lowercase, Strip
|
||||
from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip
|
||||
|
||||
|
||||
class TestBertNormalizer:
|
||||
@ -13,41 +14,31 @@ class TestBertNormalizer:
|
||||
assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)
|
||||
|
||||
def test_strip_accents(self):
|
||||
normalizer = BertNormalizer(
|
||||
strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
|
||||
)
|
||||
normalizer = BertNormalizer(strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False)
|
||||
|
||||
output = normalizer.normalize_str("Héllò")
|
||||
assert output == "Hello"
|
||||
|
||||
def test_handle_chinese_chars(self):
|
||||
normalizer = BertNormalizer(
|
||||
strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False
|
||||
)
|
||||
normalizer = BertNormalizer(strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False)
|
||||
|
||||
output = normalizer.normalize_str("你好")
|
||||
assert output == " 你 好 "
|
||||
|
||||
def test_clean_text(self):
|
||||
normalizer = BertNormalizer(
|
||||
strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True
|
||||
)
|
||||
normalizer = BertNormalizer(strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True)
|
||||
|
||||
output = normalizer.normalize_str("\ufeffHello")
|
||||
assert output == "Hello"
|
||||
|
||||
def test_lowercase(self):
|
||||
normalizer = BertNormalizer(
|
||||
strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False
|
||||
)
|
||||
normalizer = BertNormalizer(strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False)
|
||||
|
||||
output = normalizer.normalize_str("Héllò")
|
||||
assert output == "héllò"
|
||||
|
||||
def test_can_modify(self):
|
||||
normalizer = BertNormalizer(
|
||||
clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True
|
||||
)
|
||||
normalizer = BertNormalizer(clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True)
|
||||
|
||||
assert normalizer.clean_text == True
|
||||
assert normalizer.handle_chinese_chars == True
|
||||
@ -151,9 +142,7 @@ class TestCustomNormalizer:
|
||||
with pytest.raises(Exception, match="TypeError:.*normalize()"):
|
||||
bad.normalize_str("Hey there!")
|
||||
assert good.normalize_str("Hey there!") == "Hey you!"
|
||||
with pytest.raises(
|
||||
Exception, match="Cannot use a NormalizedStringRefMut outside `normalize`"
|
||||
):
|
||||
with pytest.raises(Exception, match="Cannot use a NormalizedStringRefMut outside `normalize`"):
|
||||
good_custom.use_after_normalize()
|
||||
|
||||
def test_normalizer_interface(self):
|
||||
|
@ -1,20 +1,21 @@
|
||||
import pytest
|
||||
import pickle
|
||||
import json
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
|
||||
from tokenizers.pre_tokenizers import (
|
||||
PreTokenizer,
|
||||
ByteLevel,
|
||||
Whitespace,
|
||||
WhitespaceSplit,
|
||||
BertPreTokenizer,
|
||||
Metaspace,
|
||||
ByteLevel,
|
||||
CharDelimiterSplit,
|
||||
Digits,
|
||||
Metaspace,
|
||||
PreTokenizer,
|
||||
Punctuation,
|
||||
Sequence,
|
||||
Digits,
|
||||
UnicodeScripts,
|
||||
Split,
|
||||
UnicodeScripts,
|
||||
Whitespace,
|
||||
WhitespaceSplit,
|
||||
)
|
||||
|
||||
|
||||
|
@ -1,21 +1,22 @@
|
||||
import pytest
|
||||
import pickle
|
||||
import json
|
||||
import pickle
|
||||
|
||||
from ..utils import data_dir, roberta_files
|
||||
import pytest
|
||||
|
||||
from tokenizers import Tokenizer
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
|
||||
from tokenizers.processors import (
|
||||
PostProcessor,
|
||||
BertProcessing,
|
||||
RobertaProcessing,
|
||||
ByteLevel,
|
||||
TemplateProcessing,
|
||||
PostProcessor,
|
||||
RobertaProcessing,
|
||||
Sequence,
|
||||
TemplateProcessing,
|
||||
)
|
||||
|
||||
from ..utils import data_dir, roberta_files
|
||||
|
||||
|
||||
class TestBertProcessing:
|
||||
def test_instantiate(self):
|
||||
|
@ -1,19 +1,16 @@
|
||||
import numpy as np
|
||||
import pickle
|
||||
import pytest
|
||||
from ..utils import (
|
||||
data_dir,
|
||||
roberta_files,
|
||||
bert_files,
|
||||
multiprocessing_with_parallelism,
|
||||
)
|
||||
|
||||
from tokenizers import AddedToken, Tokenizer, Encoding
|
||||
from tokenizers.models import Model, BPE, WordPiece
|
||||
from tokenizers.pre_tokenizers import ByteLevel
|
||||
from tokenizers.processors import RobertaProcessing, BertProcessing
|
||||
from tokenizers.normalizers import Lowercase
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from tokenizers import AddedToken, Encoding, Tokenizer
|
||||
from tokenizers.implementations import BertWordPieceTokenizer
|
||||
from tokenizers.models import BPE, Model, WordPiece
|
||||
from tokenizers.normalizers import Lowercase
|
||||
from tokenizers.pre_tokenizers import ByteLevel
|
||||
from tokenizers.processors import BertProcessing, RobertaProcessing
|
||||
|
||||
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files
|
||||
|
||||
|
||||
class TestAddedToken:
|
||||
@ -22,8 +19,7 @@ class TestAddedToken:
|
||||
assert type(added_token) == AddedToken
|
||||
assert str(added_token) == "<mask>"
|
||||
assert (
|
||||
repr(added_token)
|
||||
== 'AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=True)'
|
||||
repr(added_token) == 'AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=True)'
|
||||
)
|
||||
assert added_token.rstrip == False
|
||||
assert added_token.lstrip == False
|
||||
|
@ -1,17 +1,19 @@
|
||||
import os
|
||||
import pytest
|
||||
import copy
|
||||
import os
|
||||
import pickle
|
||||
|
||||
import pytest
|
||||
|
||||
from tokenizers import (
|
||||
SentencePieceUnigramTokenizer,
|
||||
AddedToken,
|
||||
models,
|
||||
pre_tokenizers,
|
||||
normalizers,
|
||||
SentencePieceUnigramTokenizer,
|
||||
Tokenizer,
|
||||
models,
|
||||
normalizers,
|
||||
pre_tokenizers,
|
||||
trainers,
|
||||
)
|
||||
|
||||
from ..utils import data_dir, train_files
|
||||
|
||||
|
||||
@ -63,15 +65,13 @@ class TestBpeTrainer:
|
||||
trainers.BpeTrainer(min_frequency=12).__getstate__()
|
||||
== b"""{"BpeTrainer":{"min_frequency":12,"vocab_size":30000,"show_progress":true,"special_tokens":[],"limit_alphabet":null,"initial_alphabet":[],"continuing_subword_prefix":null,"end_of_word_suffix":null,"words":{}}}"""
|
||||
)
|
||||
assert isinstance(
|
||||
pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12))), trainers.BpeTrainer
|
||||
)
|
||||
assert isinstance(pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12))), trainers.BpeTrainer)
|
||||
|
||||
assert isinstance(copy.deepcopy(trainers.BpeTrainer(min_frequency=12)), trainers.BpeTrainer)
|
||||
# Make sure everything is correct
|
||||
assert pickle.dumps(
|
||||
pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12)))
|
||||
) == pickle.dumps(trainers.BpeTrainer(min_frequency=12))
|
||||
assert pickle.dumps(pickle.loads(pickle.dumps(trainers.BpeTrainer(min_frequency=12)))) == pickle.dumps(
|
||||
trainers.BpeTrainer(min_frequency=12)
|
||||
)
|
||||
|
||||
|
||||
class TestWordPieceTrainer:
|
||||
@ -118,9 +118,7 @@ class TestWordPieceTrainer:
|
||||
assert trainer.continuing_subword_prefix == None
|
||||
|
||||
def test_can_pickle(self):
|
||||
assert isinstance(
|
||||
pickle.loads(pickle.dumps(trainers.WordPieceTrainer())), trainers.WordPieceTrainer
|
||||
)
|
||||
assert isinstance(pickle.loads(pickle.dumps(trainers.WordPieceTrainer())), trainers.WordPieceTrainer)
|
||||
|
||||
|
||||
class TestWordLevelTrainer:
|
||||
@ -148,9 +146,7 @@ class TestWordLevelTrainer:
|
||||
assert trainer.special_tokens == []
|
||||
|
||||
def test_can_pickle(self):
|
||||
assert isinstance(
|
||||
pickle.loads(pickle.dumps(trainers.WordLevelTrainer())), trainers.WordLevelTrainer
|
||||
)
|
||||
assert isinstance(pickle.loads(pickle.dumps(trainers.WordLevelTrainer())), trainers.WordLevelTrainer)
|
||||
|
||||
|
||||
class TestUnigram:
|
||||
@ -184,9 +180,7 @@ class TestUnigram:
|
||||
bpe_tokenizer.train([train_files["small"]], trainer=trainer)
|
||||
|
||||
def test_can_pickle(self):
|
||||
assert isinstance(
|
||||
pickle.loads(pickle.dumps(trainers.UnigramTrainer())), trainers.UnigramTrainer
|
||||
)
|
||||
assert isinstance(pickle.loads(pickle.dumps(trainers.UnigramTrainer())), trainers.UnigramTrainer)
|
||||
|
||||
def test_train_with_special_tokens(self):
|
||||
filename = "tests/data/dummy-unigram-special_tokens-train.txt"
|
||||
|
@ -1,6 +1,7 @@
|
||||
from ..utils import data_dir, doc_wiki_tokenizer, doc_pipeline_bert_tokenizer
|
||||
from tokenizers import Tokenizer
|
||||
|
||||
from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer
|
||||
|
||||
|
||||
disable_printing = True
|
||||
original_print = print
|
||||
@ -112,7 +113,7 @@ class TestPipeline:
|
||||
# END bert_setup_tokenizer
|
||||
# START bert_setup_normalizer
|
||||
from tokenizers import normalizers
|
||||
from tokenizers.normalizers import Lowercase, NFD, StripAccents
|
||||
from tokenizers.normalizers import NFD, Lowercase, StripAccents
|
||||
|
||||
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
|
||||
# END bert_setup_normalizer
|
||||
@ -136,9 +137,7 @@ class TestPipeline:
|
||||
# START bert_train_tokenizer
|
||||
from tokenizers.trainers import WordPieceTrainer
|
||||
|
||||
trainer = WordPieceTrainer(
|
||||
vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
|
||||
)
|
||||
trainer = WordPieceTrainer(vocab_size=30522, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
||||
files = [f"data/wikitext-103-raw/wiki.{split}.raw" for split in ["test", "train", "valid"]]
|
||||
bert_tokenizer.train(files, trainer)
|
||||
|
||||
@ -171,9 +170,9 @@ class TestPipeline:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
from urllib import request
|
||||
from zipfile import ZipFile
|
||||
import os
|
||||
|
||||
disable_printing = False
|
||||
if not os.path.isdir("data/wikitext-103-raw"):
|
||||
|
@ -1,8 +1,10 @@
|
||||
from ..utils import data_dir, doc_wiki_tokenizer
|
||||
from tokenizers import Tokenizer
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.trainers import BpeTrainer
|
||||
from tokenizers.pre_tokenizers import Whitespace
|
||||
from tokenizers.trainers import BpeTrainer
|
||||
|
||||
from ..utils import data_dir, doc_wiki_tokenizer
|
||||
|
||||
|
||||
disable_printing = True
|
||||
original_print = print
|
||||
@ -181,9 +183,9 @@ class TestQuicktour:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
from urllib import request
|
||||
from zipfile import ZipFile
|
||||
import os
|
||||
|
||||
disable_printing = False
|
||||
if not os.path.isdir("data/wikitext-103-raw"):
|
||||
|
@ -1,15 +1,17 @@
|
||||
from ..utils import data_dir, train_files
|
||||
import os
|
||||
import pytest
|
||||
import datasets
|
||||
import gzip
|
||||
import os
|
||||
|
||||
import datasets
|
||||
import pytest
|
||||
|
||||
from ..utils import data_dir, train_files
|
||||
|
||||
|
||||
class TestTrainFromIterators:
|
||||
@staticmethod
|
||||
def get_tokenizer_trainer():
|
||||
# START init_tokenizer_trainer
|
||||
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers
|
||||
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
|
||||
|
||||
tokenizer = Tokenizer(models.Unigram())
|
||||
tokenizer.normalizer = normalizers.NFKC()
|
||||
@ -31,9 +33,7 @@ class TestTrainFromIterators:
|
||||
# START load_dataset
|
||||
import datasets
|
||||
|
||||
dataset = datasets.load_dataset(
|
||||
"wikitext", "wikitext-103-raw-v1", split="train+test+validation"
|
||||
)
|
||||
dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
|
||||
# END load_dataset
|
||||
|
||||
@pytest.fixture(scope="class")
|
||||
|
@ -1,7 +1,7 @@
|
||||
import pytest
|
||||
|
||||
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors
|
||||
from tokenizers.implementations import BaseTokenizer
|
||||
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, processors, decoders
|
||||
|
||||
|
||||
class TestBaseTokenizer:
|
||||
|
@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
|
||||
from ..utils import data_dir, bert_files, multiprocessing_with_parallelism
|
||||
from tokenizers import BertWordPieceTokenizer
|
||||
|
||||
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism
|
||||
|
||||
|
||||
class TestBertWordPieceTokenizer:
|
||||
def test_basic_encode(self, bert_files):
|
||||
|
@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
|
||||
from ..utils import data_dir, roberta_files, multiprocessing_with_parallelism
|
||||
from tokenizers import ByteLevelBPETokenizer
|
||||
|
||||
from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files
|
||||
|
||||
|
||||
class TestByteLevelBPE:
|
||||
def test_basic_encode(self, roberta_files):
|
||||
|
@ -1,8 +1,9 @@
|
||||
import pytest
|
||||
|
||||
from ..utils import data_dir, openai_files, multiprocessing_with_parallelism
|
||||
from tokenizers import CharBPETokenizer
|
||||
|
||||
from ..utils import data_dir, multiprocessing_with_parallelism, openai_files
|
||||
|
||||
|
||||
class TestCharBPETokenizer:
|
||||
def test_basic_encode(self, openai_files):
|
||||
@ -33,9 +34,7 @@ class TestCharBPETokenizer:
|
||||
assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
|
||||
|
||||
def test_lowercase(self, openai_files):
|
||||
tokenizer = CharBPETokenizer.from_file(
|
||||
openai_files["vocab"], openai_files["merges"], lowercase=True
|
||||
)
|
||||
tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
|
||||
output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
|
||||
assert output.ids == [547, 1362, 544, 2476, 2688]
|
||||
assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
|
||||
@ -43,9 +42,7 @@ class TestCharBPETokenizer:
|
||||
assert output.type_ids == [0, 0, 0, 0, 1]
|
||||
|
||||
def test_decoding(self, openai_files):
|
||||
tokenizer = CharBPETokenizer.from_file(
|
||||
openai_files["vocab"], openai_files["merges"], lowercase=True
|
||||
)
|
||||
tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"], lowercase=True)
|
||||
decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
|
||||
assert decoded == "my name is john"
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
import os
|
||||
import pytest
|
||||
|
||||
import pytest
|
||||
|
||||
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
|
||||
|
||||
@ -35,9 +35,7 @@ class TestSentencePieceUnigram:
|
||||
p.write("A first sentence\nAnother sentence\nAnd a last one")
|
||||
|
||||
tokenizer = SentencePieceUnigramTokenizer()
|
||||
tokenizer.train(
|
||||
files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
|
||||
)
|
||||
tokenizer.train(files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>")
|
||||
output = tokenizer.encode("A sentence 🤗")
|
||||
assert output.ids[-1] == 0
|
||||
assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
|
||||
|
@ -1,11 +1,14 @@
|
||||
from tokenizers import Tokenizer
|
||||
import json
|
||||
import os
|
||||
import unittest
|
||||
from .utils import data_dir, albert_base
|
||||
import json
|
||||
from huggingface_hub import HfApi, hf_hub_url, cached_download
|
||||
|
||||
import tqdm
|
||||
|
||||
from huggingface_hub import HfApi, cached_download, hf_hub_url
|
||||
from tokenizers import Tokenizer
|
||||
|
||||
from .utils import albert_base, data_dir
|
||||
|
||||
|
||||
class TestSerialization:
|
||||
def test_full_serialization_albert(self, albert_base):
|
||||
|
@ -1,8 +1,11 @@
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import requests
|
||||
|
||||
import pytest
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
DATA_PATH = os.path.join("tests", "data")
|
||||
|
||||
|
||||
@ -29,33 +32,23 @@ def data_dir():
|
||||
@pytest.fixture(scope="session")
|
||||
def roberta_files(data_dir):
|
||||
return {
|
||||
"vocab": download(
|
||||
"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"
|
||||
),
|
||||
"merges": download(
|
||||
"https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"
|
||||
),
|
||||
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"),
|
||||
"merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"),
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def bert_files(data_dir):
|
||||
return {
|
||||
"vocab": download(
|
||||
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
|
||||
),
|
||||
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"),
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def openai_files(data_dir):
|
||||
return {
|
||||
"vocab": download(
|
||||
"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"
|
||||
),
|
||||
"merges": download(
|
||||
"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
|
||||
),
|
||||
"vocab": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"),
|
||||
"merges": download("https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"),
|
||||
}
|
||||
|
||||
|
||||
@ -77,9 +70,7 @@ def train_files(data_dir):
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
def albert_base(data_dir):
|
||||
return download(
|
||||
"https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json"
|
||||
)
|
||||
return download("https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
|
||||
|