Python - Black auto formatting

This commit is contained in:
Anthony MOI
2020-02-18 10:45:36 -05:00
parent 4706151c32
commit 81be207819
16 changed files with 179 additions and 211 deletions

View File

@@ -7,28 +7,33 @@ parser.add_argument("--vocab", default=None, type=str, required=True, help="The
parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
args = parser.parse_args()
class GoodCustom:
    """GoodCustom
    This class represents a good custom PreTokenizer that will be called
    by `tokenizers` when needed
    """
    def pre_tokenize(self, sentence):
        return sentence.split(" ")
    def decode(self, tokens):
        return ", ".join(tokens)
class BadCustom:
    """Bad Pretok
    This class represents a bad custom PreTokenizer that will trigger an exception
    when called by `tokenizers`
    """
    def pre_tokenize(self, sentence):
        return None
    def decode(self, tokens):
        return None
def tokenize(sentence):
    output = tokenizer.encode(sentence).tokens
    print(f"`{sentence}` tokenized to {output}")
@@ -66,4 +71,3 @@ try:
    encoding = tokenizer.encode("Hey friend!")
except:
    print("Bad tokenizer didn't work")

View File

@@ -3,8 +3,9 @@ import argparse
from tqdm import tqdm
import logging
-logging.getLogger('transformers').disabled = True
-logging.getLogger('transformers.tokenization_utils').disabled = True
+logging.getLogger("transformers").disabled = True
+logging.getLogger("transformers.tokenization_utils").disabled = True
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import BPE, WordPiece
@@ -18,7 +19,7 @@ parser.add_argument("--type", default="gpt2", type=str, help="The type of tokeni
parser.add_argument("--file", default=None, type=str, help="The file to encode")
parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab file")
parser.add_argument("--merges", default=None, type=str, help="The merges.txt file")
-parser.add_argument("--debug", action='store_true', help="Verbose output")
+parser.add_argument("--debug", action="store_true", help="Verbose output")
args = parser.parse_args()
if args.type == "gpt2" and args.merges is None:
@@ -49,11 +50,13 @@ Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
-""".split("\n")
+""".split(
+    "\n"
+)
if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
-    tok_p = GPT2Tokenizer.from_pretrained('gpt2')
+    tok_p = GPT2Tokenizer.from_pretrained("gpt2")
    # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
@@ -65,33 +68,30 @@ elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)
-    tok_r = Tokenizer(WordPiece.from_files(
-        args.vocab,
-        unk_token="[UNK]",
-        max_input_chars_per_word=100)
+    tok_r = Tokenizer(
+        WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100)
    )
    tok_r.normalizer = BertNormalizer(
-        clean_text=True,
-        handle_chinese_chars=True,
-        strip_accents=True,
-        lowercase=True,
+        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
    )
    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tok_r.decoder = decoders.WordPiece()
    tok_r.post_processor = BertProcessing(
-        ("[SEP]", tok_r.token_to_id("[SEP]")),
-        ("[CLS]", tok_r.token_to_id("[CLS]")),
+        ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
    )
else:
    raise Exception(f"Unknown type {args.type}")
def tokenize_r():
-    return tok_r.encode_batch(text);
+    return tok_r.encode_batch(text)
def tokenize_p():
    return [tok_p.encode(sentence, add_special_tokens=True) for sentence in tqdm(text)]
print(f"Tokenizing {len(text)} lines")
# Rust version
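A hedged sketch of the benchmarking step that follows the `# Rust version` marker; the `time` calls, variable names, and print strings here are illustrative, not the file's exact code:

import time

start = time.time()
encoded_r = tokenize_r()   # batch encoding with the Rust-backed tokenizer
print(f"Rust tokenizer: {time.time() - start:.3f}s")

start = time.time()
encoded_p = tokenize_p()   # per-sentence encoding with the Python tokenizer
print(f"Python tokenizer: {time.time() - start:.3f}s")

# Sanity check: both paths produce one encoding per input line
assert len(encoded_r) == len(encoded_p)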

View File

@@ -4,21 +4,24 @@ import glob
from tokenizers import BertWordPieceTokenizer
parser = argparse.ArgumentParser()
-parser.add_argument("--files",
+parser.add_argument(
+    "--files",
    default=None,
    metavar="path",
    type=str,
    required=True,
    help="The files to use as training; accept '**/*.txt' type of patterns \
-         if enclosed in quotes")
-parser.add_argument("--out",
+         if enclosed in quotes",
+)
+parser.add_argument(
+    "--out",
    default="./",
    type=str,
-    help="Path to the output directory, where the files will be saved")
-parser.add_argument("--name",
-    default="bert-wordpiece",
-    type=str,
-    help="The name of the output vocab files")
+    help="Path to the output directory, where the files will be saved",
+)
+parser.add_argument(
+    "--name", default="bert-wordpiece", type=str, help="The name of the output vocab files"
+)
args = parser.parse_args()
files = glob.glob(args.files)
@@ -29,11 +32,7 @@ if not files:
# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
-    clean_text=True,
-    handle_chinese_chars=True,
-    strip_accents=True,
-    lowercase=True,
+    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
)
# And then train
@@ -44,7 +43,7 @@ trainer = tokenizer.train(
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
-    wordpieces_prefix="##"
+    wordpieces_prefix="##",
)
# Save the files
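The `# Save the files` step that closes this script writes the trained vocabulary to `--out`; a hedged sketch of that step plus a quick sanity check, reusing the `tokenizer.save(args.out, args.name)` call visible in the byte-level training script below and the `Encoding.tokens` property documented later in this diff:

# Sketch: persist the trained WordPiece tokenizer and sanity-check it.
tokenizer.save(args.out, args.name)   # writes the vocab file(s) into --out

encoding = tokenizer.encode("Welcome to the tokenizers library.")
print(encoding.tokens)                # lowercased WordPiece pieces, e.g. ['welcome', 'to', ...]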

View File

@@ -5,21 +5,24 @@ from os.path import join
from tokenizers import ByteLevelBPETokenizer
parser = argparse.ArgumentParser()
-parser.add_argument("--files",
+parser.add_argument(
+    "--files",
    default=None,
    metavar="path",
    type=str,
    required=True,
    help="The files to use as training; accept '**/*.txt' type of patterns \
-         if enclosed in quotes")
-parser.add_argument("--out",
+         if enclosed in quotes",
+)
+parser.add_argument(
+    "--out",
    default="./",
    type=str,
-    help="Path to the output directory, where the files will be saved")
-parser.add_argument("--name",
-    default="bpe-bytelevel",
-    type=str,
-    help="The name of the output vocab files")
+    help="Path to the output directory, where the files will be saved",
+)
+parser.add_argument(
+    "--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files"
+)
args = parser.parse_args()
files = glob.glob(args.files)
@@ -47,7 +50,7 @@ tokenizer.save(args.out, args.name)
tokenizer = ByteLevelBPETokenizer(
    join(args.out, "{}-vocab.json".format(args.name)),
    join(args.out, "{}-merges.txt".format(args.name)),
-    add_prefix_space=True
+    add_prefix_space=True,
)
# Test encoding
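A hedged sketch of the `# Test encoding` step, round-tripping an arbitrary sentence through the reloaded byte-level tokenizer; `ids`, `tokens`, and `decode` are the properties and method documented in the stubs later in this diff:

# Sketch: test encoding with the reloaded tokenizer.
encoding = tokenizer.encode("The quick brown fox jumps over the lazy dog")
print(encoding.ids)      # token ids
print(encoding.tokens)   # byte-level BPE pieces
print(tokenizer.decode(encoding.ids))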

View File

@@ -11,5 +11,5 @@ from .implementations import (
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
-    BertWordPieceTokenizer
+    BertWordPieceTokenizer,
)

View File

@@ -9,7 +9,7 @@ from .implementations import (
    ByteLevelBPETokenizer as ByteLevelBPETokenizer,
    BPETokenizer as BPETokenizer,
    SentencePieceBPETokenizer as SentencePieceBPETokenizer,
-    BertWordPieceTokenizer as BertWordPieceTokenizer
+    BertWordPieceTokenizer as BertWordPieceTokenizer,
)
from typing import Optional, Union, List, Tuple
@@ -38,27 +38,22 @@ class Encoding:
    def normalized_str(self) -> IndexableString:
        """ The normalized string """
        pass
    @property
    def original_str(self) -> IndexableString:
        """ The original string """
        pass
    @property
    def ids(self) -> List[int]:
        """ The tokenized ids """
        pass
    @property
    def tokens(self) -> List[str]:
        """ The tokenized strings """
        pass
    @property
    def type_ids(self) -> List[int]:
        """ The type ids """
        pass
    @property
    def offsets(self) -> List[Offsets]:
        """ The offsets.
@@ -67,28 +62,26 @@ class Encoding:
            method on the `original_str`.
        """
        pass
    @property
    def special_tokens_mask(self) -> List[int]:
        """ The special tokens mask """
        pass
    @property
    def attention_mask(self) -> List[int]:
        """ The attention mask """
        pass
    @property
    def overflowing(self) -> Optional[Encoding]:
        """ The overflowing encoding, after truncation """
        pass
-    def pad(self,
+    def pad(
+        self,
        length: int,
        pad_id: Optional[int] = 0,
        pad_type_id: Optional[int] = 0,
        pad_token: Optional[str] = "[PAD]",
-        direction: Optional[str] = "right"):
+        direction: Optional[str] = "right",
+    ):
        """ Pad the current Encoding at the given length
        Args:
@@ -108,7 +101,6 @@ class Encoding:
                The pad token to be used when padding
        """
        pass
    def truncate(self, max_length: int, stride: Optional[int] = 0):
        """ Truncate the current Encoding at the given max_length
@@ -122,7 +114,6 @@ class Encoding:
        """
        pass
class Tokenizer:
    """ Tokenizer
@@ -151,56 +142,44 @@ class Tokenizer:
            Tokenizer
        """
        pass
    @property
    def model(self) -> Model:
        """ Get the model in use with this Tokenizer """
        pass
    @model.setter
    def model(self, model: models.Model):
        """ Change the model to use with this Tokenizer """
        pass
    @property
    def pre_tokenizer(self) -> Optional[PreTokenizer]:
        """ Get the pre-tokenizer in use with this model """
        pass
    @pre_tokenizer.setter
    def pre_tokenizer(self, pre_tokenizer: pre_tokenizers.PreTokenizer):
        """ Change the pre tokenizer to use with this Tokenizer """
        pass
    @property
    def decoder(self) -> Optional[Decoder]:
        """ Get the decoder in use with this model """
        pass
    @decoder.setter
    def decoder(self, decoder: decoders.Decoder):
        """ Change the decoder to use with this Tokenizer """
        pass
    @property
    def post_processor(self) -> Optional[PostProcessor]:
        """ Get the post-processor in use with this Tokenizer """
        pass
    @post_processor.setter
    def post_processor(self, processor: processors.PostProcessor):
        """ Change the post processor to use with this Tokenizer """
    @property
    def normalizer(self) -> Optional[Normalizer]:
        """ Get the normalizer in use with this Tokenizer """
        pass
    @normalizer.setter
    def normalizer(self, normalizer: normalizers.Normalizer):
        """ Change the normalizer to use with this Tokenizer """
    def num_special_tokens_to_add(self, is_pair: bool) -> int:
        """
        Return the number of special tokens that would be added for single/pair sentences.
@@ -208,8 +187,6 @@ class Tokenizer:
        :return:
        """
        pass
    def get_vocab_size(self, with_added_tokens: Optional[bool]) -> int:
        """ Returns the size of the vocabulary
@@ -218,11 +195,7 @@ class Tokenizer:
                Whether to include the added tokens in the vocabulary's size
        """
        pass
-    def enable_truncation(self,
-        max_length: int,
-        stride: Optional[int],
-        strategy: Optional[str]):
+    def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
        """ Enable the truncation
        Args:
@@ -237,17 +210,17 @@ class Tokenizer:
                Can be one of `longest_first`, `only_first` or `only_second`
        """
        pass
    def no_truncation(self):
        """ Disable truncation """
        pass
-    def enable_padding(self,
+    def enable_padding(
+        self,
        direction: Optional[str] = "right",
        pad_id: Optional[int] = 0,
        pad_type_id: Optional[int] = 0,
        pad_token: Optional[str] = "[PAD]",
-        max_length: Optional[int] = None):
+        max_length: Optional[int] = None,
+    ):
        """ Enable the padding
        Args:
@@ -268,11 +241,9 @@ class Tokenizer:
                we pad using the size of the longest sequence in a batch
        """
        pass
    def no_padding(self):
        """ Disable padding """
        pass
    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
        """ Encode the given sequence
@@ -287,7 +258,6 @@ class Tokenizer:
            An Encoding
        """
        pass
    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
        """ Encode the given sequences or pair of sequences
@@ -300,7 +270,6 @@ class Tokenizer:
            A list of Encoding
        """
        pass
    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
        """ Decode the given list of ids to a string sequence
@@ -315,10 +284,9 @@ class Tokenizer:
            The decoded string
        """
        pass
-    def decode_batch(self,
-        sequences: List[List[int]],
-        skip_special_tokens: Optional[bool] = True) -> str:
+    def decode_batch(
+        self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
+    ) -> str:
        """ Decode the list of sequences to a list of string sequences
        Args:
@@ -332,7 +300,6 @@ class Tokenizer:
            A list of decoded strings
        """
        pass
    def token_to_id(self, token: str) -> Optional[int]:
        """ Convert the given token to its corresponding id
@@ -344,7 +311,6 @@ class Tokenizer:
            The corresponding id if it exists, None otherwise
        """
        pass
    def id_to_token(self, id: int) -> Optional[str]:
        """ Convert the given token id to its corresponding string
@@ -356,7 +322,6 @@ class Tokenizer:
            The corresponding string if it exists, None otherwise
        """
        pass
    def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
        """ Add the given tokens to the vocabulary
@@ -371,7 +336,6 @@ class Tokenizer:
            The number of tokens that were added to the vocabulary
        """
        pass
    def add_special_tokens(self, tokens: List[str]) -> int:
        """ Add the given special tokens to the vocabulary, and treat them as special tokens.

View File

@@ -2,8 +2,8 @@ from .. import Tokenizer, Encoding
from typing import List, Union, Tuple, Optional
class BaseTokenizer:
    def __init__(self, tokenizer: Tokenizer, parameters=None):
        self._tokenizer = tokenizer
        self._parameters = parameters if parameters is not None else {}
@@ -11,7 +11,8 @@ class BaseTokenizer:
    def __repr__(self):
        return "Tokenizer(vocabulary_size={}, {})".format(
            self._tokenizer.get_vocab_size(),
-            ', '.join(k + '=' + str(v) for k, v in self._parameters.items()))
+            ", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
+        )
    def num_special_tokens_to_add(self, is_pair: bool) -> int:
        """
@@ -33,12 +34,14 @@ class BaseTokenizer:
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
-    def enable_padding(self,
+    def enable_padding(
+        self,
        direction: Optional[str] = "right",
        pad_id: Optional[int] = 0,
        pad_type_id: Optional[int] = 0,
        pad_token: Optional[str] = "[PAD]",
-        max_length: Optional[int] = None):
+        max_length: Optional[int] = None,
+    ):
        """ Change the padding strategy
        Args:
@@ -58,20 +61,21 @@ class BaseTokenizer:
                If specified, the length at which to pad. If not specified
                we pad using the size of the longest sequence in a batch
        """
-        return self._tokenizer.enable_padding(direction=direction,
+        return self._tokenizer.enable_padding(
+            direction=direction,
            pad_id=pad_id,
            pad_type_id=pad_type_id,
            pad_token=pad_token,
-            max_length=max_length)
+            max_length=max_length,
+        )
    def no_padding(self):
        """ Disable padding """
        return self._tokenizer.no_padding()
-    def enable_truncation(self,
-        max_length: int,
-        stride: Optional[int]=0,
-        strategy: Optional[str]='longest_first'):
+    def enable_truncation(
+        self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
+    ):
        """ Change the truncation options
        Args:
@@ -85,9 +89,7 @@ class BaseTokenizer:
            strategy: (`optional) str:
                Can be one of `longest_first`, `only_first` or `only_second`
        """
-        return self._tokenizer.enable_truncation(max_length,
-            stride=stride,
-            strategy=strategy)
+        return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
    def no_truncation(self):
        """ Disable truncation """
@@ -166,9 +168,9 @@ class BaseTokenizer:
        """
        return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
-    def decode_batch(self,
-        sequences: List[List[int]],
-        skip_special_tokens: Optional[bool] = True) -> str:
+    def decode_batch(
+        self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
+    ) -> str:
        """ Decode the list of sequences to a list of string sequences
        Args:

View File

@@ -5,13 +5,15 @@ from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class ByteLevelBPETokenizer(BaseTokenizer):
    """ ByteLevelBPETokenizer
    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    """
-    def __init__(self,
+    def __init__(
+        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        add_prefix_space: bool = False,
@@ -19,15 +21,18 @@ class ByteLevelBPETokenizer(BaseTokenizer):
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
-        end_of_word_suffix: Optional[str]=None
+        end_of_word_suffix: Optional[str] = None,
    ):
        if vocab_file is not None and merges_file is not None:
-            tokenizer = Tokenizer(BPE.from_files(
-                vocab_file, merges_file,
+            tokenizer = Tokenizer(
+                BPE.from_files(
+                    vocab_file,
+                    merges_file,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
-            ))
+                )
+            )
        else:
            tokenizer = Tokenizer(BPE.empty())
@@ -47,9 +52,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
        else:
            tokenizer.normalizer = normalizers[0]
-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
-            add_prefix_space=add_prefix_space
-        )
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        parameters = {

View File

@@ -12,14 +12,16 @@ class CharBPETokenizer(BaseTokenizer):
    Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
    """
-    def __init__(self,
+    def __init__(
+        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        unk_token: Optional[str] = "<unk>",
        suffix: Optional[str] = "</w>",
        dropout: Optional[float] = None,
        lowercase: bool = False,
-        unicode_normalizer: Optional[str] = None):
+        unicode_normalizer: Optional[str] = None,
+    ):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(
@@ -27,7 +29,7 @@ class CharBPETokenizer(BaseTokenizer):
                    merges_file,
                    dropout=dropout,
                    unk_token=unk_token,
-                    end_of_word_suffix=suffix
+                    end_of_word_suffix=suffix,
                )
            )
        else:

View File

@@ -5,30 +5,31 @@ from .base_tokenizer import BaseTokenizer
from typing import Optional, List, Union
class SentencePieceBPETokenizer(BaseTokenizer):
    """ SentencePiece BPE Tokenizer
    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """
-    def __init__(self,
+    def __init__(
+        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        unk_token: str = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
-        dropout: Optional[float]=None):
+        dropout: Optional[float] = None,
+    ):
        if vocab_file is not None and merges_file is not None:
-            tokenizer = Tokenizer(BPE.from_files(vocab_file,
-                merges_file,
-                dropout=dropout,
-                unk_token=unk_token))
+            tokenizer = Tokenizer(
+                BPE.from_files(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
+            )
        else:
            tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_special_tokens([unk_token])
        tokenizer.normalizer = NFKC()
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space

View File

@@ -16,18 +16,19 @@ class Model:
    """
    pass
class BPE(Model):
    """ BytePairEncoding model class """
    @staticmethod
-    def from_files(vocab: str,
+    def from_files(
+        vocab: str,
        merges: str,
        cache_capacity: Optional[int],
        dropout: Optional[float],
        unk_token: Optional[str],
        continuing_subword_prefix: Optional[str],
-        end_of_word_suffix: Optional[str]) -> Model:
+        end_of_word_suffix: Optional[str],
+    ) -> Model:
        """ Instantiate a BPE Model from the given vocab and merges files.
        Args:
@@ -55,20 +56,18 @@ class BPE(Model):
                The suffix to attach to subword units that represent an end of word.
        """
        pass
    @staticmethod
    def empty() -> Model:
        """ Instantiate an empty BPE Model. """
        pass
class WordPiece(Model):
    """ WordPiece model class """
    @staticmethod
-    def from_files(vocab: str,
-        unk_token: Optional[str],
-        max_input_chars_per_word: Optional[int]) -> Model:
+    def from_files(
+        vocab: str, unk_token: Optional[str], max_input_chars_per_word: Optional[int]
+    ) -> Model:
        """ Instantiate a WordPiece Model from the given vocab file.
        Args:
@@ -82,13 +81,11 @@ class WordPiece(Model):
                The maximum number of characters to authorize in a single word.
        """
        pass
    @staticmethod
    def empty() -> Model:
        """ Instantiate an empty WordPiece Model. """
        pass
class WordLevel(Model):
    """
    Most simple tokenizer model based on mapping token from a vocab file to their corresponding id.
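For context, a hedged sketch of how the `from_files` constructors documented above are used to build a `Tokenizer`; the file paths are placeholders, and the calls mirror `BPE.from_files(args.vocab, args.merges)`, `WordPiece.from_files(...)`, and `BPE.empty()` as they appear elsewhere in this diff:

from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece

# Placeholder files -- substitute real vocab/merges files.
bpe_tok = Tokenizer(BPE.from_files("vocab.json", "merges.txt"))
wp_tok = Tokenizer(
    WordPiece.from_files("vocab.txt", unk_token="[UNK]", max_input_chars_per_word=100)
)

# An empty model can also be instantiated and trained from scratch.
empty_tok = Tokenizer(BPE.empty())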

View File

@@ -11,19 +11,15 @@ Lowercase = normalizers.Lowercase
Strip = normalizers.Strip
-NORMALIZERS = {
-    "nfc": NFC,
-    "nfd": NFD,
-    "nfkc": NFKC,
-    "nfkd": NFKD
-}
+NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
    if normalizer not in NORMALIZERS:
        raise ValueError(
-            "{} is not a known unicode normalizer. Available are {}"
-            .format(normalizer, NORMALIZERS.keys())
+            "{} is not a known unicode normalizer. Available are {}".format(
+                normalizer, NORMALIZERS.keys()
+            )
        )
    return NORMALIZERS[normalizer]()
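A hedged sketch of the helper above in use, with names exactly as defined in this file; the lookup key is the lowercase normalizer name:

# Sketch: build a unicode normalizer from its lowercase name.
nfkc = unicode_normalizer_from_str("nfkc")   # returns an NFKC() normalizer instance

try:
    unicode_normalizer_from_str("nfc2")      # unknown name -> ValueError listing available keys
except ValueError as err:
    print(err)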

View File

@@ -98,7 +98,6 @@ class Strip(Normalizer):
    def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
        pass
def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
    """
    Instanciate unicode normalizer from the normalizer name

View File

@@ -33,7 +33,6 @@ class ByteLevel(PreTokenizer):
            PreTokenizer
        """
        pass
    @staticmethod
    def alphabet() -> List[str]:
        """ Returns the alphabet used by this PreTokenizer.
@@ -96,7 +95,6 @@ class Metaspace(PreTokenizer):
        """
        pass
class CharDelimiterSplit(PreTokenizer):
    """ CharDelimiterSplit PreTokenizer