Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-24 17:19:21 +00:00
Python - Black auto formatting
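A formatting pass like this one is normally produced by running Black over the Python bindings rather than by hand-editing each file. Below is a minimal sketch of how the same kind of change can be reproduced with Black's Python API; the 100-character line length is an assumption inferred from the wrapped lines in the diff, not something stated in this commit.

    # Sketch: format a snippet the way this commit does, via Black's API.
    # Assumption: line length 100; adjust to the project's real configuration.
    import black

    src = "parser.add_argument('--debug', action='store_true', help='Verbose output')\n"
    formatted = black.format_str(src, mode=black.FileMode(line_length=100))
    print(formatted)  # single quotes normalized to double quotes; long calls get reflowed

The equivalent command-line invocation would be something like `black --line-length 100 .` run from the bindings directory (again an assumption about the exact flags used).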
@@ -7,28 +7,33 @@ parser.add_argument("--vocab", default=None, type=str, required=True, help="The
 parser.add_argument("--merges", default=None, type=str, required=True, help="The merges.txt file")
 args = parser.parse_args()


 class GoodCustom:
     """GoodCustom
     This class represents a good custom PreTokenizer that will be called
     by `tokenizers` when needed
     """

     def pre_tokenize(self, sentence):
         return sentence.split(" ")

     def decode(self, tokens):
         return ", ".join(tokens)


 class BadCustom:
     """Bad Pretok
     This class represents a bad custom PreTokenizer that will trigger an exception
     when called by `tokenizers`
     """

     def pre_tokenize(self, sentence):
         return None

     def decode(self, tokens):
         return None


 def tokenize(sentence):
     output = tokenizer.encode(sentence).tokens
     print(f"`{sentence}` tokenized to {output}")
@@ -66,4 +71,3 @@ try:
     encoding = tokenizer.encode("Hey friend!")
 except:
     print("Bad tokenizer didn't work")
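The two hunks above only reformat the example, but they also restate the duck-typed contract this example relies on: `pre_tokenize` is expected to return a list of string pieces and `decode` a string, which is why `BadCustom` (returning None for both) makes the `try`/`except` block above print "Bad tokenizer didn't work". A standalone sketch of that contract, exercised without the `tokenizers` runtime:

    class GoodCustom:
        def pre_tokenize(self, sentence):
            return sentence.split(" ")

        def decode(self, tokens):
            return ", ".join(tokens)

    good = GoodCustom()
    pieces = good.pre_tokenize("Hey friend!")  # ['Hey', 'friend!']
    print(good.decode(pieces))                 # Hey, friend!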
@@ -3,8 +3,9 @@ import argparse
 from tqdm import tqdm

 import logging
-logging.getLogger('transformers').disabled = True
-logging.getLogger('transformers.tokenization_utils').disabled = True
+
+logging.getLogger("transformers").disabled = True
+logging.getLogger("transformers.tokenization_utils").disabled = True

 from tokenizers import Tokenizer, pre_tokenizers, decoders
 from tokenizers.models import BPE, WordPiece
@@ -18,7 +19,7 @@ parser.add_argument("--type", default="gpt2", type=str, help="The type of tokeni
 parser.add_argument("--file", default=None, type=str, help="The file to encode")
 parser.add_argument("--vocab", default=None, type=str, required=True, help="The vocab file")
 parser.add_argument("--merges", default=None, type=str, help="The merges.txt file")
-parser.add_argument("--debug", action='store_true', help="Verbose output")
+parser.add_argument("--debug", action="store_true", help="Verbose output")
 args = parser.parse_args()

 if args.type == "gpt2" and args.merges is None:
@@ -26,7 +27,7 @@ if args.type == "gpt2" and args.merges is None:

 if args.file is not None:
     with open(args.file, "r") as fp:
-        text = [ line.strip() for line in fp ]
+        text = [line.strip() for line in fp]
 else:
     text = """
 The Zen of Python, by Tim Peters
@@ -49,11 +50,13 @@ Although never is often better than *right* now.
 If the implementation is hard to explain, it's a bad idea.
 If the implementation is easy to explain, it may be a good idea.
 Namespaces are one honking great idea -- let's do more of those!
-""".split("\n")
+""".split(
+    "\n"
+)

 if args.type == "gpt2":
     print("Running GPT-2 tokenizer")
-    tok_p = GPT2Tokenizer.from_pretrained('gpt2')
+    tok_p = GPT2Tokenizer.from_pretrained("gpt2")

     # Create a Tokenizer using BPE
     tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
@@ -65,33 +68,30 @@ elif args.type == "bert":
     print("Running Bert tokenizer")
     tok_p = BertTokenizer.from_pretrained(args.vocab)

-    tok_r = Tokenizer(WordPiece.from_files(
-        args.vocab,
-        unk_token="[UNK]",
-        max_input_chars_per_word=100)
+    tok_r = Tokenizer(
+        WordPiece.from_files(args.vocab, unk_token="[UNK]", max_input_chars_per_word=100)
     )
     tok_r.normalizer = BertNormalizer(
-        clean_text=True,
-        handle_chinese_chars=True,
-        strip_accents=True,
-        lowercase=True,
+        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
     )
     # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
     tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
     tok_r.decoder = decoders.WordPiece()
     tok_r.post_processor = BertProcessing(
-        ("[SEP]", tok_r.token_to_id("[SEP]")),
-        ("[CLS]", tok_r.token_to_id("[CLS]")),
+        ("[SEP]", tok_r.token_to_id("[SEP]")), ("[CLS]", tok_r.token_to_id("[CLS]")),
     )
 else:
     raise Exception(f"Unknown type {args.type}")


 def tokenize_r():
-    return tok_r.encode_batch(text);
+    return tok_r.encode_batch(text)


 def tokenize_p():
     return [tok_p.encode(sentence, add_special_tokens=True) for sentence in tqdm(text)]


 print(f"Tokenizing {len(text)} lines")

 # Rust version
@@ -110,7 +110,7 @@ print(f"Transformer tokenizer took: {time_p} sec")

 print(f"SpeedUp Ratio: {time_p / time_r}")

-ids_r = [ sentence.ids for sentence in encoded_r ]
+ids_r = [sentence.ids for sentence in encoded_r]
 diff_ids = 0
 for i in range(0, len(encoded_r)):
     if encoded_r[i].ids != encoded_p[i]:
@@ -124,8 +124,8 @@ for i in range(0, len(encoded_r)):
 print("")
 print(f"Ids differences: {diff_ids}")

-decoded_r = tok_r.decode_batch([ sentence.ids for sentence in encoded_r ], False)
-decoded_p = [ tok_p.decode(en) for en in encoded_p ]
+decoded_r = tok_r.decode_batch([sentence.ids for sentence in encoded_r], False)
+decoded_p = [tok_p.decode(en) for en in encoded_p]
 diff_decoded = 0
 for i in range(0, len(text)):
     if decoded_r[i] != decoded_p[i]:
@@ -4,21 +4,24 @@ import glob
 from tokenizers import BertWordPieceTokenizer

 parser = argparse.ArgumentParser()
-parser.add_argument("--files",
+parser.add_argument(
+    "--files",
     default=None,
     metavar="path",
     type=str,
     required=True,
     help="The files to use as training; accept '**/*.txt' type of patterns \
-if enclosed in quotes")
-parser.add_argument("--out",
+if enclosed in quotes",
+)
+parser.add_argument(
+    "--out",
     default="./",
     type=str,
-    help="Path to the output directory, where the files will be saved")
-parser.add_argument("--name",
-    default="bert-wordpiece",
-    type=str,
-    help="The name of the output vocab files")
+    help="Path to the output directory, where the files will be saved",
+)
+parser.add_argument(
+    "--name", default="bert-wordpiece", type=str, help="The name of the output vocab files"
+)
 args = parser.parse_args()

 files = glob.glob(args.files)
@@ -29,11 +32,7 @@ if not files:

 # Initialize an empty tokenizer
 tokenizer = BertWordPieceTokenizer(
-    clean_text=True,
-    handle_chinese_chars=True,
-    strip_accents=True,
-    lowercase=True,
+    clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
 )

 # And then train
@@ -44,7 +43,7 @@ trainer = tokenizer.train(
     show_progress=True,
     special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
     limit_alphabet=1000,
-    wordpieces_prefix="##"
+    wordpieces_prefix="##",
 )

 # Save the files
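The training script touched above follows a simple pattern: build a BertWordPieceTokenizer, call train() on the input files, then save the vocabulary. A condensed sketch of that flow is below; the input path, vocab_size and min_frequency values are assumptions (they are not visible in this hunk), while the constructor arguments, the train() options and the save call mirror what the diff itself shows (including `tokenizer.save(args.out, args.name)` in the next file's hunk header).

    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer(
        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True,
    )
    tokenizer.train(
        ["data/corpus.txt"],  # hypothetical input file
        vocab_size=30000,     # assumption: not shown in this hunk
        min_frequency=2,      # assumption: not shown in this hunk
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )
    tokenizer.save("./", "bert-wordpiece")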
@@ -5,21 +5,24 @@ from os.path import join
 from tokenizers import ByteLevelBPETokenizer

 parser = argparse.ArgumentParser()
-parser.add_argument("--files",
+parser.add_argument(
+    "--files",
     default=None,
     metavar="path",
     type=str,
     required=True,
     help="The files to use as training; accept '**/*.txt' type of patterns \
-if enclosed in quotes")
-parser.add_argument("--out",
+if enclosed in quotes",
+)
+parser.add_argument(
+    "--out",
     default="./",
     type=str,
-    help="Path to the output directory, where the files will be saved")
-parser.add_argument("--name",
-    default="bpe-bytelevel",
-    type=str,
-    help="The name of the output vocab files")
+    help="Path to the output directory, where the files will be saved",
+)
+parser.add_argument(
+    "--name", default="bpe-bytelevel", type=str, help="The name of the output vocab files"
+)
 args = parser.parse_args()

 files = glob.glob(args.files)
@@ -47,7 +50,7 @@ tokenizer.save(args.out, args.name)
 tokenizer = ByteLevelBPETokenizer(
     join(args.out, "{}-vocab.json".format(args.name)),
     join(args.out, "{}-merges.txt".format(args.name)),
-    add_prefix_space=True
+    add_prefix_space=True,
 )

 # Test encoding
@@ -11,5 +11,5 @@ from .implementations import (
     ByteLevelBPETokenizer,
     CharBPETokenizer,
     SentencePieceBPETokenizer,
-    BertWordPieceTokenizer
+    BertWordPieceTokenizer,
 )
@@ -9,7 +9,7 @@ from .implementations import (
     ByteLevelBPETokenizer as ByteLevelBPETokenizer,
     BPETokenizer as BPETokenizer,
     SentencePieceBPETokenizer as SentencePieceBPETokenizer,
-    BertWordPieceTokenizer as BertWordPieceTokenizer
+    BertWordPieceTokenizer as BertWordPieceTokenizer,
 )

 from typing import Optional, Union, List, Tuple
@@ -38,27 +38,22 @@ class Encoding:
     def normalized_str(self) -> IndexableString:
         """ The normalized string """
         pass

     @property
     def original_str(self) -> IndexableString:
         """ The original string """
         pass

     @property
     def ids(self) -> List[int]:
         """ The tokenized ids """
         pass

     @property
     def tokens(self) -> List[str]:
         """ The tokenized strings """
         pass

     @property
     def type_ids(self) -> List[int]:
         """ The type ids """
         pass

     @property
     def offsets(self) -> List[Offsets]:
         """ The offsets.
@@ -67,28 +62,26 @@ class Encoding:
         method on the `original_str`.
         """
         pass

     @property
     def special_tokens_mask(self) -> List[int]:
         """ The special tokens mask """
         pass

     @property
     def attention_mask(self) -> List[int]:
         """ The attention mask """
         pass

     @property
     def overflowing(self) -> Optional[Encoding]:
         """ The overflowing encoding, after truncation """
         pass

-    def pad(self,
-            length: int,
-            pad_id: Optional[int] = 0,
-            pad_type_id: Optional[int] = 0,
-            pad_token: Optional[str] = "[PAD]",
-            direction: Optional[str] = "right"):
+    def pad(
+        self,
+        length: int,
+        pad_id: Optional[int] = 0,
+        pad_type_id: Optional[int] = 0,
+        pad_token: Optional[str] = "[PAD]",
+        direction: Optional[str] = "right",
+    ):
         """ Pad the current Encoding at the given length

         Args:
@@ -108,7 +101,6 @@ class Encoding:
                 The pad token to be used when padding
         """
         pass

     def truncate(self, max_length: int, stride: Optional[int] = 0):
         """ Truncate the current Encoding at the given max_length

@@ -122,7 +114,6 @@ class Encoding:
         """
         pass


 class Tokenizer:
     """ Tokenizer

|
|||||||
Tokenizer
|
Tokenizer
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def model(self) -> Model:
|
def model(self) -> Model:
|
||||||
""" Get the model in use with this Tokenizer """
|
""" Get the model in use with this Tokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@model.setter
|
@model.setter
|
||||||
def model(self, model: models.Model):
|
def model(self, model: models.Model):
|
||||||
""" Change the model to use with this Tokenizer """
|
""" Change the model to use with this Tokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def pre_tokenizer(self) -> Optional[PreTokenizer]:
|
def pre_tokenizer(self) -> Optional[PreTokenizer]:
|
||||||
""" Get the pre-tokenizer in use with this model """
|
""" Get the pre-tokenizer in use with this model """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@pre_tokenizer.setter
|
@pre_tokenizer.setter
|
||||||
def pre_tokenizer(self, pre_tokenizer: pre_tokenizers.PreTokenizer):
|
def pre_tokenizer(self, pre_tokenizer: pre_tokenizers.PreTokenizer):
|
||||||
""" Change the pre tokenizer to use with this Tokenizer """
|
""" Change the pre tokenizer to use with this Tokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def decoder(self) -> Optional[Decoder]:
|
def decoder(self) -> Optional[Decoder]:
|
||||||
""" Get the decoder in use with this model """
|
""" Get the decoder in use with this model """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@decoder.setter
|
@decoder.setter
|
||||||
def decoder(self, decoder: decoders.Decoder):
|
def decoder(self, decoder: decoders.Decoder):
|
||||||
""" Change the decoder to use with this Tokenizer """
|
""" Change the decoder to use with this Tokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def post_processor(self) -> Optional[PostProcessor]:
|
def post_processor(self) -> Optional[PostProcessor]:
|
||||||
""" Get the post-processor in use with this Tokenizer """
|
""" Get the post-processor in use with this Tokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@post_processor.setter
|
@post_processor.setter
|
||||||
def post_processor(self, processor: processors.PostProcessor):
|
def post_processor(self, processor: processors.PostProcessor):
|
||||||
""" Change the post processor to use with this Tokenizer """
|
""" Change the post processor to use with this Tokenizer """
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def normalizer(self) -> Optional[Normalizer]:
|
def normalizer(self) -> Optional[Normalizer]:
|
||||||
""" Get the normalizer in use with this Tokenizer """
|
""" Get the normalizer in use with this Tokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@normalizer.setter
|
@normalizer.setter
|
||||||
def normalizer(self, normalizer: normalizers.Normalizer):
|
def normalizer(self, normalizer: normalizers.Normalizer):
|
||||||
""" Change the normalizer to use with this Tokenizer """
|
""" Change the normalizer to use with this Tokenizer """
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
||||||
"""
|
"""
|
||||||
Return the number of special tokens that would be added for single/pair sentences.
|
Return the number of special tokens that would be added for single/pair sentences.
|
||||||
@ -208,8 +187,6 @@ class Tokenizer:
|
|||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def get_vocab_size(self, with_added_tokens: Optional[bool]) -> int:
|
def get_vocab_size(self, with_added_tokens: Optional[bool]) -> int:
|
||||||
""" Returns the size of the vocabulary
|
""" Returns the size of the vocabulary
|
||||||
|
|
||||||
@ -218,11 +195,7 @@ class Tokenizer:
|
|||||||
Whether to include the added tokens in the vocabulary's size
|
Whether to include the added tokens in the vocabulary's size
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
|
||||||
def enable_truncation(self,
|
|
||||||
max_length: int,
|
|
||||||
stride: Optional[int],
|
|
||||||
strategy: Optional[str]):
|
|
||||||
""" Enable the truncation
|
""" Enable the truncation
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -237,17 +210,17 @@ class Tokenizer:
|
|||||||
Can be one of `longest_first`, `only_first` or `only_second`
|
Can be one of `longest_first`, `only_first` or `only_second`
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def no_truncation(self):
|
def no_truncation(self):
|
||||||
""" Disable truncation """
|
""" Disable truncation """
|
||||||
pass
|
pass
|
||||||
|
def enable_padding(
|
||||||
def enable_padding(self,
|
self,
|
||||||
direction: Optional[str] = "right",
|
direction: Optional[str] = "right",
|
||||||
pad_id: Optional[int] = 0,
|
pad_id: Optional[int] = 0,
|
||||||
pad_type_id: Optional[int] = 0,
|
pad_type_id: Optional[int] = 0,
|
||||||
pad_token: Optional[str] = "[PAD]",
|
pad_token: Optional[str] = "[PAD]",
|
||||||
max_length: Optional[int] = None):
|
max_length: Optional[int] = None,
|
||||||
|
):
|
||||||
""" Enable the padding
|
""" Enable the padding
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -268,11 +241,9 @@ class Tokenizer:
|
|||||||
we pad using the size of the longest sequence in a batch
|
we pad using the size of the longest sequence in a batch
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def no_padding(self):
|
def no_padding(self):
|
||||||
""" Disable padding """
|
""" Disable padding """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
|
def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
|
||||||
""" Encode the given sequence
|
""" Encode the given sequence
|
||||||
|
|
||||||
@ -287,7 +258,6 @@ class Tokenizer:
|
|||||||
An Encoding
|
An Encoding
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
|
def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
|
||||||
""" Encode the given sequences or pair of sequences
|
""" Encode the given sequences or pair of sequences
|
||||||
|
|
||||||
@ -300,7 +270,6 @@ class Tokenizer:
|
|||||||
A list of Encoding
|
A list of Encoding
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
|
||||||
""" Decode the given list of ids to a string sequence
|
""" Decode the given list of ids to a string sequence
|
||||||
|
|
||||||
@ -315,10 +284,9 @@ class Tokenizer:
|
|||||||
The decoded string
|
The decoded string
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
def decode_batch(
|
||||||
def decode_batch(self,
|
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
|
||||||
sequences: List[List[int]],
|
) -> str:
|
||||||
skip_special_tokens: Optional[bool] = True) -> str:
|
|
||||||
""" Decode the list of sequences to a list of string sequences
|
""" Decode the list of sequences to a list of string sequences
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -332,7 +300,6 @@ class Tokenizer:
|
|||||||
A list of decoded strings
|
A list of decoded strings
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def token_to_id(self, token: str) -> Optional[int]:
|
def token_to_id(self, token: str) -> Optional[int]:
|
||||||
""" Convert the given token to its corresponding id
|
""" Convert the given token to its corresponding id
|
||||||
|
|
||||||
@ -344,7 +311,6 @@ class Tokenizer:
|
|||||||
The corresponding id if it exists, None otherwise
|
The corresponding id if it exists, None otherwise
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def id_to_token(self, id: int) -> Optional[str]:
|
def id_to_token(self, id: int) -> Optional[str]:
|
||||||
""" Convert the given token id to its corresponding string
|
""" Convert the given token id to its corresponding string
|
||||||
|
|
||||||
@ -356,7 +322,6 @@ class Tokenizer:
|
|||||||
The corresponding string if it exists, None otherwise
|
The corresponding string if it exists, None otherwise
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
|
def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
|
||||||
""" Add the given tokens to the vocabulary
|
""" Add the given tokens to the vocabulary
|
||||||
|
|
||||||
@ -371,7 +336,6 @@ class Tokenizer:
|
|||||||
The number of tokens that were added to the vocabulary
|
The number of tokens that were added to the vocabulary
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def add_special_tokens(self, tokens: List[str]) -> int:
|
def add_special_tokens(self, tokens: List[str]) -> int:
|
||||||
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
|
""" Add the given special tokens to the vocabulary, and treat them as special tokens.
|
||||||
|
|
||||||
|
@@ -2,8 +2,8 @@ from .. import Tokenizer, Encoding

 from typing import List, Union, Tuple, Optional


 class BaseTokenizer:
     def __init__(self, tokenizer: Tokenizer, parameters=None):
         self._tokenizer = tokenizer
         self._parameters = parameters if parameters is not None else {}
|
|||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "Tokenizer(vocabulary_size={}, {})".format(
|
return "Tokenizer(vocabulary_size={}, {})".format(
|
||||||
self._tokenizer.get_vocab_size(),
|
self._tokenizer.get_vocab_size(),
|
||||||
', '.join(k + '=' + str(v) for k, v in self._parameters.items()))
|
", ".join(k + "=" + str(v) for k, v in self._parameters.items()),
|
||||||
|
)
|
||||||
|
|
||||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
def num_special_tokens_to_add(self, is_pair: bool) -> int:
|
||||||
"""
|
"""
|
||||||
@ -33,12 +34,14 @@ class BaseTokenizer:
|
|||||||
"""
|
"""
|
||||||
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
|
return self._tokenizer.get_vocab_size(with_added_tokens=with_added_tokens)
|
||||||
|
|
||||||
def enable_padding(self,
|
def enable_padding(
|
||||||
|
self,
|
||||||
direction: Optional[str] = "right",
|
direction: Optional[str] = "right",
|
||||||
pad_id: Optional[int] = 0,
|
pad_id: Optional[int] = 0,
|
||||||
pad_type_id: Optional[int] = 0,
|
pad_type_id: Optional[int] = 0,
|
||||||
pad_token: Optional[str] = "[PAD]",
|
pad_token: Optional[str] = "[PAD]",
|
||||||
max_length: Optional[int] = None):
|
max_length: Optional[int] = None,
|
||||||
|
):
|
||||||
""" Change the padding strategy
|
""" Change the padding strategy
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -58,20 +61,21 @@ class BaseTokenizer:
|
|||||||
If specified, the length at which to pad. If not specified
|
If specified, the length at which to pad. If not specified
|
||||||
we pad using the size of the longest sequence in a batch
|
we pad using the size of the longest sequence in a batch
|
||||||
"""
|
"""
|
||||||
return self._tokenizer.enable_padding(direction=direction,
|
return self._tokenizer.enable_padding(
|
||||||
|
direction=direction,
|
||||||
pad_id=pad_id,
|
pad_id=pad_id,
|
||||||
pad_type_id=pad_type_id,
|
pad_type_id=pad_type_id,
|
||||||
pad_token=pad_token,
|
pad_token=pad_token,
|
||||||
max_length=max_length)
|
max_length=max_length,
|
||||||
|
)
|
||||||
|
|
||||||
def no_padding(self):
|
def no_padding(self):
|
||||||
""" Disable padding """
|
""" Disable padding """
|
||||||
return self._tokenizer.no_padding()
|
return self._tokenizer.no_padding()
|
||||||
|
|
||||||
def enable_truncation(self,
|
def enable_truncation(
|
||||||
max_length: int,
|
self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
|
||||||
stride: Optional[int]=0,
|
):
|
||||||
strategy: Optional[str]='longest_first'):
|
|
||||||
""" Change the truncation options
|
""" Change the truncation options
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@ -85,9 +89,7 @@ class BaseTokenizer:
|
|||||||
strategy: (`optional) str:
|
strategy: (`optional) str:
|
||||||
Can be one of `longest_first`, `only_first` or `only_second`
|
Can be one of `longest_first`, `only_first` or `only_second`
|
||||||
"""
|
"""
|
||||||
return self._tokenizer.enable_truncation(max_length,
|
return self._tokenizer.enable_truncation(max_length, stride=stride, strategy=strategy)
|
||||||
stride=stride,
|
|
||||||
strategy=strategy)
|
|
||||||
|
|
||||||
def no_truncation(self):
|
def no_truncation(self):
|
||||||
""" Disable truncation """
|
""" Disable truncation """
|
||||||
@ -166,9 +168,9 @@ class BaseTokenizer:
|
|||||||
"""
|
"""
|
||||||
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
|
return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
|
||||||
|
|
||||||
def decode_batch(self,
|
def decode_batch(
|
||||||
sequences: List[List[int]],
|
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
|
||||||
skip_special_tokens: Optional[bool] = True) -> str:
|
) -> str:
|
||||||
""" Decode the list of sequences to a list of string sequences
|
""" Decode the list of sequences to a list of string sequences
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
@@ -5,29 +5,34 @@ from .base_tokenizer import BaseTokenizer

 from typing import Optional, List, Union


 class ByteLevelBPETokenizer(BaseTokenizer):
     """ ByteLevelBPETokenizer

     Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
     """

-    def __init__(self,
-                 vocab_file: Optional[str]=None,
-                 merges_file: Optional[str]=None,
-                 add_prefix_space: bool=False,
-                 lowercase: bool=False,
-                 dropout: Optional[float]=None,
-                 unicode_normalizer: Optional[str]=None,
-                 continuing_subword_prefix: Optional[str]=None,
-                 end_of_word_suffix: Optional[str]=None
-                 ):
+    def __init__(
+        self,
+        vocab_file: Optional[str] = None,
+        merges_file: Optional[str] = None,
+        add_prefix_space: bool = False,
+        lowercase: bool = False,
+        dropout: Optional[float] = None,
+        unicode_normalizer: Optional[str] = None,
+        continuing_subword_prefix: Optional[str] = None,
+        end_of_word_suffix: Optional[str] = None,
+    ):
         if vocab_file is not None and merges_file is not None:
-            tokenizer = Tokenizer(BPE.from_files(
-                vocab_file, merges_file,
-                dropout=dropout,
-                continuing_subword_prefix=continuing_subword_prefix or "",
-                end_of_word_suffix=end_of_word_suffix or "",
-            ))
+            tokenizer = Tokenizer(
+                BPE.from_files(
+                    vocab_file,
+                    merges_file,
+                    dropout=dropout,
+                    continuing_subword_prefix=continuing_subword_prefix or "",
+                    end_of_word_suffix=end_of_word_suffix or "",
+                )
+            )
         else:
             tokenizer = Tokenizer(BPE.empty())

|
|||||||
else:
|
else:
|
||||||
tokenizer.normalizer = normalizers[0]
|
tokenizer.normalizer = normalizers[0]
|
||||||
|
|
||||||
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
|
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
|
||||||
add_prefix_space=add_prefix_space
|
|
||||||
)
|
|
||||||
tokenizer.decoder = decoders.ByteLevel()
|
tokenizer.decoder = decoders.ByteLevel()
|
||||||
|
|
||||||
parameters = {
|
parameters = {
|
||||||
|
@@ -12,14 +12,16 @@ class CharBPETokenizer(BaseTokenizer):
     Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
     """

-    def __init__(self,
-                 vocab_file: Optional[str]=None,
-                 merges_file: Optional[str]=None,
-                 unk_token: Optional[str]="<unk>",
-                 suffix: Optional[str]="</w>",
-                 dropout: Optional[float]=None,
-                 lowercase: bool = False,
-                 unicode_normalizer: Optional[str] = None):
+    def __init__(
+        self,
+        vocab_file: Optional[str] = None,
+        merges_file: Optional[str] = None,
+        unk_token: Optional[str] = "<unk>",
+        suffix: Optional[str] = "</w>",
+        dropout: Optional[float] = None,
+        lowercase: bool = False,
+        unicode_normalizer: Optional[str] = None,
+    ):
         if vocab_file is not None and merges_file is not None:
             tokenizer = Tokenizer(
                 BPE.from_files(
|
|||||||
merges_file,
|
merges_file,
|
||||||
dropout=dropout,
|
dropout=dropout,
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
end_of_word_suffix=suffix
|
end_of_word_suffix=suffix,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
tokenizer = Tokenizer(BPE.empty())
|
tokenizer = Tokenizer(BPE.empty())
|
||||||
|
|
||||||
tokenizer.add_special_tokens([ unk_token ])
|
tokenizer.add_special_tokens([unk_token])
|
||||||
|
|
||||||
# Check for Unicode normalization first (before everything else)
|
# Check for Unicode normalization first (before everything else)
|
||||||
normalizers = []
|
normalizers = []
|
||||||
|
@@ -5,29 +5,30 @@ from .base_tokenizer import BaseTokenizer

 from typing import Optional, List, Union


 class SentencePieceBPETokenizer(BaseTokenizer):
     """ SentencePiece BPE Tokenizer

     Represents the BPE algorithm, with the pretokenization used by SentencePiece
     """

-    def __init__(self,
-                 vocab_file: Optional[str]=None,
-                 merges_file: Optional[str]=None,
-                 unk_token: str="<unk>",
-                 replacement: str="▁",
-                 add_prefix_space: bool=True,
-                 dropout: Optional[float]=None):
+    def __init__(
+        self,
+        vocab_file: Optional[str] = None,
+        merges_file: Optional[str] = None,
+        unk_token: str = "<unk>",
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
+        dropout: Optional[float] = None,
+    ):
         if vocab_file is not None and merges_file is not None:
-            tokenizer = Tokenizer(BPE.from_files(vocab_file,
-                                                 merges_file,
-                                                 dropout=dropout,
-                                                 unk_token=unk_token))
+            tokenizer = Tokenizer(
+                BPE.from_files(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
+            )
         else:
             tokenizer = Tokenizer(BPE.empty())

-        tokenizer.add_special_tokens([ unk_token ])
+        tokenizer.add_special_tokens([unk_token])


         tokenizer.normalizer = NFKC()
         tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
@@ -16,18 +16,19 @@ class Model:
     """
     pass


 class BPE(Model):
     """ BytePairEncoding model class """

     @staticmethod
-    def from_files(vocab: str,
-                   merges: str,
-                   cache_capacity: Optional[int],
-                   dropout: Optional[float],
-                   unk_token: Optional[str],
-                   continuing_subword_prefix: Optional[str],
-                   end_of_word_suffix: Optional[str]) -> Model:
+    def from_files(
+        vocab: str,
+        merges: str,
+        cache_capacity: Optional[int],
+        dropout: Optional[float],
+        unk_token: Optional[str],
+        continuing_subword_prefix: Optional[str],
+        end_of_word_suffix: Optional[str],
+    ) -> Model:
         """ Instantiate a BPE Model from the given vocab and merges files.

         Args:
@@ -55,20 +56,18 @@ class BPE(Model):
                 The suffix to attach to subword units that represent an end of word.
         """
         pass

     @staticmethod
     def empty() -> Model:
         """ Instantiate an empty BPE Model. """
         pass


 class WordPiece(Model):
     """ WordPiece model class """

     @staticmethod
-    def from_files(vocab: str,
-                   unk_token: Optional[str],
-                   max_input_chars_per_word: Optional[int]) -> Model:
+    def from_files(
+        vocab: str, unk_token: Optional[str], max_input_chars_per_word: Optional[int]
+    ) -> Model:
         """ Instantiate a WordPiece Model from the given vocab file.

         Args:
@@ -82,13 +81,11 @@ class WordPiece(Model):
                 The maximum number of characters to authorize in a single word.
         """
         pass

     @staticmethod
     def empty() -> Model:
         """ Instantiate an empty WordPiece Model. """
         pass


 class WordLevel(Model):
     """
     Most simple tokenizer model based on mapping token from a vocab file to their corresponding id.
@@ -11,19 +11,15 @@ Lowercase = normalizers.Lowercase
 Strip = normalizers.Strip


-NORMALIZERS = {
-    "nfc": NFC,
-    "nfd": NFD,
-    "nfkc": NFKC,
-    "nfkd": NFKD
-}
+NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}


 def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
     if normalizer not in NORMALIZERS:
         raise ValueError(
-            "{} is not a known unicode normalizer. Available are {}"
-            .format(normalizer, NORMALIZERS.keys())
+            "{} is not a known unicode normalizer. Available are {}".format(
+                normalizer, NORMALIZERS.keys()
+            )
         )

     return NORMALIZERS[normalizer]()
@@ -98,7 +98,6 @@ class Strip(Normalizer):
     def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
         pass


 def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
     """
     Instanciate unicode normalizer from the normalizer name
@@ -33,7 +33,6 @@ class ByteLevel(PreTokenizer):
             PreTokenizer
         """
         pass

     @staticmethod
     def alphabet() -> List[str]:
         """ Returns the alphabet used by this PreTokenizer.
@@ -96,7 +95,6 @@ class Metaspace(PreTokenizer):
         """
         pass


 class CharDelimiterSplit(PreTokenizer):
     """ CharDelimiterSplit PreTokenizer
