better repr for tokenizers

thomwolf
2020-01-08 12:06:46 +01:00
parent 111c2d152c
commit 882df9b8e2
5 changed files with 46 additions and 7 deletions
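
Each tokenizer implementation now collects its constructor arguments into a parameters dict and forwards it to BaseTokenizer, which combines it with the backing model's vocabulary size to produce a readable __repr__. Along the way, BertWordPieceTokenizer's prefix argument is renamed to wordpieces_prefix.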


@@ -5,8 +5,14 @@ from typing import List, Union, Tuple, Optional
 class BaseTokenizer:
     _tokenizer: Tokenizer

-    def __init__(self, tokenizer: Tokenizer):
+    def __init__(self, tokenizer: Tokenizer, parameters=None):
         self._tokenizer = tokenizer
+        self._parameters = parameters if parameters is not None else {}
+
+    def __repr__(self):
+        return "Tokenizer(vocabulary_size={}, {})".format(
+            self._tokenizer.get_vocab_size(),
+            ', '.join(k + ': ' + str(v) for k, v in self._parameters.items()))

     def with_padding(self,
                      direction: Optional[str] = "right",
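
With this change, printing any of the implementation classes shows the vocabulary size together with the recorded parameters. A minimal usage sketch, assuming the class is importable from the top-level tokenizers package as in the released bindings (the vocab file path, the size 30522, and the default values shown are illustrative, not taken from this commit):

    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt")
    print(tokenizer)
    # Tokenizer(vocabulary_size=30522, model: BertWordPiece, add_special_tokens: True,
    # unk_token: [UNK], sep_token: [SEP], cls_token: [CLS], ...)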


@@ -20,7 +20,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
                  handle_chinese_chars: bool=True,
                  strip_accents: bool=True,
                  lowercase: bool=True,
-                 prefix: str="##"):
+                 wordpieces_prefix: str="##"):
+
         if vocab_file is not None:
             tokenizer = Tokenizer(WordPiece.from_files(vocab_file, unk_token=unk_token))
         else:
@@ -44,7 +45,19 @@ class BertWordPieceTokenizer(BaseTokenizer):
             (sep_token, sep_token_id),
             (cls_token, cls_token_id)
         )
-        tokenizer.decoders = decoders.WordPiece.new(prefix=prefix)
+        tokenizer.decoders = decoders.WordPiece.new(prefix=wordpieces_prefix)

-        super().__init__(tokenizer)
+        parameters = {
+            "model": "BertWordPiece",
+            "add_special_tokens": add_special_tokens,
+            "unk_token": unk_token,
+            "sep_token": sep_token,
+            "cls_token": cls_token,
+            "clean_text": clean_text,
+            "handle_chinese_chars": handle_chinese_chars,
+            "strip_accents": strip_accents,
+            "lowercase": lowercase,
+            "wordpieces_prefix": wordpieces_prefix,
+        }
+        super().__init__(tokenizer, parameters)
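
Since the first hunk above renames the prefix keyword argument to wordpieces_prefix, callers passing it by name need to update. A hypothetical before/after, where "vocab.txt" is a placeholder path:

    # before this commit
    tokenizer = BertWordPieceTokenizer("vocab.txt", prefix="##")
    # after this commit
    tokenizer = BertWordPieceTokenizer("vocab.txt", wordpieces_prefix="##")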


@@ -33,4 +33,11 @@ class BPETokenizer(BaseTokenizer):
         tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
         tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

-        super().__init__(tokenizer)
+        parameters = {
+            "model": "BPE",
+            "unk_token": unk_token,
+            "suffix": suffix,
+            "dropout": dropout,
+        }
+
+        super().__init__(tokenizer, parameters)


@@ -24,4 +24,9 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
         tokenizer.decoder = decoders.ByteLevel.new()

-        super().__init__(tokenizer)
+        parameters = {
+            "model": "ByteLevelBPE",
+            "add_prefix_space": add_prefix_space,
+        }
+
+        super().__init__(tokenizer, parameters)


@@ -32,4 +32,12 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         tokenizer.decoder = decoders.Metaspace.new(replacement=replacement,
                                                    add_prefix_space=add_prefix_space)

-        super().__init__(tokenizer)
+        parameters = {
+            "model": "SentencePieceBPE",
+            "unk_token": unk_token,
+            "replacement": replacement,
+            "add_prefix_space": add_prefix_space,
+            "dropout": dropout,
+        }
+
+        super().__init__(tokenizer, parameters)
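
Taken together, the pattern is the same in all five files: each subclass records what it was constructed with and hands it up to the base class. A self-contained, pure-Python sketch of that pattern, where DummyTokenizer stands in for the Rust-backed Tokenizer and is not part of the library:

    class DummyTokenizer:
        # Stand-in for the Rust-backed Tokenizer; __repr__ only needs get_vocab_size().
        def get_vocab_size(self):
            return 30000

    class BaseTokenizer:
        def __init__(self, tokenizer, parameters=None):
            self._tokenizer = tokenizer
            self._parameters = parameters if parameters is not None else {}

        def __repr__(self):
            return "Tokenizer(vocabulary_size={}, {})".format(
                self._tokenizer.get_vocab_size(),
                ', '.join(k + ': ' + str(v) for k, v in self._parameters.items()))

    class ByteLevelBPETokenizer(BaseTokenizer):
        def __init__(self, add_prefix_space=False):
            # Record the constructor arguments so the base repr can show them.
            parameters = {
                "model": "ByteLevelBPE",
                "add_prefix_space": add_prefix_space,
            }
            super().__init__(DummyTokenizer(), parameters)

    print(ByteLevelBPETokenizer())
    # Tokenizer(vocabulary_size=30000, model: ByteLevelBPE, add_prefix_space: False)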