Mirror of https://github.com/mii443/tokenizers.git, synced 2025-09-03 07:49:22 +00:00
better repr for tokenizers
@@ -5,8 +5,14 @@ from typing import List, Union, Tuple, Optional
 class BaseTokenizer:
     _tokenizer: Tokenizer
 
-    def __init__(self, tokenizer: Tokenizer):
+    def __init__(self, tokenizer: Tokenizer, parameters=None):
         self._tokenizer = tokenizer
+        self._parameters = parameters if parameters is not None else {}
+
+    def __repr__(self):
+        return "Tokenizer(vocabulary_size={}, {})".format(
+            self._tokenizer.get_vocab_size(),
+            ', '.join(k + ': ' + str(v) for k, v in self._parameters.items()))
 
     def with_padding(self,
                      direction: Optional[str] = "right",
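
The new __repr__ is built entirely from get_vocab_size() and the parameters dict handed to BaseTokenizer.__init__. Below is a minimal standalone sketch of that formatting logic; _StubTokenizer and its vocab size of 1000 are made-up stand-ins for the real Rust-backed Tokenizer, used only so the snippet runs on its own.

# Standalone sketch of the repr logic added above.
# _StubTokenizer is a hypothetical stand-in for the Rust-backed Tokenizer.
class _StubTokenizer:
    def get_vocab_size(self):
        return 1000  # arbitrary illustrative value

class BaseTokenizerSketch:
    def __init__(self, tokenizer, parameters=None):
        self._tokenizer = tokenizer
        self._parameters = parameters if parameters is not None else {}

    def __repr__(self):
        return "Tokenizer(vocabulary_size={}, {})".format(
            self._tokenizer.get_vocab_size(),
            ', '.join(k + ': ' + str(v) for k, v in self._parameters.items()))

print(repr(BaseTokenizerSketch(_StubTokenizer(), {"model": "BPE", "dropout": None})))
# Tokenizer(vocabulary_size=1000, model: BPE, dropout: None)
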
@@ -20,7 +20,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
                  handle_chinese_chars: bool=True,
                  strip_accents: bool=True,
                  lowercase: bool=True,
-                 prefix: str="##"):
+                 wordpieces_prefix: str="##"):
 
         if vocab_file is not None:
             tokenizer = Tokenizer(WordPiece.from_files(vocab_file, unk_token=unk_token))
         else:
@@ -44,7 +45,19 @@ class BertWordPieceTokenizer(BaseTokenizer):
             (sep_token, sep_token_id),
             (cls_token, cls_token_id)
         )
-        tokenizer.decoders = decoders.WordPiece.new(prefix=prefix)
+        tokenizer.decoders = decoders.WordPiece.new(prefix=wordpieces_prefix)
 
-        super().__init__(tokenizer)
+        parameters = {
+            "model": "BertWordPiece",
+            "add_special_tokens": add_special_tokens,
+            "unk_token": unk_token,
+            "sep_token": sep_token,
+            "cls_token": cls_token,
+            "clean_text": clean_text,
+            "handle_chinese_chars": handle_chinese_chars,
+            "strip_accents": strip_accents,
+            "lowercase": lowercase,
+            "wordpieces_prefix": wordpieces_prefix,
+        }
+
+        super().__init__(tokenizer, parameters)
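
With BertWordPieceTokenizer now forwarding its full configuration, printing the object in a REPL exposes every constructor argument. A hedged usage example follows: "vocab.txt" is a placeholder path, and the top-level import plus the special-token values ([UNK]/[SEP]/[CLS], add_special_tokens=True, clean_text=True) are assumptions based on the library's usual defaults rather than anything shown in this diff.

# Hypothetical usage; "vocab.txt" is a placeholder and the shown values
# assume default constructor arguments, not values taken from this diff.
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer("vocab.txt")
print(repr(tokenizer))
# One line in practice, wrapped here for readability:
# Tokenizer(vocabulary_size=<vocab size>, model: BertWordPiece,
#           add_special_tokens: True, unk_token: [UNK], sep_token: [SEP],
#           cls_token: [CLS], clean_text: True, handle_chinese_chars: True,
#           strip_accents: True, lowercase: True, wordpieces_prefix: ##)
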
@@ -33,4 +33,11 @@ class BPETokenizer(BaseTokenizer):
         tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
         tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
 
-        super().__init__(tokenizer)
+        parameters = {
+            "model": "BPE",
+            "unk_token": unk_token,
+            "suffix": suffix,
+            "dropout": dropout,
+        }
+
+        super().__init__(tokenizer, parameters)
@@ -24,4 +24,9 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
         tokenizer.decoder = decoders.ByteLevel.new()
 
-        super().__init__(tokenizer)
+        parameters = {
+            "model": "ByteLevelBPE",
+            "add_prefix_space": add_prefix_space,
+        }
+
+        super().__init__(tokenizer, parameters)
@@ -32,4 +32,12 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         tokenizer.decoder = decoders.Metaspace.new(replacement=replacement,
                                                    add_prefix_space=add_prefix_space)
 
-        super().__init__(tokenizer)
+        parameters = {
+            "model": "SentencePieceBPE",
+            "unk_token": unk_token,
+            "replacement": replacement,
+            "add_prefix_space": add_prefix_space,
+            "dropout": dropout,
+        }
+
+        super().__init__(tokenizer, parameters)
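
All of the wrappers follow the same recipe: collect the constructor arguments into a parameters dict, lead it with a "model" key, and pass it up to BaseTokenizer. Reusing the standalone sketch from earlier, the resulting strings differ only in their parameter lists; the values below are arbitrary illustrative choices, not necessarily the constructors' defaults.

# Reuses BaseTokenizerSketch and _StubTokenizer from the sketch above;
# parameter values are arbitrary, picked only to show the output shape.
for params in (
    {"model": "ByteLevelBPE", "add_prefix_space": True},
    {"model": "SentencePieceBPE", "unk_token": "<unk>", "replacement": "▁",
     "add_prefix_space": True, "dropout": None},
):
    print(repr(BaseTokenizerSketch(_StubTokenizer(), params)))
# Tokenizer(vocabulary_size=1000, model: ByteLevelBPE, add_prefix_space: True)
# Tokenizer(vocabulary_size=1000, model: SentencePieceBPE, unk_token: <unk>, replacement: ▁, add_prefix_space: True, dropout: None)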