Python - remove add_special_tokens from BertWordPieceTokenizer init
@@ -14,7 +14,6 @@ class BertWordPieceTokenizer(BaseTokenizer):
     def __init__(
         self,
         vocab_file: Optional[str] = None,
-        add_special_tokens: bool = True,
         unk_token: Union[str, AddedToken] = "[UNK]",
         sep_token: Union[str, AddedToken] = "[SEP]",
         cls_token: Union[str, AddedToken] = "[CLS]",
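With this change, special-token handling is no longer configured at construction time. A minimal usage sketch of the updated constructor, assuming a local BERT WordPiece vocab file (the path below is a placeholder, not part of this commit):

    from tokenizers import BertWordPieceTokenizer

    # "bert-base-uncased-vocab.txt" is a placeholder path; any BERT
    # WordPiece vocab file works. add_special_tokens is gone from __init__.
    tokenizer = BertWordPieceTokenizer(
        vocab_file="bert-base-uncased-vocab.txt",
        unk_token="[UNK]",
        sep_token="[SEP]",
        cls_token="[CLS]",
    )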
@@ -52,7 +51,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
         )
         tokenizer.pre_tokenizer = BertPreTokenizer()

-        if add_special_tokens and vocab_file is not None:
+        if vocab_file is not None:
             sep_token_id = tokenizer.token_to_id(str(sep_token))
             if sep_token_id is None:
                 raise TypeError("sep_token not found in the vocabulary")
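Since the [CLS]/[SEP] post-processor is now wired up whenever a vocab file is supplied, every encoding gets special tokens by default rather than behind a flag. A hedged sketch of the resulting behavior (placeholder vocab path; Encoding.tokens as exposed by the Python binding):

    tok = BertWordPieceTokenizer(vocab_file="bert-base-uncased-vocab.txt")
    enc = tok.encode("hello world")
    print(enc.tokens)
    # Expected with a standard BERT vocab:
    # ['[CLS]', 'hello', 'world', '[SEP]']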
@@ -67,7 +66,6 @@ class BertWordPieceTokenizer(BaseTokenizer):

         parameters = {
             "model": "BertWordPiece",
-            "add_special_tokens": add_special_tokens,
             "unk_token": unk_token,
             "sep_token": sep_token,
             "cls_token": cls_token,
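One migration note: call sites that still pass the removed keyword now fail at construction with a plain Python TypeError, since __init__ no longer declares it. An illustrative check (placeholder vocab path):

    try:
        BertWordPieceTokenizer(vocab_file="vocab.txt", add_special_tokens=True)
    except TypeError as err:
        # __init__ rejects the now-unknown keyword argument.
        print(err)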