Add fuse_unk option to SentencePieceBPETokenizer (#574)

* Add fuse_unk option to SentencePieceBPETokenizer

* Fix style

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
devfon
2021-01-13 06:07:59 +09:00
committed by GitHub
parent 91dae1de15
commit b9c6bea75e

View File

@@ -20,9 +20,12 @@ class SentencePieceBPETokenizer(BaseTokenizer):
replacement: str = "",
add_prefix_space: bool = True,
dropout: Optional[float] = None,
fuse_unk: Optional[bool] = False,
):
if vocab is not None and merges is not None:
tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
tokenizer = Tokenizer(
BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk)
)
else:
tokenizer = Tokenizer(BPE())