Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-31 12:39:21 +00:00)
Add fuse_unk option to SentencePieceBPETokenizer (#574)
* Add fuse_unk option to SentencePieceBPETokenizer
* Fix style

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
@@ -20,9 +20,12 @@ class SentencePieceBPETokenizer(BaseTokenizer):
|
|||||||
replacement: str = "▁",
|
replacement: str = "▁",
|
||||||
add_prefix_space: bool = True,
|
add_prefix_space: bool = True,
|
||||||
dropout: Optional[float] = None,
|
dropout: Optional[float] = None,
|
||||||
|
fuse_unk: Optional[bool] = False,
|
||||||
):
|
):
|
||||||
if vocab is not None and merges is not None:
|
if vocab is not None and merges is not None:
|
||||||
tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
|
tokenizer = Tokenizer(
|
||||||
|
BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk)
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
tokenizer = Tokenizer(BPE())
|
tokenizer = Tokenizer(BPE())
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user