mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Add fuse_unk option to SentencePieceBPETokenizer (#574)
* Add fuse_unk option to SentencePieceBPETokenizer
* Fix style

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
@@ -20,9 +20,12 @@ class SentencePieceBPETokenizer(BaseTokenizer):
|
||||
replacement: str = "▁",
|
||||
add_prefix_space: bool = True,
|
||||
dropout: Optional[float] = None,
|
||||
fuse_unk: Optional[bool] = False,
|
||||
):
|
||||
if vocab is not None and merges is not None:
|
||||
tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
|
||||
tokenizer = Tokenizer(
|
||||
BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk)
|
||||
)
|
||||
else:
|
||||
tokenizer = Tokenizer(BPE())
|
||||
|
||||
|
Reference in New Issue
Block a user