Mirror of https://github.com/mii443/tokenizers.git
[FIX] In SentencePieceBPETokenizer, when vocab or merges is None, unk_token cannot be used (#1120)
* [fix] Use unk_token in SentencePieceBPETokenizer: when vocab or merges is None, unk_token could not be used.
* [fix] Also handle the case where unk_token itself is None.
* Update bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
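To illustrate the reported behavior, a minimal sketch (not part of the commit), assuming the public tokenizers Python bindings and their train_from_iterator helper; the toy corpus and token strings are placeholders:

from tokenizers import SentencePieceBPETokenizer

# The usual state before training: no vocab or merges yet.
tok = SentencePieceBPETokenizer(unk_token="<unk>")

# train_from_iterator registers "<unk>" as a special token by default.
tok.train_from_iterator(["a toy corpus", "another toy line"], vocab_size=100)

# With the fix, the underlying BPE model carries unk_token, so characters
# never seen during training encode to "<unk>" instead of being dropped.
print(tok.encode("corpus ☃").tokens)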
@@ -26,7 +26,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         if vocab is not None and merges is not None:
             tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
         else:
-            tokenizer = Tokenizer(BPE())
+            tokenizer = Tokenizer(BPE(dropout=dropout, unk_token=unk_token, fuse_unk=fuse_unk))
 
         if tokenizer.token_to_id(str(unk_token)) is not None:
             tokenizer.add_special_tokens([str(unk_token)])
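For context, a minimal sketch (an illustration, not from the commit) of what the one-line change does; the variable names are placeholders, while the BPE keyword arguments are the ones visible in the diff above:

from tokenizers import Tokenizer
from tokenizers.models import BPE

# Before the fix: with vocab/merges absent, the model was built as a bare
# BPE(), discarding the caller's unk_token, dropout, and fuse_unk settings.
before = Tokenizer(BPE())

# After the fix: the settings are forwarded even without vocab/merges
# (the usual case before training), so unk_token survives into the model.
after = Tokenizer(BPE(dropout=None, unk_token="<unk>", fuse_unk=False))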