From 9b155b572398a80f3e3af2b4fa6ff34f3c79dd63 Mon Sep 17 00:00:00 2001
From: SeongBeomLEE <2712qwer@naver.com>
Date: Tue, 27 Dec 2022 19:13:52 +0900
Subject: [PATCH] [FIX] In CharBPETokenizer, when Vocab or merges is None,
 unk_token cannot be used. (#1136)

* [fix] Use unk_token

In SentencePieceBPETokenizer, when Vocab or merges is None, unk_token cannot be used.

* [fix] If unk_token is None, this case is also considered.

* Update bindings/python/py_src/tokenizers/implementations/sentencepiece_bpe.py

Co-authored-by: Nicolas Patry

* [FIX] In CharBPETokenizer, Use unk_token.

In CharBPETokenizer, when Vocab or merges is None, unk_token cannot be used.

* Update bindings/python/py_src/tokenizers/implementations/char_level_bpe.py

Co-authored-by: Nicolas Patry

* Update bindings/python/py_src/tokenizers/implementations/char_level_bpe.py

Co-authored-by: Nicolas Patry

Co-authored-by: Nicolas Patry
---
 .../python/py_src/tokenizers/implementations/char_level_bpe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bindings/python/py_src/tokenizers/implementations/char_level_bpe.py b/bindings/python/py_src/tokenizers/implementations/char_level_bpe.py
index 864892de..29ca5977 100644
--- a/bindings/python/py_src/tokenizers/implementations/char_level_bpe.py
+++ b/bindings/python/py_src/tokenizers/implementations/char_level_bpe.py
@@ -45,7 +45,7 @@ class CharBPETokenizer(BaseTokenizer):
                 )
             )
         else:
-            tokenizer = Tokenizer(BPE())
+            tokenizer = Tokenizer(BPE(unk_token=str(unk_token), dropout=dropout, end_of_word_suffix=suffix))
 
         if tokenizer.token_to_id(str(unk_token)) is not None:
             tokenizer.add_special_tokens([str(unk_token)])
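
For context, a minimal sketch of the code path this patch touches, assuming the `tokenizers` Python bindings are installed; the training corpus, vocab_size, and the "héllo" probe string below are illustrative only, not part of the patch:

# Sketch of the fixed else-branch behavior (assumption: a tokenizers
# release containing this patch). Before the fix, constructing
# CharBPETokenizer without vocab/merges built Tokenizer(BPE()), so the
# unk_token, dropout and end-of-word suffix arguments were silently dropped.
from tokenizers import CharBPETokenizer

# No vocab/merges given: the else-branch changed by this patch is taken.
tokenizer = CharBPETokenizer(unk_token="<unk>")

# Train a tiny vocabulary; special_tokens ensures "<unk>" gets an id.
tokenizer.train_from_iterator(["hello world"], vocab_size=50, special_tokens=["<unk>"])

# With the patched constructor, the inner BPE model keeps its unk_token,
# so characters outside the trained alphabet should fall back to "<unk>"
# instead of being unrepresentable.
print(tokenizer.encode("héllo").tokens)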