Handle the case where the precompiled charsmap is empty (#1308)

* Handle the case where the precompiled charsmap is empty

* Reformat with Black

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
Kelly Marchisio
2023-07-31 13:35:24 +01:00
committed by GitHub
parent c2664ae13f
commit efea6c7246

View File

@@ -173,12 +173,15 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
if precompiled_charsmap:
tokenizer.normalizer = normalizers.Sequence(
[
normalizers.Precompiled(precompiled_charsmap),
normalizers.Replace(Regex(" {2,}"), " "),
]
)
else:
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)