mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Handle when precompiled charsmap is empty (#1308)
* Handle when precompiled charsmap is empty
* Black

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
@@ -173,12 +173,15 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
||||
|
||||
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
|
||||
|
||||
if precompiled_charsmap:
|
||||
tokenizer.normalizer = normalizers.Sequence(
|
||||
[
|
||||
normalizers.Precompiled(precompiled_charsmap),
|
||||
normalizers.Replace(Regex(" {2,}"), " "),
|
||||
]
|
||||
)
|
||||
else:
|
||||
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||
|
||||
|
Reference in New Issue
Block a user