mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Handle when precompiled charsmap is empty (#1308)
* Handle when precompiled charsmap is empty
* Black

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
@@ -173,12 +173,15 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
|
|||||||
|
|
||||||
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
|
tokenizer = Tokenizer(Unigram(vocab, unk_id, byte_fallback))
|
||||||
|
|
||||||
|
if precompiled_charsmap:
|
||||||
tokenizer.normalizer = normalizers.Sequence(
|
tokenizer.normalizer = normalizers.Sequence(
|
||||||
[
|
[
|
||||||
normalizers.Precompiled(precompiled_charsmap),
|
normalizers.Precompiled(precompiled_charsmap),
|
||||||
normalizers.Replace(Regex(" {2,}"), " "),
|
normalizers.Replace(Regex(" {2,}"), " "),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
else:
|
||||||
|
tokenizer.normalizer = normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
|
||||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||||
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user