spm_parity_check now succeeds because the tokenizer uses the correct pre_tokenizer: a Sequence of WhitespaceSplit followed by Metaspace, instead of Metaspace alone.

This commit is contained in:
Nicolas Patry
2020-08-24 14:05:34 +02:00
parent e974cfb1c9
commit dd91739ba0
2 changed files with 8 additions and 17 deletions

View File

@@ -21,8 +21,13 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
tokenizer = Tokenizer(Unigram())
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
[
pre_tokenizers.WhitespaceSplit(),
pre_tokenizers.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space
),
]
)
tokenizer.decoder = decoders.Metaspace(
replacement=replacement, add_prefix_space=add_prefix_space