Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-08 05:38:23 +00:00)
Fix SentencePiece tokenizers conversion
@@ -1,11 +1,4 @@
-from tokenizers import (
-    Tokenizer,
-    AddedToken,
-    pre_tokenizers,
-    decoders,
-    trainers,
-    normalizers,
-)
+from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, normalizers, Regex
 import os
 from tokenizers.models import Unigram
 import json
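The flattened import also pulls in Regex, which the hunks below use for a whitespace-collapsing normalizer. A minimal sketch of what that enables (assuming a tokenizers release that exposes Regex, normalizers.Replace, and Normalizer.normalize_str):

from tokenizers import Regex, normalizers

# Collapse runs of two or more spaces into a single space before the model
# sees the text -- the same rule the converted tokenizers below install.
collapse_spaces = normalizers.Replace(Regex(" {2,}"), " ")
print(collapse_spaces.normalize_str("Hello    world"))  # -> "Hello world"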
@@ -33,18 +26,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         tokenizer = Tokenizer(Unigram())
 
         tokenizer.normalizer = normalizers.Sequence(
-            [
-                normalizers.Nmt(),
-                normalizers.NFKC(),
-            ]
+            [normalizers.Nmt(), normalizers.NFKC(), normalizers.Replace(Regex(" {2,}"), " ")]
         )
-        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
-            [
-                pre_tokenizers.WhitespaceSplit(),
-                pre_tokenizers.Metaspace(
-                    replacement=replacement, add_prefix_space=add_prefix_space
-                ),
-            ]
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space
         )
         tokenizer.decoder = decoders.Metaspace(
             replacement=replacement, add_prefix_space=add_prefix_space
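The old pipeline hard-split on whitespace before Metaspace ran, which throws away the spacing information SentencePiece keeps; the new pipeline lets Metaspace handle whitespace alone and collapses repeated spaces in the normalizer instead. A rough illustration of the difference (parameter names as they appear in the diff; newer tokenizers releases have renamed add_prefix_space, and the exact pieces returned depend on the installed version):

from tokenizers import pre_tokenizers

old_style = pre_tokenizers.Sequence(
    [
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True),
    ]
)
new_style = pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)

text = "Hello  world"
# WhitespaceSplit consumes the run of spaces before Metaspace ever sees it...
print(old_style.pre_tokenize_str(text))
# ...while Metaspace alone maps each space to "▁", as SentencePiece does.
print(new_style.pre_tokenize_str(text))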
@@ -124,15 +109,15 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
 
         tokenizer = Tokenizer(Unigram(vocab, unk_id))
 
-        tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
-        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+        tokenizer.normalizer = normalizers.Sequence(
             [
-                pre_tokenizers.WhitespaceSplit(),
-                pre_tokenizers.Metaspace(
-                    replacement=replacement, add_prefix_space=add_prefix_space
-                ),
+                normalizers.Precompiled(precompiled_charsmap),
+                normalizers.Replace(Regex(" {2,}"), " "),
             ]
         )
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space
+        )
         tokenizer.decoder = decoders.Metaspace(
             replacement=replacement, add_prefix_space=add_prefix_space
         )
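Since the point of the commit is that a converted tokenizer should reproduce SentencePiece's segmentation, a quick parity check along these lines is the natural way to exercise it. The file names and the sentencepiece dependency are illustrative assumptions, not part of the commit:

import sentencepiece as spm
from tokenizers import Tokenizer

# Hypothetical paths: the original SentencePiece model and its converted output.
sp = spm.SentencePieceProcessor(model_file="spm.model")
tok = Tokenizer.from_file("converted.json")

# Repeated spaces are the kind of input where the old conversion diverged.
text = "Hello    world !"
assert sp.encode(text, out_type=str) == tok.encode(text).tokens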