Improvements on spm parity: (#401)
* Removing all pre_tokenizer logic from the Unigram algorithm.
* Improving the parity check *a lot*:
  - We can now detect many more errors.
  - Special cases have been added temporarily.
* Adding 2 new normalizers that mimic spm's default behavior.
* Adding an `encoding_optimized` version of the `encode` algorithm:
  - Removes the Lattice allocation.
  - Changes the trie's `common_prefix_search` to return an iterator, to avoid allocating the full list of results.
* Trie<char> -> Trie<u8>: another speed improvement.
* [WIP] Attempt to create a Precompiled normalizer from SPM to be 100% compliant with arbitrary models.
* Adding a new `Precompiled` normalizer that replaces `SpmNmtNfkc`:
  - It will be used for direct compatibility with `spm`, replacing all of their custom rules by directly using the normalizer spec embedded within spm files, removing any need for us to maintain such rules.
  - We need the `nom` dependency to parse spm's binary format.
  - We need to add the `sentencepiece_model_pb2.py` file to be able to read the proto file.
  - We reimplemented their `Darts::DoubleArray` compact trie format.
* Fixing a bug with the Precompiled normalizer.
* Fixing some edge cases (now in tests) with this weird precompiled normalizer. It seems even a very carefully hand-crafted trie does not prevent shooting oneself in the foot. Sorry, future reader.
* Keeping the API stable for this PR (API changes should come later, see #409):
  - Removed `sentencepiece_model_pb2` from the binding and added instructions to make `from_spm` work.
* Adding a model check in `from_spm`.
* Addressing @n1t0's comments.
* Adding a check to make sure alignments stay correct. Also added a bit more documentation on how `Precompiled` works.
* Extracting `Precompiled` into its own `spm_precompiled` crate.
* Using ranges in `do_nmt`.
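
As a rough illustration of the Trie<char> -> Trie<u8> and iterator changes described above, here is a minimal Python sketch of the idea only (the real implementation lives in the Rust core and its types and signatures differ): a byte-level trie whose common_prefix_search yields matches lazily instead of allocating the full list of results.

# Python sketch of the idea only -- not the actual Rust code from this PR.
from typing import Dict, Iterator


class ByteTrieNode:
    def __init__(self) -> None:
        self.children: Dict[int, "ByteTrieNode"] = {}
        self.is_leaf = False


class ByteTrie:
    """Trie over raw UTF-8 bytes (the Trie<u8> idea)."""

    def __init__(self) -> None:
        self.root = ByteTrieNode()

    def push(self, piece: str) -> None:
        node = self.root
        for b in piece.encode("utf-8"):
            node = node.children.setdefault(b, ByteTrieNode())
        node.is_leaf = True

    def common_prefix_search(self, text: bytes) -> Iterator[int]:
        # Lazily yield the byte length of every piece that prefixes `text`,
        # so the caller never allocates the full list of matches.
        node = self.root
        for i, b in enumerate(text):
            node = node.children.get(b)
            if node is None:
                return
            if node.is_leaf:
                yield i + 1


trie = ByteTrie()
for piece in ("a", "ab", "abc"):
    trie.push(piece)
print(list(trie.common_prefix_search("abcd".encode("utf-8"))))  # -> [1, 2, 3]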
@@ -1,6 +1,14 @@
-from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
+from tokenizers import (
+    Tokenizer,
+    AddedToken,
+    pre_tokenizers,
+    decoders,
+    trainers,
+    normalizers,
+)
+import os
 from tokenizers.models import Unigram
-from tokenizers.normalizers import NFKC
+import json
 from .base_tokenizer import BaseTokenizer
 
 from typing import Optional, List, Union
@@ -16,11 +24,12 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
     ):
         if vocab is not None:
             # Let Unigram(..) fail if only one of them is None
             tokenizer = Tokenizer(Unigram(vocab))
         else:
             tokenizer = Tokenizer(Unigram())
 
-        tokenizer.normalizer = NFKC()
+        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC(),])
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [
                 pre_tokenizers.WhitespaceSplit(),
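
For context on the hunk above: the default normalizer moves from a bare NFKC to an Nmt + NFKC sequence, matching SentencePiece's default normalization. A minimal usage sketch, assuming a tokenizers build recent enough to expose Normalizer.normalize_str:

from tokenizers import normalizers

# Old default: NFKC alone. New default: Nmt cleanup first, then NFKC,
# mirroring SentencePiece's own default normalization.
spm_like = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC()])
print(spm_like.normalize_str("ＡＢＣ　ｄｅｆ"))  # NFKC folds fullwidth forms: "ABC def"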
@@ -57,3 +66,63 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         if isinstance(files, str):
             files = [files]
         self._tokenizer.train(trainer, files)
+
+    @staticmethod
+    def from_spm(filename: str):
+        try:
+            import sys
+
+            sys.path.append(".")
+
+            import sentencepiece_model_pb2 as model
+        except Exception:
+            raise Exception(
+                "You don't seem to have the required protobuf file, in order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
+            )
+
+        m = model.ModelProto()
+        m.ParseFromString(open(filename, "rb").read())
+
+        precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
+        vocab = [(piece.piece, piece.score) for piece in m.pieces]
+        unk_id = m.trainer_spec.unk_id
+        model_type = m.trainer_spec.model_type
+        if model_type != 1:
+            raise Exception(
+                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
+            )
+
+        data = {"unk_id": unk_id, "vocab": vocab}
+
+        replacement = "▁"
+        add_prefix_space = True
+
+        out_vocab_filename = f"{filename}.json"
+        try:
+            with open(out_vocab_filename, "w") as f:
+                json.dump(data, f, indent=4)
+
+            tokenizer = Tokenizer(Unigram(out_vocab_filename))
+        finally:
+            os.remove(out_vocab_filename)
+
+        tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.WhitespaceSplit(),
+                pre_tokenizers.Metaspace(
+                    replacement=replacement, add_prefix_space=add_prefix_space
+                ),
+            ]
+        )
+        tokenizer.decoder = decoders.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space
+        )
+
+        parameters = {
+            "model": "SentencePieceUnigram",
+        }
+
+        obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
+        BaseTokenizer.__init__(obj, tokenizer, parameters)
+        return obj
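
A hedged usage sketch for the new from_spm helper added above. It assumes the prerequisites spelled out in the error message (pip install protobuf plus downloading sentencepiece_model_pb2.py into the working directory), and "spm.model" is a hypothetical SentencePiece Unigram model file:

from tokenizers.implementations import SentencePieceUnigramTokenizer

# "spm.model" is a placeholder for a real SentencePiece Unigram model file.
tokenizer = SentencePieceUnigramTokenizer.from_spm("spm.model")
print(tokenizer.encode("Hello world").tokens)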
@@ -9,6 +9,8 @@ NFKC = normalizers.NFKC
 Sequence = normalizers.Sequence
 Lowercase = normalizers.Lowercase
 Strip = normalizers.Strip
+Nmt = normalizers.Nmt
+Precompiled = normalizers.Precompiled
 
 
 NORMALIZERS = {"nfc": NFC, "nfd": NFD, "nfkc": NFKC, "nfkd": NFKD}
@@ -99,6 +99,18 @@ class Strip(Normalizer):
     def __init__(self, left: bool = True, right: bool = True) -> Normalizer:
         pass
 
+class Nmt(Normalizer):
+    """ Nmt normalizer """
+
+    def __init__(self) -> Normalizer:
+        pass
+
+class Precompiled(Normalizer):
+    """ SpmNmtNfkc normalizer """
+
+    def __init__(self, precompiled_charsmap: bytes) -> Normalizer:
+        pass
+
 def unicode_normalizer_from_str(normalizer: str) -> Normalizer:
     """
         Instanciate unicode normalizer from the normalizer name
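
To complement the Nmt and Precompiled stubs above, a hedged sketch of building Precompiled by hand from the charsmap embedded in an spm model (same prerequisites as from_spm; "spm.model" is again a hypothetical file name, and normalize_str is assumed available):

import sys

sys.path.append(".")
import sentencepiece_model_pb2 as model  # the file downloaded per the instructions above

from tokenizers import normalizers

m = model.ModelProto()
with open("spm.model", "rb") as f:  # hypothetical model file
    m.ParseFromString(f.read())

# The bytes blob expected by Precompiled(...) is the charsmap that spm ships
# inside its own model file.
precompiled = normalizers.Precompiled(m.normalizer_spec.precompiled_charsmap)
print(precompiled.normalize_str("ＡＢＣ"))  # applies exactly the rules baked into the spm model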