Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
* Removing all pre_tokenizer logic from the Unigram algorithm.
* Improving the parity check *a lot*:
  - We can now detect many more errors.
  - Special cases have been added temporarily.
* Adding 2 new normalizers that mimic spm's default behavior.
* Adding an `encoding_optimized` version of the `encode` algorithm:
  - Removes the Lattice allocation.
  - Changes the trie's `common_prefix_search` to return an iterator, avoiding allocation of the full results.
* Trie<char> -> Trie<u8>: another improvement in speed.
* [WIP] Attempt to create a Precompiled Normalizer from SPM to be 100% compliant with arbitrary models.
* Adding a new `Precompiled` Normalizer that replaces `SpmNmtNfkc` (see the short sketch right after this list):
  - It will be used for direct compatibility with `Spm`, replacing all of their custom rules by directly using the normalizer spec embedded within spm files, which removes any need for such rules on our side.
  - We need the `nom` dependency to parse the binary format of `spm`.
  - We need to add the `sentencepiece_model_pb2.py` file to be able to read the proto file.
  - We reimplemented their `Darts::DoubleArray` compact trie format.
* Fixing a bug with the Precompiled normalizer.
* Fixing some edge cases (now in tests) with this weird precompiled normalizer. It seems even a very carefully hand-crafted trie does not prevent one from shooting oneself in the foot. Sorry, future reader.
* Keeping the API stable for this PR (changes to the API should come later, #409):
  - Removed sentencepiece_model_pb2 from the binding and added instructions to make `from_spm` work.
* Adding a model check in `from_spm`.
* Addressing @n1t0's comments.
* Adding a check to make sure alignments stay correct. Also added a bit more documentation on how Precompiled works.
* Extracting `Precompiled` into its own `spm_precompiled` crate.
* Using ranges in `do_nmt`.
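A minimal sketch of what the embedded-charsmap approach looks like from the Python side, assuming a local Unigram-trained `spm.model` and the `sentencepiece_model_pb2.py` module described in `from_spm` below; the file path and sample string are hypothetical, and the `normalize_str` call assumes the Python binding exposes it:

    import sentencepiece_model_pb2 as model
    from tokenizers import normalizers

    m = model.ModelProto()
    with open("spm.model", "rb") as f:  # hypothetical path to a SentencePiece model
        m.ParseFromString(f.read())

    # Apply the normalization data shipped inside the spm file itself.
    norm = normalizers.Precompiled(m.normalizer_spec.precompiled_charsmap)
    print(norm.normalize_str("ｔｅｓｔ　ｃａｓｅ"))  # sample fullwidth input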
129 lines
4.2 KiB
Python
import json
import os
from typing import Optional, List, Union

from tokenizers import (
    Tokenizer,
    AddedToken,
    pre_tokenizers,
    decoders,
    trainers,
    normalizers,
)
from tokenizers.models import Unigram

from .base_tokenizer import BaseTokenizer


class SentencePieceUnigramTokenizer(BaseTokenizer):
    """SentencePiece Unigram Tokenizer

    Represents the Unigram algorithm, with the pre-tokenization used by SentencePiece.
    """

    def __init__(
        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
    ):
        if vocab is not None:
            # Let Unigram(..) fail if only one of them is None
            tokenizer = Tokenizer(Unigram(vocab))
        else:
            tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(
                    replacement=replacement, add_prefix_space=add_prefix_space
                ),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space
        )

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
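
    # A hypothetical usage sketch (not part of the original file): load an existing
    # Unigram vocab file produced by this library and encode a sentence. The vocab
    # path and input string below are made up for illustration.
    #
    #     tokenizer = SentencePieceUnigramTokenizer(vocab="unigram.json")
    #     encoding = tokenizer.encode("Hello, world!")
    #     print(encoding.tokens)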

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 8000,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
    ):
        """Train the model using the given files"""

        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
        )

        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(trainer, files)
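
    # A hypothetical training sketch (not part of the original file): train a fresh
    # Unigram model from plain-text files. File names and special tokens are made up
    # for illustration.
    #
    #     tokenizer = SentencePieceUnigramTokenizer()
    #     tokenizer.train(files=["corpus.txt"], vocab_size=8000, special_tokens=["<unk>"])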

    @staticmethod
    def from_spm(filename: str):
        """Build a SentencePieceUnigramTokenizer from an existing SentencePiece Unigram model file."""
        try:
            import sys

            sys.path.append(".")

            import sentencepiece_model_pb2 as model
        except Exception:
            raise Exception(
                "You don't seem to have the required protobuf file. In order to use this function you need to run "
                "`pip install protobuf` and "
                "`wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` "
                "so that we can read the internals of your spm file. `pip install sentencepiece` is not required."
            )

        m = model.ModelProto()
        with open(filename, "rb") as f:
            m.ParseFromString(f.read())

        precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
        vocab = [(piece.piece, piece.score) for piece in m.pieces]
        unk_id = m.trainer_spec.unk_id
        model_type = m.trainer_spec.model_type
        if model_type != 1:
            # model_type == 1 corresponds to SentencePiece's Unigram algorithm.
            raise Exception(
                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
            )

        data = {"unk_id": unk_id, "vocab": vocab}

        replacement = "▁"
        add_prefix_space = True

        # The Unigram model is loaded from a JSON vocab file, so dump the extracted
        # vocab to a temporary file next to the spm model and remove it afterwards.
        out_vocab_filename = f"{filename}.json"
        try:
            with open(out_vocab_filename, "w") as f:
                json.dump(data, f, indent=4)

            tokenizer = Tokenizer(Unigram(out_vocab_filename))
        finally:
            os.remove(out_vocab_filename)

        # Reuse the normalizer spec embedded in the spm file instead of re-implementing
        # SentencePiece's normalization rules.
        tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(
                    replacement=replacement, add_prefix_space=add_prefix_space
                ),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space
        )

        parameters = {
            "model": "SentencePieceUnigram",
        }

        # Bypass SentencePieceUnigramTokenizer.__init__ (which would rebuild the default
        # Nmt + NFKC pipeline) and initialize BaseTokenizer directly with the spm-backed tokenizer.
        obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
        BaseTokenizer.__init__(obj, tokenizer, parameters)
        return obj
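
# A hypothetical end-to-end sketch (not part of the original file): convert an
# existing SentencePiece Unigram model and encode with it. The model path is made
# up; it must point to a Unigram-trained spm file, and the protobuf module
# mentioned in the error message above must be importable.
#
#     tokenizer = SentencePieceUnigramTokenizer.from_spm("path/to/spm.model")
#     encoding = tokenizer.encode("This is a test")
#     print(encoding.tokens)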