tokenizers/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
Nicolas Patry 330876ae02 Improvements on spm parity: (#401)
* Removing all pre_tokenizer logic from the Unigram algorithm.

* Improving the parity check *a lot*.

- We can now detect many more errors.
- Special cases have been added temporarily.

* Adding 2 new normalizers that mimic spm's default behavior.

* Adding an `encoding_optimized` version of the `encode` algorithm.

- Removes Lattice allocation.
- Changes trie `common_prefix_search` to return an iterator to avoid
  allocation of the full results.
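
As an illustration of the idea only (the real change is in the Rust core; the
trie layout and names below are hypothetical, not the actual `tokenizers`
internals), a common-prefix search can yield matches lazily instead of
building the full result list:

    def common_prefix_search(trie: dict, text: str):
        # Toy dict-based trie: children are keyed by character, and the
        # "_end" key marks that a token ends at this node.
        node = trie
        prefix = []
        for ch in text:
            if ch not in node:
                return
            node = node[ch]
            prefix.append(ch)
            if node.get("_end"):
                # Yield lazily: the caller can stop early, and the full
                # list of matches is never allocated.
                yield "".join(prefix)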

* Trie<char> -> Trie<u8>: another improvement on speed.

* [WIP] Attempt to create a Precompiled Normalizer from SPM to be 100%
compliant with arbitrary models.

* Adding a new `Precompiled` Normalizer that replaces `SpmNmtNfkc`.

- It will be used for direct compatibility with `Spm`, replacing all
their custom rules by using the normalizer spec embedded directly
within spm files, removing any need for custom rules on our side.
- We need the `nom` dependency to parse the binary format of `spm`.
- We need to add the `sentencepiece_model_pb2.py` file to be able to read
  the proto file.
- We reimplemented their `Darts::DoubleArray` compact trie format.
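
For reference, reading that embedded spec from Python looks roughly like the
following sketch (it mirrors what `from_spm` in the file below does; "spm.model"
is a placeholder filename, and `sentencepiece_model_pb2.py` has to be fetched as
described in `from_spm`'s error message):

    import sentencepiece_model_pb2 as model

    m = model.ModelProto()
    with open("spm.model", "rb") as f:
        m.ParseFromString(f.read())

    # Normalizer spec embedded in the spm file, consumed by the `Precompiled` normalizer
    precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
    # (piece, score) pairs for the Unigram vocab
    vocab = [(piece.piece, piece.score) for piece in m.pieces]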

* Fixing a bug with Precompiled normalizer.

* Fixing some edge cases (now in tests) with this weird precompiled
normalizer.

It seems even a very handily crafted trie does not prevent one from shooting
oneself in the foot. Sorry, future reader.

* Keep the API stable for this PR (API changes should come later, see #409).

- Removed sentencepiece_model_pb2 from the bindings and added instructions to
make `from_spm` work.

* Adding model check in `from_spm`.

* Addressing @n1t0's comments.

* Adding a check to make sure alignments stay correct.

Also added a bit more documentation on how Precompiled works.

* Extracting `Precompiled` into its own `spm_precompiled` crate.

* Using ranges in `do_nmt`.
2020-09-15 22:21:02 +02:00

129 lines · 4.2 KiB · Python

from tokenizers import (
    Tokenizer,
    AddedToken,
    pre_tokenizers,
    decoders,
    trainers,
    normalizers,
)
import os
from tokenizers.models import Unigram
import json

from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union


class SentencePieceUnigramTokenizer(BaseTokenizer):
    """SentencePiece Unigram Tokenizer

    Represents the Unigram algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self, vocab: Optional[str] = None, replacement: str = "▁", add_prefix_space: bool = True,
    ):
        if vocab is not None:
            # Let Unigram(..) fail if only one of them is None
            tokenizer = Tokenizer(Unigram(vocab))
        else:
            tokenizer = Tokenizer(Unigram())

        # Mimic spm's default normalization: Nmt rules followed by NFKC
        tokenizer.normalizer = normalizers.Sequence([normalizers.Nmt(), normalizers.NFKC()])
        # Split on whitespace, then replace spaces with the `replacement` character (Metaspace)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(
                    replacement=replacement, add_prefix_space=add_prefix_space
                ),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space
        )

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 8000,
        show_progress: bool = True,
        special_tokens: List[Union[str, AddedToken]] = [],
    ):
        """ Train the model using the given files """

        trainer = trainers.UnigramTrainer(
            vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress,
        )

        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(trainer, files)

    @staticmethod
    def from_spm(filename: str):
        try:
            import sys

            sys.path.append(".")

            import sentencepiece_model_pb2 as model
        except Exception:
            raise Exception(
                "You don't seem to have the required protobuf file. In order to use this function you need to run `pip install protobuf` and `wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_model_pb2.py` for us to be able to read the intrinsics of your spm_file. `pip install sentencepiece` is not required."
            )

        m = model.ModelProto()
        m.ParseFromString(open(filename, "rb").read())

        precompiled_charsmap = m.normalizer_spec.precompiled_charsmap
        vocab = [(piece.piece, piece.score) for piece in m.pieces]
        unk_id = m.trainer_spec.unk_id
        model_type = m.trainer_spec.model_type
        if model_type != 1:
            raise Exception(
                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
            )

        data = {"unk_id": unk_id, "vocab": vocab}

        replacement = "▁"
        add_prefix_space = True

        # The Unigram model is built from a temporary JSON vocab file, removed afterwards
        out_vocab_filename = f"{filename}.json"
        try:
            with open(out_vocab_filename, "w") as f:
                json.dump(data, f, indent=4)

            tokenizer = Tokenizer(Unigram(out_vocab_filename))
        finally:
            os.remove(out_vocab_filename)

        # Use the normalizer spec embedded in the spm file instead of custom rules
        tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.WhitespaceSplit(),
                pre_tokenizers.Metaspace(
                    replacement=replacement, add_prefix_space=add_prefix_space
                ),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space
        )

        parameters = {
            "model": "SentencePieceUnigram",
        }

        obj = BaseTokenizer.__new__(SentencePieceUnigramTokenizer, tokenizer, parameters)
        BaseTokenizer.__init__(obj, tokenizer, parameters)
        return obj
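
A quick usage sketch (the file names here are placeholders, not files shipped
with the library):

    from tokenizers.implementations import SentencePieceUnigramTokenizer

    # Train a Unigram model from plain-text files...
    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train(files=["corpus.txt"], vocab_size=8000)

    # ...or convert an existing spm model, with sentencepiece_model_pb2.py on the
    # path as described in the error message of `from_spm` above:
    # tokenizer = SentencePieceUnigramTokenizer.from_spm("spm.model")

    encoding = tokenizer.encode("Hello, world!")
    print(encoding.tokens)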