Failing test for compatibility of SentencePieceUnigramTokenizer.

- We are failing on ambiguous tokenizations (AAA -> A + AA vs AA + A,
  see the sketch after this list). This is likely linked to float precision
  and may be hard or impossible to fix (it should not hinder model performance).

- We now fuse unk tokens by default, as spm_train does.

- We are still failing on space deduplication, at least. This should
  probably be handled by a pre-tokenizer.
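
For the first point, here is a minimal sketch (plain Python with made-up piece scores, not the library's Viterbi code) of why both segmentations of AAA are equally valid:

# Toy unigram model: the piece scores below are hypothetical.
import math

piece_log_probs = {"A": math.log(0.2), "AA": math.log(0.05)}

def score(segmentation):
    # Total log-probability of a segmentation under a unigram model.
    return sum(piece_log_probs[piece] for piece in segmentation)

s1 = score(["A", "AA"])  # log P(A) + log P(AA)
s2 = score(["AA", "A"])  # same terms in the other order
print(s1, s2)  # identical scores, so tie-breaking (and, in larger
               # lattices, float rounding) decides which split wins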
Nicolas Patry
2020-08-21 14:16:50 +02:00
parent c7a84c7cc6
commit 439305eea0
11 changed files with 314 additions and 8 deletions


@@ -8,7 +8,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -25,5 +26,6 @@ from .implementations import (
     ByteLevelBPETokenizer,
     CharBPETokenizer,
     SentencePieceBPETokenizer,
+    SentencePieceUnigramTokenizer,
     BertWordPieceTokenizer,
 )


@@ -2,4 +2,5 @@ from .base_tokenizer import BaseTokenizer
 from .byte_level_bpe import ByteLevelBPETokenizer
 from .char_level_bpe import CharBPETokenizer
 from .sentencepiece_bpe import SentencePieceBPETokenizer
+from .sentencepiece_unigram import SentencePieceUnigramTokenizer
 from .bert_wordpiece import BertWordPieceTokenizer


@@ -0,0 +1,40 @@
from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
from tokenizers.models import Unigram
from tokenizers.normalizers import NFKC

from .base_tokenizer import BaseTokenizer

from typing import Optional, List, Union


class SentencePieceUnigramTokenizer(BaseTokenizer):
    """SentencePiece Unigram Tokenizer

    Represents the Unigram algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(
        self,
        vocab: Optional[str] = None,
        replacement: str = "▁",
        add_prefix_space: bool = True,
    ):
        if vocab is not None:
            tokenizer = Tokenizer(Unigram(vocab))
        else:
            tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = NFKC()
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space
        )
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space
        )

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
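
As a quick sanity check, the new wrapper can be exercised roughly like this (a sketch; "unigram.json" is a hypothetical vocabulary file, not something shipped in this commit):

from tokenizers import SentencePieceUnigramTokenizer

# "unigram.json" is a placeholder for a trained Unigram vocabulary.
tokenizer = SentencePieceUnigramTokenizer(vocab="unigram.json")
encoding = tokenizer.encode("Hello world")
print(encoding.tokens)  # pieces, prefixed with the Metaspace replacement character
print(encoding.ids)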


@@ -6,3 +6,4 @@ Model = models.Model
 BPE = models.BPE
 WordPiece = models.WordPiece
 WordLevel = models.WordLevel
+Unigram = models.Unigram


@@ -100,3 +100,25 @@ class WordLevel(Model):
     def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
         pass
+
+class Unigram(Model):
+    """UnigramEncoding model class
+
+    Instantiate a Unigram Model from the given model file.
+
+    Args:
+        vocab: (`optional`) string:
+            Path to a vocabulary JSON file.
+
+        is_spm_file: (`optional`) bool:
+            If the file came out of sentencepiece, we need to load it differently.
+    """
+
+    def __init__(
+        self,
+        vocab: Optional[str],
+        is_spm_file: Optional[bool],
+    ):
+        pass
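
Based on the stub's signature, loading a model would look roughly like this (a sketch; both file names are hypothetical placeholders):

from tokenizers import Tokenizer
from tokenizers.models import Unigram

# Vocabulary serialized by this library (see the `vocab` arg above).
tokenizer = Tokenizer(Unigram("unigram.json", False))

# Model file produced by sentencepiece's spm_train, loaded differently.
spm_tokenizer = Tokenizer(Unigram("spm.model", True))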