Failing test for compatibility for SentencePieceUnigramTokenizer.
- We are failing on ambiguous tokenizations (AAA -> A + AA vs AA + A). This could be linked to float precision and may be hard or impossible to fix (it should not hinder model performance).
- We are now fusing unknown tokens (fuse_unk) by default, as is the case with spm_train.
- We are still failing on at least space deduplication. This should probably be handled by a pre-tokenizer.
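For context, here is a rough sketch of the kind of compatibility check involved, assuming a model trained with spm_train and its vocabulary exported to the JSON format the new Unigram model loads; "spm.model" and "unigram.json" are placeholder paths, and the comparison is only an illustration, not the test added by this commit:

    import sentencepiece as spm
    from tokenizers.implementations import SentencePieceUnigramTokenizer

    # Reference segmentation from the original SentencePiece implementation.
    sp = spm.SentencePieceProcessor()
    sp.Load("spm.model")  # placeholder path
    expected = sp.EncodeAsPieces("AAA")

    # Segmentation from the new Rust-backed Unigram tokenizer.
    tok = SentencePieceUnigramTokenizer(vocab="unigram.json")  # placeholder path
    got = tok.encode("AAA").tokens

    # When two segmentations (A + AA vs AA + A) have the same total score,
    # the two sides can legitimately disagree because of float precision.
    print(expected, got, expected == got)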
@@ -8,7 +8,8 @@ TextInputSequence = str
 PreTokenizedInputSequence = Union[List[str], Tuple[str]]
 TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
 PreTokenizedEncodeInput = Union[
-    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]
+    PreTokenizedInputSequence,
+    Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
 ]
 
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
@@ -25,5 +26,6 @@ from .implementations import (
     ByteLevelBPETokenizer,
     CharBPETokenizer,
     SentencePieceBPETokenizer,
+    SentencePieceUnigramTokenizer,
     BertWordPieceTokenizer,
 )
@@ -2,4 +2,5 @@ from .base_tokenizer import BaseTokenizer
 from .byte_level_bpe import ByteLevelBPETokenizer
 from .char_level_bpe import CharBPETokenizer
 from .sentencepiece_bpe import SentencePieceBPETokenizer
+from .sentencepiece_unigram import SentencePieceUnigramTokenizer
 from .bert_wordpiece import BertWordPieceTokenizer
@@ -0,0 +1,40 @@
+from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
+from tokenizers.models import Unigram
+from tokenizers.normalizers import NFKC
+from .base_tokenizer import BaseTokenizer
+
+from typing import Optional, List, Union
+
+
+class SentencePieceUnigramTokenizer(BaseTokenizer):
+    """SentencePiece Unigram Tokenizer
+
+    Represents the Unigram algorithm, with the pretokenization used by SentencePiece
+    """
+
+    def __init__(
+        self,
+        vocab: Optional[str] = None,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
+    ):
+        if vocab is not None:
+            tokenizer = Tokenizer(Unigram(vocab))
+        else:
+            tokenizer = Tokenizer(Unigram())
+
+        tokenizer.normalizer = NFKC()
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space
+        )
+        tokenizer.decoder = decoders.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space
+        )
+
+        parameters = {
+            "model": "SentencePieceUnigram",
+            "replacement": replacement,
+            "add_prefix_space": add_prefix_space,
+        }
+
+        super().__init__(tokenizer, parameters)
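A minimal usage sketch of the wrapper added above (not part of the commit; "unigram.json" is a placeholder for a vocabulary file in the format the Unigram model expects):

    from tokenizers.implementations import SentencePieceUnigramTokenizer

    tok = SentencePieceUnigramTokenizer(
        vocab="unigram.json",
        replacement="▁",
        add_prefix_space=True,
    )

    enc = tok.encode("Hello  world")  # double space on purpose
    print(enc.tokens)
    print(tok.decode(enc.ids))

    # SentencePiece collapses repeated whitespace during normalization, while
    # the Metaspace pre-tokenizer used here does not, which is the remaining
    # space-deduplication mismatch mentioned in the commit message.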
@@ -6,3 +6,4 @@ Model = models.Model
 BPE = models.BPE
 WordPiece = models.WordPiece
 WordLevel = models.WordLevel
+Unigram = models.Unigram
@@ -100,3 +100,25 @@ class WordLevel(Model):
 
     def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
         pass
+
+class Unigram(Model):
+    """UnigramEncoding model class
+
+    Instantiate a Unigram Model from the given model file.
+
+    Args:
+        vocab: ('`optional`) string:
+            Path to a vocabulary JSON file.
+
+        is_spm_file: ('`optional`) bool:
+            If the file came out of sentencepiece, we need to load it differently
+
+    """
+
+    @staticmethod
+    def __init__(
+        self,
+        vocab: Optional[str],
+        is_spm_file: Optional[bool],
+    ):
+        pass
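Based only on the stub signature above, the model can presumably also be constructed directly and wired into a Tokenizer by hand, mirroring what the SentencePieceUnigramTokenizer wrapper does; the file paths are placeholders:

    from tokenizers import Tokenizer, pre_tokenizers, decoders
    from tokenizers.models import Unigram

    # Load a vocabulary already converted to JSON...
    model = Unigram("unigram.json", False)
    # ...or, per the is_spm_file flag, a file produced by sentencepiece itself:
    # model = Unigram("spm.model", True)

    tokenizer = Tokenizer(model)
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)
    tokenizer.decoder = decoders.Metaspace(replacement="▁", add_prefix_space=True)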