Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-08 13:48:19 +00:00)
Temp work to make the APIs uniform (build from memory by default).
@@ -5,7 +5,7 @@ from tokenizers.pre_tokenizers import BertPreTokenizer
 from tokenizers.processors import BertProcessing
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Dict
 
 
 class BertWordPieceTokenizer(BaseTokenizer):
@@ -13,7 +13,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
 
     def __init__(
         self,
-        vocab_file: Optional[str] = None,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
         unk_token: Union[str, AddedToken] = "[UNK]",
         sep_token: Union[str, AddedToken] = "[SEP]",
         cls_token: Union[str, AddedToken] = "[CLS]",
@@ -26,8 +26,8 @@ class BertWordPieceTokenizer(BaseTokenizer):
         wordpieces_prefix: str = "##",
     ):
 
-        if vocab_file is not None:
-            tokenizer = Tokenizer(WordPiece(vocab_file, unk_token=str(unk_token)))
+        if vocab is not None:
+            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
         else:
             tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))
 
@@ -51,7 +51,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
             )
         tokenizer.pre_tokenizer = BertPreTokenizer()
 
-        if vocab_file is not None:
+        if vocab is not None:
             sep_token_id = tokenizer.token_to_id(str(sep_token))
             if sep_token_id is None:
                 raise TypeError("sep_token not found in the vocabulary")
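With this change, BertWordPieceTokenizer builds its WordPiece model from an in-memory `vocab` (path or dict) instead of a `vocab_file` path only. A minimal usage sketch of the new path, with a toy vocabulary that is purely illustrative and not part of this commit:

    from tokenizers import BertWordPieceTokenizer

    # Toy vocab for illustration only; a real BERT vocab has ~30k entries.
    # [SEP]/[CLS] must be present, otherwise the constructor raises TypeError (see above).
    vocab = {"[UNK]": 0, "[SEP]": 1, "[CLS]": 2, "[PAD]": 3, "[MASK]": 4, "my": 5, "##name": 6}

    tokenizer = BertWordPieceTokenizer(vocab=vocab)   # previously: vocab_file="vocab.txt"
    print(tokenizer.encode("my").tokens)              # expected along the lines of ['[CLS]', 'my', '[SEP]']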
@@ -1,21 +1,28 @@
-from tokenizers import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers, processors
+from tokenizers import (
+    Tokenizer,
+    AddedToken,
+    pre_tokenizers,
+    decoders,
+    trainers,
+    processors,
+)
 from tokenizers.models import BPE
 from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Dict, Tuple
 
 
 class ByteLevelBPETokenizer(BaseTokenizer):
-    """ ByteLevelBPETokenizer
+    """ByteLevelBPETokenizer
 
     Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
     """
 
     def __init__(
         self,
-        vocab_file: Optional[str] = None,
-        merges_file: Optional[str] = None,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
+        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
         add_prefix_space: bool = False,
         lowercase: bool = False,
         dropout: Optional[float] = None,
@@ -24,11 +31,11 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         end_of_word_suffix: Optional[str] = None,
         trim_offsets: bool = False,
     ):
-        if vocab_file is not None and merges_file is not None:
+        if vocab is not None and merges is not None:
             tokenizer = Tokenizer(
                 BPE(
-                    vocab_file,
-                    merges_file,
+                    vocab,
+                    merges,
                     dropout=dropout,
                     continuing_subword_prefix=continuing_subword_prefix or "",
                     end_of_word_suffix=end_of_word_suffix or "",
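ByteLevelBPETokenizer follows the same pattern: `vocab` and `merges` may now be in-memory structures, with merges given as a dict mapping a pair of token ids to (rank, merged id) at this point in the branch. A hedged sketch with toy data (a real setup would use a full GPT-2 style vocab and merge table, and the final API may still change):

    from tokenizers import ByteLevelBPETokenizer

    # Illustrative toy data only, not from this commit.
    vocab = {"a": 0, "b": 1, "ab": 2}
    merges = {(0, 1): (0, 2)}  # (left_id, right_id) -> (rank, merged_id)

    tokenizer = ByteLevelBPETokenizer(vocab=vocab, merges=merges)
    # previously: ByteLevelBPETokenizer(vocab_file="vocab.json", merges_file="merges.txt")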
@@ -1,31 +1,36 @@
 from .. import Tokenizer, AddedToken, pre_tokenizers, decoders, trainers
 from ..models import BPE
-from ..normalizers import Sequence, Lowercase, unicode_normalizer_from_str, BertNormalizer
+from ..normalizers import (
+    Sequence,
+    Lowercase,
+    unicode_normalizer_from_str,
+    BertNormalizer,
+)
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Dict, Tuple
 
 
 class CharBPETokenizer(BaseTokenizer):
-    """ Original BPE Tokenizer
+    """Original BPE Tokenizer
 
     Represents the BPE algorithm, as introduced by Rico Sennrich
     (https://arxiv.org/abs/1508.07909)
 
     The defaults settings corresponds to OpenAI GPT BPE tokenizers and differs from the original
     Sennrich subword-nmt implementation by the following options that you can deactivate:
         - adding a normalizer to clean up the text (deactivate with `bert_normalizer=False`) by:
           * removing any control characters and replacing all whitespaces by the classic one.
           * handle chinese chars by putting spaces around them.
           * strip all accents.
         - spitting on punctuation in addition to whitespaces (deactivate it with
           `split_on_whitespace_only=True`)
     """
 
     def __init__(
         self,
-        vocab_file: Optional[str] = None,
-        merges_file: Optional[str] = None,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
+        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
         unk_token: Union[str, AddedToken] = "<unk>",
         suffix: str = "</w>",
         dropout: Optional[float] = None,
@@ -34,11 +39,11 @@ class CharBPETokenizer(BaseTokenizer):
         bert_normalizer: bool = True,
         split_on_whitespace_only: bool = False,
     ):
-        if vocab_file is not None and merges_file is not None:
+        if vocab is not None and merges is not None:
             tokenizer = Tokenizer(
                 BPE(
-                    vocab_file,
-                    merges_file,
+                    vocab,
+                    merges,
                     dropout=dropout,
                     unk_token=str(unk_token),
                     end_of_word_suffix=suffix,
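CharBPETokenizer gets the same treatment; here the end-of-word suffix "</w>" shows up inside the vocabulary entries. A sketch under the same assumptions (toy values, work-in-progress merge format):

    from tokenizers import CharBPETokenizer

    # Toy character-level BPE data, illustrative only.
    vocab = {"<unk>": 0, "h": 1, "i</w>": 2, "hi</w>": 3}
    merges = {(1, 2): (0, 3)}  # "h" + "i</w>" -> "hi</w>", rank 0

    tokenizer = CharBPETokenizer(vocab=vocab, merges=merges)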
@@ -3,28 +3,26 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List, Union
+from typing import Optional, List, Union, Dict, Tuple
 
 
 class SentencePieceBPETokenizer(BaseTokenizer):
-    """ SentencePiece BPE Tokenizer
+    """SentencePiece BPE Tokenizer
 
     Represents the BPE algorithm, with the pretokenization used by SentencePiece
     """
 
     def __init__(
         self,
-        vocab_file: Optional[str] = None,
-        merges_file: Optional[str] = None,
+        vocab: Optional[Union[str, Dict[str, int]]] = None,
+        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
         unk_token: Union[str, AddedToken] = "<unk>",
         replacement: str = "▁",
         add_prefix_space: bool = True,
         dropout: Optional[float] = None,
     ):
-        if vocab_file is not None and merges_file is not None:
-            tokenizer = Tokenizer(
-                BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token)
-            )
+        if vocab is not None and merges is not None:
+            tokenizer = Tokenizer(BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
         else:
             tokenizer = Tokenizer(BPE())
 
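And likewise for SentencePieceBPETokenizer, which now builds the BPE model in a single expression when `vocab` and `merges` are given. A toy sketch (values are illustrative; "▁" is the default replacement character):

    from tokenizers import SentencePieceBPETokenizer

    vocab = {"<unk>": 0, "▁": 1, "h": 2, "i": 3, "hi": 4, "▁hi": 5}   # toy data
    merges = {(2, 3): (0, 4), (1, 4): (1, 5)}                          # (pair of ids) -> (rank, merged id)

    tokenizer = SentencePieceBPETokenizer(vocab=vocab, merges=merges)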
@@ -92,19 +92,10 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
                 "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
             )
 
-        data = {"unk_id": unk_id, "vocab": vocab}
-
         replacement = "▁"
         add_prefix_space = True
 
-        out_vocab_filename = f"{filename}.json"
-        try:
-            with open(out_vocab_filename, "w") as f:
-                json.dump(data, f, indent=4)
-
-            tokenizer = Tokenizer(Unigram(out_vocab_filename))
-        finally:
-            os.remove(out_vocab_filename)
+        tokenizer = Tokenizer(Unigram(vocab, unk_id))
 
         tokenizer.normalizer = normalizers.Precompiled(precompiled_charsmap)
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
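The SentencePiece conversion path no longer round-trips through a temporary JSON file; the Unigram model is built directly from the extracted vocab and unk_id. A sketch of that direct construction with made-up values (in the real code, vocab, unk_id and precompiled_charsmap come from the parsed SentencePiece model):

    from tokenizers import Tokenizer
    from tokenizers.models import Unigram

    # Illustrative unigram vocab: (piece, score) pairs; unk_id indexes the unknown piece.
    vocab = [("<unk>", 0.0), ("▁", -1.5), ("▁hello", -2.3), ("world", -2.7)]
    unk_id = 0

    tokenizer = Tokenizer(Unigram(vocab, unk_id))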
@@ -1,5 +1,5 @@
 from .. import Encoding, Offsets, Token
-from typing import List, Optional, Union, Tuple
+from typing import List, Optional, Union, Tuple, Dict
 
 class Model:
     """ Base class for all models
@@ -32,11 +32,15 @@ class BPE(Model):
     Instantiate a BPE Model from the given vocab and merges files.
 
     Args:
-        vocab: ('`optional`) string:
-            Path to a vocabulary JSON file.
+        vocab: ('`optional`) Dict[str, int]:
+            A dictionnary of string keys and their ids {"am": 0,...}
 
         merges: (`optional`) string:
-            Path to a merge file.
+            A dictionnary of pairs of ids as keys and their merge correspondace:
+            {(id_left, id_right): (importance, id_merged), .... }
+            with vocab : {"a": 0, "b": 1", ... "ab": 4} the merge
+            {(0, 1): (0, 4) ,...}
+            corresponds to the "ab" merge, that is the most likely merge (0)
 
         cache_capacity: (`optional`) int:
             The number of words that the BPE cache can contain. The cache allows
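The docstring's worked example, written out as code (toy ids): with this vocab, the single merge entry says that ids 0 and 1 ("a" and "b") merge into id 4 ("ab"), and its rank 0 marks it as the highest-priority merge:

    vocab = {"a": 0, "b": 1, "c": 2, "d": 3, "ab": 4}
    merges = {
        (0, 1): (0, 4),  # "a" + "b" -> "ab", applied with rank 0
    }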
@@ -62,8 +66,8 @@ class BPE(Model):
     @staticmethod
     def __init__(
         self,
-        vocab: Optional[str],
-        merges: Optional[str],
+        vocab: Optional[Union[str, Dict[str, int]]],
+        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]],
         cache_capacity: Optional[int],
         dropout: Optional[float],
         unk_token: Optional[str],
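Given this signature, the BPE model can be instantiated straight from those in-memory structures and wrapped in a Tokenizer, mirroring how the implementations above call it. A hedged sketch with toy data:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE

    vocab = {"<unk>": 0, "a": 1, "b": 2, "ab": 3}   # toy data, illustrative only
    merges = {(1, 2): (0, 3)}                       # (left_id, right_id) -> (rank, merged_id)

    tokenizer = Tokenizer(BPE(vocab, merges, unk_token="<unk>"))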
@@ -80,7 +84,7 @@ class WordPiece(Model):
 
     Args:
         vocab: (`optional`) string:
-            Path to a vocabulary file.
+            A dictionnary of string keys and their ids {"am": 0,...}
 
         unk_token: (`optional`) str:
             The unknown token to be used by the model.
@@ -91,7 +95,7 @@ class WordPiece(Model):
 
     def __init__(
         self,
-        vocab: Optional[str],
+        vocab: Optional[Union[str, Dict[str, int]]],
         unk_token: Optional[str],
         max_input_chars_per_word: Optional[int],
     ):
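For WordPiece the same dict-or-path `vocab` applies; this is the call the BertWordPieceTokenizer implementation above makes. A toy sketch:

    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    vocab = {"[UNK]": 0, "my": 1, "##name": 2}      # illustrative vocab
    tokenizer = Tokenizer(WordPiece(vocab, unk_token="[UNK]"))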
@@ -105,13 +109,13 @@ class WordLevel(Model):
 
     Args:
         vocab: (`optional`) string:
-            Path to a vocabulary file.
+            A dictionnary of string keys and their ids {"am": 0,...}
 
         unk_token: str:
             The unknown token to be used by the model.
     """
 
-    def __init__(self, vocab: Optional[str], unk_token: Optional[str]):
+    def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
         pass
 
 class Unigram(Model):
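WordLevel follows suit: the whole-word vocabulary can now be passed as a dict. A toy sketch (the unk_token name is chosen for illustration):

    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel

    vocab = {"<unk>": 0, "hello": 1, "world": 2}    # every full word maps to an id
    tokenizer = Tokenizer(WordLevel(vocab, unk_token="<unk>"))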
@@ -121,10 +125,10 @@ class Unigram(Model):
 
     Args:
         vocab: ('`optional`) string:
-            Path to a vocabulary JSON file.
+            A list of vocabulary items and their relative score [("am", -0.2442),...]
 
     """
 
     @staticmethod
-    def __init__(self, vocab: Optional[str]):
+    def __init__(self, vocab: Optional[List[Tuple[str, float]]]):
         pass
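Per this stub, Unigram now takes the vocabulary as an in-memory list of (piece, score) pairs rather than a JSON path (note that the implementation above also passes an unk_id, so the stub may still be catching up). A toy sketch of the list form:

    from tokenizers import Tokenizer
    from tokenizers.models import Unigram

    vocab = [("<unk>", 0.0), ("▁hello", -2.3), ("▁world", -2.9)]   # illustrative scores
    tokenizer = Tokenizer(Unigram(vocab))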