Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-08 13:48:19 +00:00)
New version: staticmethods need to return an `IntoPy<PyObject>`, which is non-trivial for `PyClassInitializer`. Instead, I added a lower-level staticmethod that returns the raw objects, and the `from_file(s)` methods are implemented directly in Python.
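The pattern, sketched here in Python terms (the class and helpers below are illustrative stand-ins, not the library's actual code): the native side exposes only a low-level staticmethod returning plain objects, and the user-facing constructor helper lives in pure Python on top of it.

from typing import Dict

class MyModel:
    """Illustrative stand-in for a natively-implemented model class."""

    def __init__(self, vocab: Dict[str, int]):
        self.vocab = vocab

    @staticmethod
    def read_file(vocab_filename: str) -> Dict[str, int]:
        # Low-level staticmethod: returns plain data, which is trivial to
        # convert across the binding boundary, unlike a half-built instance.
        with open(vocab_filename, encoding="utf-8") as f:
            return {line.strip(): i for i, line in enumerate(f)}

    @staticmethod
    def from_file(vocab_filename: str, **kwargs) -> "MyModel":
        # Pure-Python wrapper: read the raw objects, then call the normal
        # constructor; no IntoPy<PyObject> is needed on the native side.
        vocab = MyModel.read_file(vocab_filename)
        return MyModel(vocab, **kwargs)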
@@ -80,6 +80,10 @@ class BertWordPieceTokenizer(BaseTokenizer):
 
         super().__init__(tokenizer, parameters)
 
+    def from_file(vocab: str, **kwargs):
+        vocab = WordPiece.read_file(vocab)
+        return BertWordPieceTokenizer(vocab, **kwargs)
+
     def train(
         self,
         files: Union[str, List[str]],
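A hypothetical call of the new helper above (the vocab file name is assumed, not part of the commit):

from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer.from_file("vocab.txt")
print(tokenizer.encode("Hello, world!").tokens)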
@@ -77,6 +77,11 @@ class ByteLevelBPETokenizer(BaseTokenizer):
 
         super().__init__(tokenizer, parameters)
 
+    @staticmethod
+    def from_files(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_files(vocab_filename, merges_filename)
+        return ByteLevelBPETokenizer(vocab, merges, **kwargs)
+
     def train(
         self,
         files: Union[str, List[str]],
@@ -94,6 +94,11 @@ class CharBPETokenizer(BaseTokenizer):
 
         super().__init__(tokenizer, parameters)
 
+    @staticmethod
+    def from_files(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_files(vocab_filename, merges_filename)
+        return CharBPETokenizer(vocab, merges, **kwargs)
+
     def train(
         self,
         files: Union[str, List[str]],
@@ -47,6 +47,11 @@ class SentencePieceBPETokenizer(BaseTokenizer):
 
         super().__init__(tokenizer, parameters)
 
+    @staticmethod
+    def from_files(vocab_filename: str, merges_filename: str, **kwargs):
+        vocab, merges = BPE.read_files(vocab_filename, merges_filename)
+        return SentencePieceBPETokenizer(vocab, merges, **kwargs)
+
     def train(
         self,
         files: Union[str, List[str]],
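The three BPE-based tokenizers above all gain the same two-step entry point. A hedged usage sketch (file names assumed to exist) showing that from_files is just the lower-level read_files plus the regular constructor:

from tokenizers import CharBPETokenizer
from tokenizers.models import BPE

# One-step: read the files and build the tokenizer.
tok_a = CharBPETokenizer.from_files("vocab.json", "merges.txt")

# Equivalent two-step path via the lower-level staticmethod.
vocab, merges = BPE.read_files("vocab.json", "merges.txt")
tok_b = CharBPETokenizer(vocab, merges)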
@@ -62,8 +62,6 @@ class BPE(Model):
         fuse_unk: (`optional`) bool:
             Multiple unk tokens get fused into only 1
     """
 
-    @staticmethod
     def __init__(
         self,
         vocab: Optional[Union[str, Dict[str, int]]],
@@ -77,6 +75,15 @@ class BPE(Model):
     ):
         pass
 
+    @staticmethod
+    def read_files(vocab_filename: str, merges_filename: str) -> Tuple[Vocab, Merges]:
+        pass
+
+    @staticmethod
+    def from_files(vocab_filename: str, merges_filename: str, **kwargs) -> BPE:
+        vocab, merges = BPE.read_files(vocab_filename, merges_filename)
+        return BPE(vocab, merges, **kwargs)
+
 class WordPiece(Model):
     """ WordPiece model class
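What the raw-object return makes possible, as a hedged sketch (file names assumed): the vocab and merges arrive as ordinary Python data and can be adjusted before the model is constructed.

from tokenizers.models import BPE

vocab, merges = BPE.read_files("vocab.json", "merges.txt")
vocab["<my_special_token>"] = len(vocab)  # extend the raw vocab first
bpe = BPE(vocab, merges)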
@@ -101,6 +108,15 @@ class WordPiece(Model):
     ):
         pass
 
+    @staticmethod
+    def read_file(vocab_filename: str) -> Tuple[Vocab]:
+        pass
+
+    @staticmethod
+    def from_files(vocab_filename: str, **kwargs) -> WordPiece:
+        vocab = WordPiece.read_file(vocab_filename)
+        return WordPiece(vocab, **kwargs)
+
 class WordLevel(Model):
     """
     Most simple tokenizer model based on mapping token from a vocab file to their corresponding id.
@@ -118,6 +134,15 @@ class WordLevel(Model):
     def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]):
         pass
 
+    @staticmethod
+    def read_file(vocab_filename: str) -> Tuple[Vocab]:
+        pass
+
+    @staticmethod
+    def from_files(vocab_filename: str, **kwargs) -> WordLevel:
+        vocab = WordLevel.read_file(vocab_filename)
+        return WordLevel(vocab, **kwargs)
+
 class Unigram(Model):
     """UnigramEncoding model class
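The single-file counterpart for the last two hunks, as a hedged sketch (the file name is assumed, and the vocab format is whatever read_file expects):

from tokenizers.models import WordLevel

vocab = WordLevel.read_file("vocab.json")
model = WordLevel(vocab, unk_token="[UNK]")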