Python - Typings update

This commit updates the Python type stubs for the tokenizer implementations: each `train` method now annotates its `files` parameter as `Union[str, List[str]]` instead of `List[str]`, so a single file path type-checks as well as a list of paths. The change touches BertWordPieceTokenizer, BPETokenizer, ByteLevelBPETokenizer, and SentencePieceBPETokenizer.
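A `Union[str, List[str]]` annotation usually pairs with a small normalization step at the top of the function; a minimal sketch of that idiom (the `_normalize_files` helper is hypothetical, not part of this commit):

    from typing import List, Union

    def _normalize_files(files: Union[str, List[str]]) -> List[str]:
        # A bare string is a single path, not an iterable of paths:
        # wrap it so downstream code can always iterate over a list.
        if isinstance(files, str):
            return [files]
        return files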
@@ -5,7 +5,7 @@ from tokenizers.pre_tokenizers import BertPreTokenizer
 from tokenizers.processors import BertProcessing
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List
+from typing import Optional, List, Union
 
 class BertWordPieceTokenizer(BaseTokenizer):
     """ Bert WordPiece Tokenizer """
@@ -62,7 +62,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
 
         super().__init__(tokenizer, parameters)
 
-    def train(self, files: List[str],
+    def train(self, files: Union[str, List[str]],
               vocab_size: int=30000,
               min_frequency: int=2,
               limit_alphabet: int=1000,
@@ -3,7 +3,7 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC, Sequence, Lowercase
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List
+from typing import Optional, List, Union
 
 class BPETokenizer(BaseTokenizer):
     """ Original BPE Tokenizer
@@ -42,7 +42,7 @@ class BPETokenizer(BaseTokenizer):
 
         super().__init__(tokenizer, parameters)
 
-    def train(self, files: List[str],
+    def train(self, files: Union[str, List[str]],
               vocab_size: int=30000,
               min_frequency: int=2,
               special_tokens: List[str]=["<unk>"],
@@ -3,7 +3,7 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List
+from typing import Optional, List, Union
 
 class ByteLevelBPETokenizer(BaseTokenizer):
     """ ByteLevelBPETokenizer
@@ -31,7 +31,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
 
         super().__init__(tokenizer, parameters)
 
-    def train(self, files: List[str],
+    def train(self, files: Union[str, List[str]],
               vocab_size: int=30000,
               min_frequency: int=2,
               show_progress: bool=True,
@@ -3,7 +3,7 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer
 
-from typing import Optional, List
+from typing import Optional, List, Union
 
 class SentencePieceBPETokenizer(BaseTokenizer):
     """ SentencePiece BPE Tokenizer
@@ -42,7 +42,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):
 
         super().__init__(tokenizer, parameters)
 
-    def train(self, files: List[str],
+    def train(self, files: Union[str, List[str]],
               vocab_size: int=30000,
               min_frequency: int=2,
               special_tokens: List[str]=["<unk>"],
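At a call site, the widened stubs mean either form below type-checks; a hedged usage sketch (the corpus file name is illustrative, and `BertWordPieceTokenizer` is importable from the top-level `tokenizers` package):

    from tokenizers import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer()

    # Before this commit, the stubs required a list even for one file:
    tokenizer.train(["corpus.txt"], vocab_size=30000, min_frequency=2)

    # After it, a single path is accepted by the type checker as well:
    tokenizer.train("corpus.txt", vocab_size=30000, min_frequency=2)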