mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)

Python - Typings update
@@ -5,7 +5,7 @@ from tokenizers.pre_tokenizers import BertPreTokenizer
 from tokenizers.processors import BertProcessing
 from .base_tokenizer import BaseTokenizer

-from typing import Optional, List
+from typing import Optional, List, Union

 class BertWordPieceTokenizer(BaseTokenizer):
     """ Bert WordPiece Tokenizer """
@@ -62,7 +62,7 @@ class BertWordPieceTokenizer(BaseTokenizer):

         super().__init__(tokenizer, parameters)

-    def train(self, files: List[str],
+    def train(self, files: Union[str, List[str]],
               vocab_size: int=30000,
               min_frequency: int=2,
               limit_alphabet: int=1000,
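
With the annotation widened to Union[str, List[str]], the single-path call that previously fell outside the declared type is now covered. A minimal usage sketch, assuming the package is importable as tokenizers; the corpus paths below are placeholders, not files from this repository:

from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()

# A list of corpus files was already covered by the old List[str] annotation.
tokenizer.train(["corpus-part-1.txt", "corpus-part-2.txt"], vocab_size=30000)

# With Union[str, List[str]] a single path is also consistent with the signature.
tokenizer.train("corpus.txt", vocab_size=30000, min_frequency=2, limit_alphabet=1000)
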
@@ -3,7 +3,7 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC, Sequence, Lowercase
 from .base_tokenizer import BaseTokenizer

-from typing import Optional, List
+from typing import Optional, List, Union

 class BPETokenizer(BaseTokenizer):
     """ Original BPE Tokenizer
@@ -42,7 +42,7 @@ class BPETokenizer(BaseTokenizer):

         super().__init__(tokenizer, parameters)

-    def train(self, files: List[str],
+    def train(self, files: Union[str, List[str]],
               vocab_size: int=30000,
               min_frequency: int=2,
               special_tokens: List[str]=["<unk>"],
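
BPETokenizer.train gets the same widening. The annotation itself has no runtime effect; it changes what a static type checker accepts: under List[str] a bare string argument is rejected, under Union[str, List[str]] it is allowed. A self-contained sketch with stand-in functions (not the library's code):

from typing import List, Union

def train_old(files: List[str]) -> None:
    """Stand-in for the previous annotation."""
    ...

def train_new(files: Union[str, List[str]]) -> None:
    """Stand-in for the updated annotation."""
    ...

train_old(["a.txt", "b.txt"])  # fine under both annotations
train_old("a.txt")             # flagged by a type checker: "str" is not "List[str]"
train_new("a.txt")             # accepted: the Union covers the single-path case
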
@@ -3,7 +3,7 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer

-from typing import Optional, List
+from typing import Optional, List, Union

 class ByteLevelBPETokenizer(BaseTokenizer):
     """ ByteLevelBPETokenizer
@@ -31,7 +31,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):

         super().__init__(tokenizer, parameters)

-    def train(self, files: List[str],
+    def train(self, files: Union[str, List[str]],
               vocab_size: int=30000,
               min_frequency: int=2,
               show_progress: bool=True,
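
ByteLevelBPETokenizer.train is changed identically, so the same Union[str, List[str]] spelling now recurs across every train signature touched here. A shared alias could express that intent once; this is only an illustrative sketch, and Files and ExampleTokenizer are hypothetical names, not part of this commit:

from typing import List, Union

# Hypothetical alias; the commit writes Union[str, List[str]] inline in each file.
Files = Union[str, List[str]]

class ExampleTokenizer:
    def train(self, files: Files, vocab_size: int = 30000, min_frequency: int = 2) -> None:
        # Placeholder body; the real implementations delegate to their trainers.
        ...
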
@@ -3,7 +3,7 @@ from tokenizers.models import BPE
 from tokenizers.normalizers import NFKC
 from .base_tokenizer import BaseTokenizer

-from typing import Optional, List
+from typing import Optional, List, Union

 class SentencePieceBPETokenizer(BaseTokenizer):
     """ SentencePiece BPE Tokenizer
@@ -42,7 +42,7 @@ class SentencePieceBPETokenizer(BaseTokenizer):

         super().__init__(tokenizer, parameters)

-    def train(self, files: List[str],
+    def train(self, files: Union[str, List[str]],
               vocab_size: int=30000,
               min_frequency: int=2,
               special_tokens: List[str]=["<unk>"],
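
The hunks above cover only the signatures, so whether the train bodies normalize a lone string before iterating is not visible in this diff. A minimal sketch of the usual pattern for a Union[str, List[str]] parameter, using a hypothetical helper name:

from typing import List, Union

def normalize_files(files: Union[str, List[str]]) -> List[str]:
    # Accept either a single path or a list of paths and always return a list.
    if isinstance(files, str):
        return [files]
    return files

assert normalize_files("corpus.txt") == ["corpus.txt"]
assert normalize_files(["a.txt", "b.txt"]) == ["a.txt", "b.txt"]
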