Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-16 17:18:43 +00:00)

Commit: Python - Update all typings
@@ -1,22 +1,22 @@
 from .. import decoders
 from typing import List


 class Decoder:
-    """Decoder
+    """ Base class for all decoders
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a Decoder will return an instance of this class when instantiated.
     """

     @staticmethod
     def custom():
         pass

-    def decode(tokens: List[str]) -> str:
+    def decode(self, tokens: List[str]) -> str:
+        """ Decode the given list of strings to a final string """
         pass


 class ByteLevel:
-    """ByteLevel
-    """
+    """ ByteLevel Decoder """

     @staticmethod
     def new() -> Decoder:
+        """ Instantiate a new ByteLevel Decoder """
         pass

@@ -25,4 +25,5 @@ class WordPiece:

     @staticmethod
     def new() -> Decoder:
+        """ Instantiate a new WordPiece Decoder """
         pass
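For reference, a minimal sketch of how these decoder stubs would be used; it assumes the public tokenizers package re-exports this module (as the `from .. import decoders` line suggests), and the sample tokens are illustrative ByteLevel output:

from tokenizers import decoders

# ByteLevel.new() is the static constructor declared in the stub above.
decoder = decoders.ByteLevel.new()

# decode() joins a list of tokens back into a single string; "Ġ" is the
# byte-level representation of a leading space, so the expected result
# here is "Hello world".
text = decoder.decode(["Hello", "Ġworld"])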
@@ -1,12 +1,41 @@
 from .. import normalizers
+from typing import Optional


 class Normalizer:
-    """Normalizer
+    """ Base class for all normalizers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    Normalizer will return an instance of this class when instantiated.
     """


 class BertNormalizer:
-    """BertNormalizer
+    """ BertNormalizer
+
+    Takes care of normalizing raw text before giving it to a Bert model.
+    This includes cleaning the text, handling accents, Chinese chars, and lowercasing.
     """

-    def new() -> Normalizer:
+    @staticmethod
+    def new(clean_text: Optional[bool]=True,
+            handle_chinese_chars: Optional[bool]=True,
+            strip_accents: Optional[bool]=True,
+            lowercase: Optional[bool]=True) -> Normalizer:
+        """ Instantiate a BertNormalizer with the given options.
+
+        Args:
+            clean_text: (`optional`) boolean:
+                Whether to clean the text, by removing any control characters
+                and replacing all whitespaces by the classic one.
+
+            handle_chinese_chars: (`optional`) boolean:
+                Whether to handle Chinese chars by putting spaces around them.
+
+            strip_accents: (`optional`) boolean:
+                Whether to strip all accents.
+
+            lowercase: (`optional`) boolean:
+                Whether to lowercase.
+
+        Returns:
+            Normalizer
+        """
         pass
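A short usage sketch for the new BertNormalizer.new signature (illustrative, not part of the commit); keyword arguments are assumed from the defaults declared in the stub:

from tokenizers import normalizers

# All four options default to True; turning strip_accents off keeps
# accented characters such as "é" intact while still lowercasing.
normalizer = normalizers.BertNormalizer.new(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=True,
)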
@@ -1,38 +1,68 @@
 from .. import pre_tokenizers
 from typing import Optional, List, Tuple

 Offsets = Tuple[int, int]


 class PreTokenizer:
-    """PreTokenizer
+    """ Base class for all pre-tokenizers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    PreTokenizer will return an instance of this class when instantiated.
     """

     def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
+        """ Pre-tokenize the given sequence """
         pass


 class ByteLevel:
-    """ByteLevel
+    """ ByteLevel PreTokenizer
+
+    This pre-tokenizer takes care of replacing all bytes of the given string
+    with a corresponding representation, as well as splitting into words.
     """

     @staticmethod
-    def new() -> PreTokenizer:
+    def new(add_prefix_space: Optional[bool]=True) -> PreTokenizer:
+        """ Instantiate a new ByteLevel PreTokenizer
+
+        Args:
+            add_prefix_space: (`optional`) boolean:
+                Whether a space should be added at the very beginning of the sequence
+                if there isn't one already.
+
+        Returns:
+            PreTokenizer
+        """
         pass

     @staticmethod
     def alphabet() -> List[str]:
+        """ Returns the alphabet used by this PreTokenizer.
+
+        Since the ByteLevel works as its name suggests, at the byte level, it
+        encodes any byte to one visible character. This means that there is a
+        total of 256 different characters composing this alphabet.
+        """
         pass


 class Whitespace:
-    """Whitespace
+    """ Whitespace PreTokenizer
+
+    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
     """

     @staticmethod
     def new() -> PreTokenizer:
+        """ Instantiate a new Whitespace PreTokenizer """
         pass


 class BertPreTokenizer:
-    """BertPreTokenizer
+    """ BertPreTokenizer
+
+    This pre-tokenizer splits tokens on spaces, and also on punctuation.
+    Each occurrence of a punctuation character will be treated separately.
     """

     @staticmethod
     def new() -> PreTokenizer:
+        """ Instantiate a new BertPreTokenizer """
         pass
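To make the return type concrete, a sketch of the pre-tokenizer API above; the offsets shown follow from the documented `\w+|[^\w\s]+` regex and are expected rather than verified output:

from tokenizers import pre_tokenizers

# Whitespace splits words and punctuation separately, returning
# (token, (start, end)) pairs per the Offsets = Tuple[int, int] alias.
pre_tok = pre_tokenizers.Whitespace.new()
pairs = pre_tok.pre_tokenize("Hello, world!")
# expected: [("Hello", (0, 5)), (",", (5, 6)), ("world", (7, 12)), ("!", (12, 13))]

# ByteLevel encodes each possible byte as one visible character,
# so its alphabet contains exactly 256 entries.
assert len(pre_tokenizers.ByteLevel.alphabet()) == 256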
@@ -1,13 +1,33 @@
 from .. import processors
 from typing import Tuple


 class PostProcessor:
-    """PostProcessor
+    """ Base class for all post-processors
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a PostProcessor will return an instance of this class when instantiated.
     """


 class BertProcessing:
-    """BertProcessing
+    """ BertProcessing
+
+    This post-processor takes care of adding the special tokens needed by
+    a Bert model:
+        - a SEP token
+        - a CLS token
     """

     @staticmethod
     def new(sep: Tuple[str, int], cls: Tuple[str, int]) -> PostProcessor:
+        """ Instantiate a new BertProcessing with the given tokens
+
+        Args:
+            sep: Tuple[str, int]:
+                A tuple with the string representation of the SEP token, and its id
+
+            cls: Tuple[str, int]:
+                A tuple with the string representation of the CLS token, and its id
+
+        Returns:
+            PostProcessor
+        """
         pass
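A usage sketch for BertProcessing.new; the token ids 101 and 102 are the usual bert-base-uncased values and are only illustrative here:

from tokenizers import processors

# Each argument is a (token string, token id) tuple, matching the
# Tuple[str, int] annotations in the stub above.
processor = processors.BertProcessing.new(("[SEP]", 102), ("[CLS]", 101))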
@@ -1,21 +1,108 @@
 from .. import trainers
 from typing import Optional, List


 class Trainer:
-    """Trainer
+    """ Base class for all trainers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    Trainer will return an instance of this class when instantiated.
     """


 class BpeTrainer:
-    """BpeTrainer
+    """ BpeTrainer
+
+    Capable of training a BPE model
     """

     @staticmethod
-    def new() -> Trainer:
+    def new(vocab_size: int=30000,
+            min_frequency: int=0,
+            show_progress: bool=True,
+            special_tokens: List[str]=[],
+            limit_alphabet: Optional[int]=None,
+            initial_alphabet: List[str]=[],
+            continuing_subword_prefix: Optional[str]=None,
+            end_of_word_suffix: Optional[str]=None) -> Trainer:
+        """ Instantiate a new BpeTrainer with the given options:
+
+        Args:
+            vocab_size: unsigned int:
+                The size of the final vocabulary, including all tokens and alphabet.
+
+            min_frequency: unsigned int:
+                The minimum frequency a pair should have in order to be merged.
+
+            show_progress: boolean:
+                Whether to show progress bars while training.
+
+            special_tokens: List[str]:
+                A list of special tokens the model should know of.
+
+            limit_alphabet: unsigned int:
+                The maximum different characters to keep in the alphabet.
+
+            initial_alphabet: List[str]:
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
+
+            continuing_subword_prefix: Optional[str]:
+                A prefix to be used for every subword that is not a beginning-of-word.
+
+            end_of_word_suffix: Optional[str]:
+                A suffix to be used for every subword that is an end-of-word.
+
+        Returns:
+            Trainer
+        """
         pass


 class WordPieceTrainer:
-    """WordPieceTrainer
+    """ WordPieceTrainer
+
+    Capable of training a WordPiece model
     """

     @staticmethod
-    def new() -> Trainer:
+    def new(vocab_size: int=30000,
+            min_frequency: int=0,
+            show_progress: bool=True,
+            special_tokens: List[str]=[],
+            limit_alphabet: Optional[int]=None,
+            initial_alphabet: List[str]=[],
+            continuing_subword_prefix: Optional[str]="##",
+            end_of_word_suffix: Optional[str]=None) -> Trainer:
+        """ Instantiate a new WordPieceTrainer with the given options:
+
+        Args:
+            vocab_size: unsigned int:
+                The size of the final vocabulary, including all tokens and alphabet.
+
+            min_frequency: unsigned int:
+                The minimum frequency a pair should have in order to be merged.
+
+            show_progress: boolean:
+                Whether to show progress bars while training.
+
+            special_tokens: List[str]:
+                A list of special tokens the model should know of.
+
+            limit_alphabet: unsigned int:
+                The maximum different characters to keep in the alphabet.
+
+            initial_alphabet: List[str]:
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If the strings contain more than one character, only the first one
+                is kept.
+
+            continuing_subword_prefix: Optional[str]:
+                A prefix to be used for every subword that is not a beginning-of-word.
+
+            end_of_word_suffix: Optional[str]:
+                A suffix to be used for every subword that is an end-of-word.
+
+        Returns:
+            Trainer
+        """
         pass
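Finally, a sketch of the two trainer constructors; every value below is illustrative, and keyword arguments are assumed from the stub defaults:

from tokenizers import trainers

# BPE trainer with a smaller vocabulary and a GPT-style end-of-word
# suffix; pairs seen fewer than 2 times are never merged.
bpe_trainer = trainers.BpeTrainer.new(
    vocab_size=10000,
    min_frequency=2,
    special_tokens=["<unk>"],
    end_of_word_suffix="</w>",
)

# WordPiece trainer; continuing_subword_prefix already defaults to "##",
# the convention used by Bert vocabularies.
wp_trainer = trainers.WordPieceTrainer.new(
    vocab_size=30000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
)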