Python - Update all typings

commit 772d0680b6 (parent 0079a7a6b7)
Author: Anthony MOI
Date: 2020-01-06 20:03:00 -05:00

5 changed files with 195 additions and 28 deletions


@@ -1,22 +1,22 @@
 from .. import decoders
 from typing import List

 class Decoder:
-    """Decoder
+    """ Base class for all decoders
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a Decoder will return an instance of this class when instantiated.
     """

     @staticmethod
     def custom():
         pass

-    def decode(tokens: List[str]) -> str:
+    def decode(self, tokens: List[str]) -> str:
+        """ Decode the given list of strings to a final string """
         pass

 class ByteLevel:
-    """ByteLevel
-    """
+    """ ByteLevel Decoder """

     @staticmethod
     def new() -> Decoder:
+        """ Instantiate a new ByteLevel Decoder """
         pass

 class WordPiece:
@@ -25,4 +25,5 @@ class WordPiece:
     @staticmethod
     def new() -> Decoder:
+        """ Instantiate a new WordPiece Decoder """
         pass
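Read together, these stubs describe a constructor-plus-method surface: each concrete decoder is built through a static new() and then used via decode(). A minimal usage sketch, assuming the tokenizers package exposes these modules the way the relative imports suggest; the token lists and expected outputs are illustrative, not taken from the source:

from tokenizers import decoders

# Build decoders through the static constructors declared in the stubs.
byte_level = decoders.ByteLevel.new()
wordpiece = decoders.WordPiece.new()

# decode() collapses a list of tokens back into a single string.
# "Ġ" marks a leading space in GPT-2 style byte-level vocabularies.
print(byte_level.decode(["Hello", "Ġworld"]))  # expected: "Hello world"
print(wordpiece.decode(["hel", "##lo"]))       # expected: "hello"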


@@ -1,12 +1,41 @@
 from .. import normalizers
+from typing import Optional

 class Normalizer:
-    """Normalizer
+    """ Base class for all normalizers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    Normalizer will return an instance of this class when instantiated.
     """

 class BertNormalizer:
-    """BertNormalizer
+    """ BertNormalizer
+
+    Takes care of normalizing raw text before giving it to a Bert model.
+    This includes cleaning the text, handling accents, Chinese characters and lowercasing
     """

-    def new() -> Normalizer:
+    @staticmethod
+    def new(clean_text: Optional[bool]=True,
+            handle_chinese_chars: Optional[bool]=True,
+            strip_accents: Optional[bool]=True,
+            lowercase: Optional[bool]=True) -> Normalizer:
+        """ Instantiate a BertNormalizer with the given options.
+
+        Args:
+            clean_text: (`optional`) boolean:
+                Whether to clean the text, by removing any control characters
+                and replacing all whitespace characters with the classic one.
+
+            handle_chinese_chars: (`optional`) boolean:
+                Whether to handle Chinese characters by putting spaces around them.
+
+            strip_accents: (`optional`) boolean:
+                Whether to strip all accents.
+
+            lowercase: (`optional`) boolean:
+                Whether to lowercase the text.
+
+        Returns:
+            Normalizer
+        """
         pass
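The new signature makes every Bert normalization step individually switchable. A short construction sketch under the same package-layout assumption; the stub declares no public method on Normalizer itself, so the instance would simply be handed on to a tokenizer:

from tokenizers import normalizers

# All four options default to True; a cased Bert model would keep
# accents and casing intact.
normalizer = normalizers.BertNormalizer.new(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=False,
)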


@@ -1,38 +1,68 @@
 from .. import pre_tokenizers
+from typing import Optional, List, Tuple
+
+Offsets = Tuple[int, int]

 class PreTokenizer:
-    """PreTokenizer
+    """ Base class for all pre-tokenizers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    PreTokenizer will return an instance of this class when instantiated.
     """

+    def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
+        """ Pre-tokenize the given sequence """
+        pass

 class ByteLevel:
-    """ByteLevel
+    """ ByteLevel PreTokenizer
+
+    This pre-tokenizer takes care of replacing all bytes of the given string
+    with a corresponding representation, as well as splitting into words.
     """

     @staticmethod
-    def new() -> PreTokenizer:
+    def new(add_prefix_space: Optional[bool]=True) -> PreTokenizer:
+        """ Instantiate a new ByteLevel PreTokenizer
+
+        Args:
+            add_prefix_space: (`optional`) boolean:
+                Whether a space should be added at the very beginning of the sequence
+                if there isn't one already.
+
+        Returns:
+            PreTokenizer
+        """
         pass

+    @staticmethod
+    def alphabet() -> List[str]:
+        """ Returns the alphabet used by this PreTokenizer.
+
+        Since the ByteLevel works, as its name suggests, at the byte level, it
+        encodes each byte as one visible character. This means that there is a
+        total of 256 different characters composing this alphabet.
+        """
+        pass

 class Whitespace:
-    """Whitespace
+    """ Whitespace PreTokenizer
+
+    This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
     """

     @staticmethod
     def new() -> PreTokenizer:
+        """ Instantiate a new Whitespace PreTokenizer """
         pass

 class BertPreTokenizer:
-    """BertPreTokenizer
+    """ BertPreTokenizer
+
+    This pre-tokenizer splits tokens on spaces, and also on punctuation.
+    Each occurrence of a punctuation character will be treated separately.
     """

     @staticmethod
     def new() -> PreTokenizer:
+        """ Instantiate a new BertPreTokenizer """
         pass
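These stubs pin down the pre-tokenizer contract: pre_tokenize() returns (substring, (start, end)) pairs per the Offsets alias. A small sketch under the same package-layout assumption; the output shown in the comment follows from the documented Whitespace regex and the 256-byte alphabet, not from running the code:

from tokenizers import pre_tokenizers

# One visible character per byte, so the alphabet has exactly 256 entries.
byte_level = pre_tokenizers.ByteLevel.new(add_prefix_space=True)
assert len(pre_tokenizers.ByteLevel.alphabet()) == 256

# Whitespace splits on `\w+|[^\w\s]+`, so each punctuation run becomes
# its own (substring, offsets) pair.
whitespace = pre_tokenizers.Whitespace.new()
print(whitespace.pre_tokenize("Hello, world!"))
# e.g. [("Hello", (0, 5)), (",", (5, 6)), ("world", (7, 12)), ("!", (12, 13))]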


@@ -1,13 +1,33 @@
 from .. import processors
 from typing import Tuple

 class PostProcessor:
-    """PostProcessor
+    """ Base class for all post-processors
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of
+    a PostProcessor will return an instance of this class when instantiated.
     """

 class BertProcessing:
-    """BertProcessing
+    """ BertProcessing
+
+    This post-processor takes care of adding the special tokens needed by
+    a Bert model:
+        - a SEP token
+        - a CLS token
     """

     @staticmethod
     def new(sep: Tuple[str, int], cls: Tuple[str, int]) -> PostProcessor:
+        """ Instantiate a new BertProcessing with the given tokens
+
+        Args:
+            sep: Tuple[str, int]:
+                A tuple with the string representation of the SEP token, and its id
+
+            cls: Tuple[str, int]:
+                A tuple with the string representation of the CLS token, and its id
+
+        Returns:
+            PostProcessor
+        """
         pass
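Both arguments are (token, id) tuples rather than bare strings, so the post-processor never has to consult the vocabulary itself. A construction sketch; the ids below are the conventional bert-base-uncased values and are purely illustrative here:

from tokenizers import processors

# (string form, vocabulary id) for each special token.
processor = processors.BertProcessing.new(
    sep=("[SEP]", 102),
    cls=("[CLS]", 101),
)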


@@ -1,21 +1,108 @@
 from .. import trainers
+from typing import Optional, List

 class Trainer:
-    """Trainer
+    """ Base class for all trainers
+
+    This class is not supposed to be instantiated directly. Instead, any implementation of a
+    Trainer will return an instance of this class when instantiated.
     """

 class BpeTrainer:
-    """BpeTrainer
+    """ BpeTrainer
+
+    Capable of training a BPE model
     """

     @staticmethod
-    def new() -> Trainer:
+    def new(vocab_size: int=30000,
+            min_frequency: int=0,
+            show_progress: bool=True,
+            special_tokens: List[str]=[],
+            limit_alphabet: Optional[int]=None,
+            initial_alphabet: List[str]=[],
+            continuing_subword_prefix: Optional[str]=None,
+            end_of_word_suffix: Optional[str]=None) -> Trainer:
+        """ Instantiate a new BpeTrainer with the given options:
+
+        Args:
+            vocab_size: unsigned int:
+                The size of the final vocabulary, including all tokens and alphabet.
+
+            min_frequency: unsigned int:
+                The minimum frequency a pair should have in order to be merged.
+
+            show_progress: boolean:
+                Whether to show progress bars while training.
+
+            special_tokens: List[str]:
+                A list of special tokens the model should know of.
+
+            limit_alphabet: unsigned int:
+                The maximum number of different characters to keep in the alphabet.
+
+            initial_alphabet: List[str]:
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If a string contains more than one character, only the first one
+                is kept.
+
+            continuing_subword_prefix: Optional[str]:
+                A prefix to be used for every subword that is not a beginning-of-word.
+
+            end_of_word_suffix: Optional[str]:
+                A suffix to be used for every subword that is an end-of-word.
+
+        Returns:
+            Trainer
+        """
         pass

 class WordPieceTrainer:
-    """WordPieceTrainer
+    """ WordPieceTrainer
+
+    Capable of training a WordPiece model
     """

     @staticmethod
-    def new() -> Trainer:
+    def new(vocab_size: int=30000,
+            min_frequency: int=0,
+            show_progress: bool=True,
+            special_tokens: List[str]=[],
+            limit_alphabet: Optional[int]=None,
+            initial_alphabet: List[str]=[],
+            continuing_subword_prefix: Optional[str]="##",
+            end_of_word_suffix: Optional[str]=None) -> Trainer:
+        """ Instantiate a new WordPieceTrainer with the given options:
+
+        Args:
+            vocab_size: unsigned int:
+                The size of the final vocabulary, including all tokens and alphabet.
+
+            min_frequency: unsigned int:
+                The minimum frequency a pair should have in order to be merged.
+
+            show_progress: boolean:
+                Whether to show progress bars while training.
+
+            special_tokens: List[str]:
+                A list of special tokens the model should know of.
+
+            limit_alphabet: unsigned int:
+                The maximum number of different characters to keep in the alphabet.
+
+            initial_alphabet: List[str]:
+                A list of characters to include in the initial alphabet, even
+                if not seen in the training dataset.
+                If a string contains more than one character, only the first one
+                is kept.
+
+            continuing_subword_prefix: Optional[str]:
+                A prefix to be used for every subword that is not a beginning-of-word.
+
+            end_of_word_suffix: Optional[str]:
+                A suffix to be used for every subword that is an end-of-word.
+
+        Returns:
+            Trainer
+        """
         pass
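The two trainers share the same options surface and differ only in their defaults, most visibly continuing_subword_prefix, which WordPiece defaults to "##". A construction sketch under the same assumptions; all concrete values below are illustrative choices, not defaults from the source:

from tokenizers import trainers

# BPE: mark word endings explicitly with a suffix.
bpe_trainer = trainers.BpeTrainer.new(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["<unk>", "<pad>"],
    end_of_word_suffix="</w>",
)

# WordPiece: same options, but continuing subwords default to the "##" prefix.
wp_trainer = trainers.WordPieceTrainer.new(
    vocab_size=30000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)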