Black pre-commit after rebase.

This commit is contained in:
Nicolas Patry
2020-09-23 11:37:09 +02:00
parent acd4a7599f
commit 9b1ef9d895
8 changed files with 80 additions and 97 deletions

View File

@@ -258,7 +258,7 @@ class Encoding:
@staticmethod
def merge(encodings: List[Encoding], growing_offsets: bool = True) -> Encoding:
-""" Merge the list of Encoding into one final Encoding
+"""Merge the list of Encoding into one final Encoding
Args:
encodings: List[Encoding]:
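A minimal sketch of merging two encodings into one, assuming a trained tokenizer was previously saved to `tokenizer.json` (hypothetical path):

```python
from tokenizers import Tokenizer, Encoding

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

first = tokenizer.encode("Hello")
second = tokenizer.encode("world")

# With growing_offsets=True (the default), the offsets of the second
# Encoding are shifted as if both texts formed one concatenated string.
merged = Encoding.merge([first, second], growing_offsets=True)
print(merged.tokens, merged.offsets)
```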
@@ -289,7 +289,7 @@ class Encoding:
pass
@property
def offsets(self) -> List[Offsets]:
-""" The offsets.
+"""The offsets.
These offsets can be used to index any `IndexableString` directly. If you want to
index the original `str`, make sure to retrieve the converted offsets using the `.offsets`
method on the `original_str`.
@@ -388,7 +388,7 @@ class Encoding:
pad_token: Optional[str] = "[PAD]",
direction: Optional[str] = "right",
):
-""" Pad the current Encoding at the given length
+"""Pad the current Encoding at the given length
Args:
length: int:
@@ -408,7 +408,7 @@ class Encoding:
"""
pass
def truncate(self, max_length: int, stride: Optional[int] = 0):
-""" Truncate the current Encoding at the given max_length
+"""Truncate the current Encoding at the given max_length
Args:
max_length: int:
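A short sketch of truncating and padding a single Encoding in place, under the same hypothetical `tokenizer.json` assumption as above:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
encoding = tokenizer.encode("A fairly long sentence to play with")

# Keep at most 5 tokens; stride controls the overlap kept in overflowing parts.
encoding.truncate(5, stride=0)

# Then pad back up to 8 tokens on the right with "[PAD]" (id 0 assumed here).
encoding.pad(8, pad_id=0, pad_token="[PAD]", direction="right")
print(encoding.tokens)
```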
@@ -421,7 +421,7 @@ class Encoding:
pass
class AddedToken:
-""" AddedToken represents a token to be added to a Tokenizer
+"""AddedToken represents a token to be added to a Tokenizer
An AddedToken can have special options defining the way it should behave.
"""
@@ -434,7 +434,7 @@ class AddedToken:
rstrip: bool = False,
normalized: bool = True,
) -> AddedToken:
-""" Instantiate a new AddedToken
+"""Instantiate a new AddedToken
Args:
content: str:
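For illustration, a hedged sketch of how these options are typically passed (the token strings below are made up):

```python
from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

# Plain strings and AddedToken instances can be mixed; the options control
# how the token is matched in raw text.
tokenizer.add_tokens([
    "plain_token",
    AddedToken("<ent>", single_word=True, lstrip=True, rstrip=False),
])
```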
@@ -464,7 +464,7 @@ class AddedToken:
pass
class Tokenizer:
-""" Tokenizer
+"""Tokenizer
A Tokenizer works as a pipeline, it processes some raw text as input and outputs
an `Encoding`.
@@ -481,7 +481,7 @@ class Tokenizer:
"""
def __new__(cls, model: models.Model) -> Tokenizer:
-""" Instantiate a new Tokenizer using the given Model
+"""Instantiate a new Tokenizer using the given Model
Args:
model: models.Model:
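For instance, wrapping an empty BPE model gives a blank, trainable tokenizer:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# An empty model is enough to instantiate; the vocabulary comes later,
# from training or from loading a serialized tokenizer.
tokenizer = Tokenizer(BPE())
```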
@@ -493,7 +493,7 @@ class Tokenizer:
pass
@staticmethod
def from_str(s: str) -> Tokenizer:
-""" Instantiate a new Tokenizer from the given JSON string
+"""Instantiate a new Tokenizer from the given JSON string
Args:
s: str:
@@ -505,7 +505,7 @@ class Tokenizer:
pass
@staticmethod
def from_file(path: str) -> Tokenizer:
-""" Instantiate a new Tokenizer from the given file
+"""Instantiate a new Tokenizer from the given file
Args:
path: str:
@@ -517,7 +517,7 @@ class Tokenizer:
pass
@staticmethod
def from_buffer(buffer: bytes) -> Tokenizer:
-""" Instantiate a new Tokenizer from the given buffer
+"""Instantiate a new Tokenizer from the given buffer
Args:
buffer: bytes:
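These entry points all round-trip through the same JSON representation; a sketch:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

as_json = tokenizer.to_str(True)                           # pretty-printed JSON string
same = Tokenizer.from_str(as_json)                         # back from the string
buffered = Tokenizer.from_buffer(as_json.encode("utf-8"))  # from raw bytes

tokenizer.save("tokenizer.json")                  # write to disk...
reloaded = Tokenizer.from_file("tokenizer.json")  # ...and read back
```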
@@ -528,7 +528,7 @@ class Tokenizer:
"""
pass
def to_str(self, pretty: bool = False) -> str:
-""" Get a serialized JSON version of the Tokenizer as a str
+"""Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
@@ -539,7 +539,7 @@ class Tokenizer:
"""
pass
def save(self, path: str, pretty: bool = False):
-""" Save the Tokenizer as JSON to the given path
+"""Save the Tokenizer as JSON to the given path
Args:
pretty: bool:
@@ -592,7 +592,7 @@ class Tokenizer:
"""
pass
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
-""" Returns the vocabulary
+"""Returns the vocabulary
Args:
with_added_tokens: boolean:
@@ -603,7 +603,7 @@ class Tokenizer:
"""
pass
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
-""" Returns the size of the vocabulary
+"""Returns the size of the vocabulary
Args:
with_added_tokens: boolean:
@@ -614,7 +614,7 @@ class Tokenizer:
"""
pass
def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
-""" Enable the truncation
+"""Enable the truncation
Args:
max_length: unsigned int:
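A sketch of enabling truncation and reading the parameters back through the property documented below:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

# Cap sequences at 512 tokens, keep a 128-token overlap in overflowing
# parts, and truncate the longest member of a pair first.
tokenizer.enable_truncation(512, stride=128, strategy="longest_first")

print(tokenizer.truncation)  # dict of current parameters, or None if disabled
```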
@@ -633,7 +633,7 @@ class Tokenizer:
pass
@property
def truncation(self) -> Optional[dict]:
-""" Get the current truncation parameters
+"""Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
@@ -649,7 +649,7 @@ class Tokenizer:
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
-""" Enable the padding
+"""Enable the padding
Args:
direction: (`optional`) str:
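A sketch of the padding options; when `length` is omitted, batches are padded to their longest member instead of a fixed size:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

tokenizer.enable_padding(
    direction="right",
    pad_id=0,
    pad_token="[PAD]",
    length=128,  # fixed size; omit to pad to the longest sequence in a batch
)
print(tokenizer.padding)
```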
@@ -679,7 +679,7 @@ class Tokenizer:
pass
@property
def padding(self) -> Optional[dict]:
-""" Get the current padding parameters
+"""Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
@@ -693,7 +693,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
-""" Encode the given sequence and pair. This method can process raw text sequences as well
+"""Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
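Both input styles in one hedged sketch (the `tokenizer.json` path is a placeholder):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

# Raw text, optionally with a pair:
encoding = tokenizer.encode("Hello there!", "And its pair")

# Already pre-tokenized input must be flagged as such:
encoding = tokenizer.encode(["Hello", "there", "!"], is_pretokenized=True)
print(encoding.tokens, encoding.ids)
```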
@@ -721,7 +721,7 @@ class Tokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
-""" Encode the given inputs. This method accept both raw text sequences as well as already
+"""Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
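Single sequences and pairs can be mixed in one batch; pairs are passed as tuples, as in this sketch:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

encodings = tokenizer.encode_batch([
    "A single sequence",
    ("A pair, first part", "and its second part"),
])
print([e.tokens for e in encodings])
```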
@@ -748,7 +748,7 @@ class Tokenizer:
"""
pass
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
-""" Decode the given list of ids to a string sequence
+"""Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
@@ -764,7 +764,7 @@ class Tokenizer:
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> str:
-""" Decode the list of sequences to a list of string sequences
+"""Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
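A round-trip sketch covering both decoding entry points:

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

encoding = tokenizer.encode("Hello world")
print(tokenizer.decode(encoding.ids))  # special tokens skipped by default
print(tokenizer.decode(encoding.ids, skip_special_tokens=False))

batch = tokenizer.encode_batch(["Hello world", "Goodbye"])
print(tokenizer.decode_batch([e.ids for e in batch]))
```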
@@ -778,7 +778,7 @@ class Tokenizer:
"""
pass
def token_to_id(self, token: str) -> Optional[int]:
-""" Convert the given token to its corresponding id
+"""Convert the given token to its corresponding id
Args:
token: str:
@@ -789,7 +789,7 @@ class Tokenizer:
"""
pass
def id_to_token(self, id: int) -> Optional[str]:
-""" Convert the given token id to its corresponding string
+"""Convert the given token id to its corresponding string
Args:
token: id:
@@ -800,7 +800,7 @@ class Tokenizer:
"""
pass
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-""" Add the given tokens to the vocabulary
+"""Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
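A sketch tying add_tokens to the lookups documented above (get_vocab_size, token_to_id, id_to_token):

```python
from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
before = tokenizer.get_vocab_size()

added = tokenizer.add_tokens(["new_token", AddedToken("<ent>", single_word=True)])
assert tokenizer.get_vocab_size() == before + added

# Round-trip between a token and its id.
token_id = tokenizer.token_to_id("new_token")
assert tokenizer.id_to_token(token_id) == "new_token"
```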
@@ -812,7 +812,7 @@ class Tokenizer:
"""
pass
def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-""" Add the given special tokens to the vocabulary, and treat them as special tokens.
+"""Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
@@ -829,7 +829,7 @@ class Tokenizer:
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
) -> Encoding:
-""" Apply all the post-processing steps to the given encodings.
+"""Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)
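A hedged sketch of calling post_process directly on encodings produced without special tokens (encode normally applies these steps itself):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path
tokenizer.enable_truncation(10)

first = tokenizer.encode("A first sequence", add_special_tokens=False)
second = tokenizer.encode("A second one", add_special_tokens=False)

# Applies the post-processing pipeline described above to the pair.
final = tokenizer.post_process(first, second, add_special_tokens=True)
```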

View File

@@ -1,7 +1,7 @@
from typing import List
class Decoder:
-""" Base class for all decoders
+"""Base class for all decoders
This class is not supposed to be instantiated directly. Instead, any implementation of
a Decoder will return an instance of this class when instantiated.
@@ -23,7 +23,7 @@ class WordPiece(Decoder):
@staticmethod
def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
-""" Instantiate a new WordPiece Decoder
+"""Instantiate a new WordPiece Decoder
Args:
prefix: str:
@@ -38,7 +38,7 @@ class Metaspace(Decoder):
""" Metaspace decoder """
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
-""" Instantiate a new Metaspace
+"""Instantiate a new Metaspace
Args:
replacement: str:
@@ -55,7 +55,7 @@ class BPEDecoder(Decoder):
""" BPEDecoder """
def __init__(self, suffix: str = "</w>") -> None:
-""" Instantiate a new BPEDecoder
+"""Instantiate a new BPEDecoder
Args:
suffix: str:
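A sketch of attaching one of these decoders to a tokenizer (the path and ids below are placeholders):

```python
from tokenizers import Tokenizer
from tokenizers.decoders import WordPiece as WordPieceDecoder

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

# Strip the "##" continuation prefix and clean up spacing while decoding.
tokenizer.decoder = WordPieceDecoder(prefix="##", cleanup=True)
print(tokenizer.decode([1, 2, 3]))  # placeholder ids
```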

View File

@@ -25,7 +25,7 @@ class BaseTokenizer:
return self._tokenizer.num_special_tokens_to_add(is_pair)
def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
-""" Returns the vocabulary
+"""Returns the vocabulary
Args:
with_added_tokens: boolean:
@@ -37,7 +37,7 @@ class BaseTokenizer:
return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
def get_vocab_size(self, with_added_tokens: bool = True) -> int:
-""" Return the size of vocabulary, with or without added tokens.
+"""Return the size of vocabulary, with or without added tokens.
Args:
with_added_tokens: (`optional`) bool:
@@ -57,7 +57,7 @@ class BaseTokenizer:
pad_token: Optional[str] = "[PAD]",
length: Optional[int] = None,
):
-""" Change the padding strategy
+"""Change the padding strategy
Args:
direction: (`optional`) str:
@@ -96,7 +96,7 @@ class BaseTokenizer:
@property
def padding(self) -> Optional[dict]:
-""" Get the current padding parameters
+"""Get the current padding parameters
Returns:
None if padding is disabled, a dict with the currently set parameters
@@ -107,7 +107,7 @@ class BaseTokenizer:
def enable_truncation(
self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
):
-""" Change the truncation options
+"""Change the truncation options
Args:
max_length: unsigned int:
@@ -128,7 +128,7 @@ class BaseTokenizer:
@property
def truncation(self) -> Optional[dict]:
-""" Get the current truncation parameters
+"""Get the current truncation parameters
Returns:
None if truncation is disabled, a dict with the current truncation parameters if
@@ -137,7 +137,7 @@ class BaseTokenizer:
return self._tokenizer.truncation
def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-""" Add the given tokens to the vocabulary
+"""Add the given tokens to the vocabulary
Args:
tokens: List[Union[str, AddedToken]]:
@@ -150,7 +150,7 @@ class BaseTokenizer:
return self._tokenizer.add_tokens(tokens)
def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
-""" Add the given special tokens to the vocabulary, and treat them as special tokens.
+"""Add the given special tokens to the vocabulary, and treat them as special tokens.
The special tokens will never be processed by the model, and will be
removed while decoding.
@@ -166,7 +166,7 @@ class BaseTokenizer:
return self._tokenizer.add_special_tokens(special_tokens)
def normalize(self, sequence: str) -> str:
-""" Normalize the given sequence
+"""Normalize the given sequence
Args:
sequence: str:
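For example, with one of the bundled implementations built on BaseTokenizer (the vocab path is a placeholder):

```python
from tokenizers.implementations import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer("vocab.txt")  # hypothetical vocab file

# Runs only the normalizer and returns the cleaned string,
# e.g. lowercased with accents stripped under the default options.
print(tokenizer.normalize("Héllo   World"))
```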
@@ -184,7 +184,7 @@ class BaseTokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> Encoding:
-""" Encode the given sequence and pair. This method can process raw text sequences as well
+"""Encode the given sequence and pair. This method can process raw text sequences as well
as already pre-tokenized sequences.
Args:
@@ -216,7 +216,7 @@ class BaseTokenizer:
is_pretokenized: bool = False,
add_special_tokens: bool = True,
) -> List[Encoding]:
-""" Encode the given inputs. This method accept both raw text sequences as well as already
+"""Encode the given inputs. This method accept both raw text sequences as well as already
pre-tokenized sequences.
Args:
@@ -248,7 +248,7 @@ class BaseTokenizer:
return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)
def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
-""" Decode the given list of ids to a string sequence
+"""Decode the given list of ids to a string sequence
Args:
ids: List[unsigned int]:
@@ -268,7 +268,7 @@ class BaseTokenizer:
def decode_batch(
self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
) -> str:
-""" Decode the list of sequences to a list of string sequences
+"""Decode the list of sequences to a list of string sequences
Args:
sequences: List[List[unsigned int]]:
@@ -286,7 +286,7 @@ class BaseTokenizer:
return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
def token_to_id(self, token: str) -> Optional[int]:
-""" Convert the given token to its corresponding id
+"""Convert the given token to its corresponding id
Args:
token: str:
@@ -298,7 +298,7 @@ class BaseTokenizer:
return self._tokenizer.token_to_id(token)
def id_to_token(self, id: int) -> Optional[str]:
-""" Convert the given token id to its corresponding string
+"""Convert the given token id to its corresponding string
Args:
token: id:
@@ -310,7 +310,7 @@ class BaseTokenizer:
return self._tokenizer.id_to_token(id)
def save_model(self, directory: str, name: Optional[str] = None):
-""" Save the current model to the given directory
+"""Save the current model to the given directory
Args:
directory: str:
@@ -322,7 +322,7 @@ class BaseTokenizer:
return self._tokenizer.model.save(directory, name=name)
def save(self, path: str, pretty: bool = False):
-""" Save the current Tokenizer at the given path
+"""Save the current Tokenizer at the given path
Args:
path: str:
@@ -331,7 +331,7 @@ class BaseTokenizer:
return self._tokenizer.save(path, pretty)
def to_str(self, pretty: bool = False):
-""" Get a serialized JSON version of the Tokenizer as a str
+"""Get a serialized JSON version of the Tokenizer as a str
Args:
pretty: bool:
@@ -345,7 +345,7 @@ class BaseTokenizer:
def post_process(
self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
) -> Encoding:
-""" Apply all the post-processing steps to the given encodings.
+"""Apply all the post-processing steps to the given encodings.
The various steps are:
1. Truncate according to global params (provided to `enable_truncation`)

View File

@@ -2,7 +2,7 @@ from .. import NormalizedString
from typing import Optional, List
class Normalizer:
-""" Base class for all normalizers
+"""Base class for all normalizers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Normalizer will return an instance of this class when instantiated.
@@ -16,7 +16,7 @@ class Normalizer:
pass
class BertNormalizer(Normalizer):
-""" BertNormalizer
+"""BertNormalizer
Takes care of normalizing raw text before giving it to a Bert model.
This includes cleaning the text, handling accents, chinese chars and lowercasing
@@ -29,7 +29,7 @@ class BertNormalizer(Normalizer):
strip_accents: Optional[bool] = None,
lowercase: Optional[bool] = True,
) -> None:
-""" Instantiate a BertNormalizer with the given options.
+"""Instantiate a BertNormalizer with the given options.
Args:
clean_text: (`optional`) boolean:
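All four options with their defaults, as a sketch:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import BertNormalizer

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = BertNormalizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=None,  # None lets the lowercase option decide
    lowercase=True,
)
```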
@@ -80,13 +80,13 @@ class NFKC(Normalizer):
pass
class Sequence(Normalizer):
-""" Allows concatenating multiple other Normalizer as a Sequence.
+"""Allows concatenating multiple other Normalizer as a Sequence.
All the normalizers run in sequence in the given order
"""
def __init__(self, normalizers: List[Normalizer]) -> None:
-""" Instantiate a new normalization Sequence using the given normalizers
+"""Instantiate a new normalization Sequence using the given normalizers
Args:
normalizers: List[Normalizer]:
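A sketch chaining two normalizers; they run in the given order:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC, Lowercase, Sequence

tokenizer = Tokenizer(BPE())

# First NFKC unicode normalization, then lowercasing.
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
```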

View File

@@ -2,7 +2,7 @@ from .. import Encoding
from typing import Tuple, Union, List
class PostProcessor:
-""" Base class for all post-processors
+"""Base class for all post-processors
This class is not supposed to be instantiated directly. Instead, any implementation of
a PostProcessor will return an instance of this class when instantiated.
@@ -22,7 +22,7 @@ class PostProcessor:
pass
class BertProcessing(PostProcessor):
-""" BertProcessing
+"""BertProcessing
This post-processor takes care of adding the special tokens needed by
a Bert model:
@@ -31,7 +31,7 @@ class BertProcessing(PostProcessor):
"""
def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
-""" Instantiate a new BertProcessing with the given tokens
+"""Instantiate a new BertProcessing with the given tokens
Args:
sep: Tuple[str, int]:
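A sketch with placeholder ids (the real ids come from the vocabulary):

```python
from tokenizers import Tokenizer
from tokenizers.processors import BertProcessing

tokenizer = Tokenizer.from_file("tokenizer.json")  # hypothetical path

tokenizer.post_processor = BertProcessing(
    sep=("[SEP]", 102),  # placeholder id
    cls=("[CLS]", 101),  # placeholder id
)
encoding = tokenizer.encode("Hello", "World")
# -> "[CLS] Hello [SEP] World [SEP]" with the matching type ids
```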
@@ -46,7 +46,7 @@ class BertProcessing(PostProcessor):
pass
class RobertaProcessing(PostProcessor):
-""" RobertaProcessing
+"""RobertaProcessing
This post-processor takes care of adding the special tokens needed by
a Roberta model:
@@ -66,7 +66,7 @@ class RobertaProcessing(PostProcessor):
trim_offsets: bool = True,
add_prefix_space: bool = True,
) -> None:
-""" Instantiate a new RobertaProcessing with the given tokens
+"""Instantiate a new RobertaProcessing with the given tokens
Args:
sep: Tuple[str, int]:
@@ -88,7 +88,7 @@ class RobertaProcessing(PostProcessor):
pass
class ByteLevel(PostProcessor):
-""" ByteLevel Post processing
+"""ByteLevel Post processing
This post-processor takes care of trimming the offsets.
By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
@@ -96,7 +96,7 @@ class ByteLevel(PostProcessor):
"""
def __init__(self, trim_offsets: bool = True) -> None:
-""" Instantiate a new ByteLevel
+"""Instantiate a new ByteLevel
Args:
trim_offsets: bool:
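A one-line sketch; trimming makes offsets point at the visible characters instead of the leading whitespace a byte-level BPE may include:

```python
from tokenizers import Tokenizer
from tokenizers.processors import ByteLevel

tokenizer = Tokenizer.from_file("byte-level.json")  # hypothetical byte-level tokenizer
tokenizer.post_processor = ByteLevel(trim_offsets=True)
```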
@@ -108,7 +108,7 @@ Template = Union[str, List[str]]
Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]
class TemplateProcessing(PostProcessor):
-""" TemplateProcessing
+"""TemplateProcessing
Provides a way to specify templates in order to add the special tokens to each
input sequence as relevant.
@@ -143,7 +143,7 @@ class TemplateProcessing(PostProcessor):
"""
def __init__(self, seq_a: Template, seq_b: Template, special_tokens: Tokens) -> None:
-""" Instantiate a new TemplateProcessing
+"""Instantiate a new TemplateProcessing
Args:
seq_a: Template
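A heavily hedged sketch: it follows the seq_a/seq_b signature shown here, but the `$A`/`$B` placeholder syntax is assumed from the later stable release and may differ at this exact commit.

```python
from tokenizers.processors import TemplateProcessing

processor = TemplateProcessing(
    seq_a="[CLS] $A [SEP]",  # template for a single sequence (assumed syntax)
    seq_b="$B [SEP]",        # extra part applied for pairs (assumed syntax)
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],  # (token, id); placeholder ids
)
```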

View File

@@ -2,14 +2,14 @@ from .. import AddedToken
from typing import Optional, List, Union
class Trainer:
-""" Base class for all trainers
+"""Base class for all trainers
This class is not supposed to be instantiated directly. Instead, any implementation of a
Trainer will return an instance of this class when instantiated.
"""
class BpeTrainer(Trainer):
-""" BpeTrainer
+"""BpeTrainer
Capable of training a BPE model
"""
@@ -25,7 +25,7 @@ class BpeTrainer(Trainer):
continuing_subword_prefix: Optional[str] = None,
end_of_word_suffix: Optional[str] = None,
) -> None:
-""" Instantiate a new BpeTrainer with the given options:
+"""Instantiate a new BpeTrainer with the given options:
Args:
vocab_size: unsigned int:
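A training sketch. The argument order of Tokenizer.train changed across releases, so the (trainer, files) order below reflects this era of the library and is an assumption:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE())
trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
)

tokenizer.train(trainer, ["data.txt"])  # hypothetical corpus file; order assumed
```

WordPieceTrainer below takes the same kind of options, with defaults suited to WordPiece (e.g. continuing_subword_prefix="##").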
@@ -61,7 +61,7 @@ class BpeTrainer(Trainer):
pass
class WordPieceTrainer(Trainer):
-""" WordPieceTrainer
+"""WordPieceTrainer
Capable of training a WordPiece model
"""
@@ -77,7 +77,7 @@ class WordPieceTrainer(Trainer):
continuing_subword_prefix: Optional[str] = "##",
end_of_word_suffix: Optional[str] = None,
) -> Trainer:
-""" Instantiate a new WordPieceTrainer with the given options:
+"""Instantiate a new WordPieceTrainer with the given options:
Args:
vocab_size: unsigned int:
@@ -113,7 +113,7 @@ class WordPieceTrainer(Trainer):
pass
class UnigramTrainer(Trainer):
-""" UnigramTrainer
+"""UnigramTrainer
Capable of training a Unigram model
"""
@@ -124,7 +124,7 @@ class UnigramTrainer(Trainer):
show_progress: bool = True,
special_tokens: List[Union[str, AddedToken]] = [],
) -> Trainer:
-""" Instantiate a new UnigramTrainer with the given options:
+"""Instantiate a new UnigramTrainer with the given options:
Args:
vocab_size: unsigned int:
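And the Unigram counterpart, assuming models.Unigram() can be built empty like BPE above:

```python
from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer

tokenizer = Tokenizer(Unigram())
trainer = UnigramTrainer(
    vocab_size=8000,
    show_progress=True,
    special_tokens=["<unk>"],
)
```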