Black pre-commit after rebase.
@@ -258,7 +258,7 @@ class Encoding:

     @staticmethod
     def merge(encodings: List[Encoding], growing_offsets: bool = True) -> Encoding:
-        """ Merge the list of Encoding into one final Encoding
+        """Merge the list of Encoding into one final Encoding

         Args:
             encodings: List[Encoding]:
@@ -289,7 +289,7 @@ class Encoding:
         pass
     @property
     def offsets(self) -> List[Offsets]:
-        """ The offsets.
+        """The offsets.
         These offsets can be used to index any `IndexableString` directly. If you want to
         index the original `str`, make sure to retrieve the converted offsets using the `.offsets`
         method on the `original_str`.
@@ -388,7 +388,7 @@ class Encoding:
         pad_token: Optional[str] = "[PAD]",
         direction: Optional[str] = "right",
     ):
-        """ Pad the current Encoding at the given length
+        """Pad the current Encoding at the given length

         Args:
             length: int:
@@ -408,7 +408,7 @@ class Encoding:
         """
         pass
     def truncate(self, max_length: int, stride: Optional[int] = 0):
-        """ Truncate the current Encoding at the given max_length
+        """Truncate the current Encoding at the given max_length

         Args:
             max_length: int:
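For context, the `Encoding` members touched above are the ones typically used to inspect and reshape a single encoding by hand. A minimal sketch, not part of this commit; the tokenizer file name, input text, and lengths are illustrative:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")  # any serialized tokenizer
encoding = tokenizer.encode("Hello, world!")

print(encoding.tokens)   # sub-word strings
print(encoding.ids)      # their vocabulary ids
print(encoding.offsets)  # (start, end) character offsets, see `offsets` above

encoding.truncate(max_length=8, stride=0)                # keep at most 8 tokens, in place
encoding.pad(10, pad_token="[PAD]", direction="right")   # then pad back up to 10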
@@ -421,7 +421,7 @@ class Encoding:
         pass

 class AddedToken:
-    """ AddedToken represents a token to be added to a Tokenizer
+    """AddedToken represents a token to be added to a Tokenizer

     An AddedToken can have special options defining the way it should behave.
     """
@@ -434,7 +434,7 @@ class AddedToken:
         rstrip: bool = False,
         normalized: bool = True,
     ) -> AddedToken:
-        """ Instantiate a new AddedToken
+        """Instantiate a new AddedToken

         Args:
             content: str:
@@ -464,7 +464,7 @@ class AddedToken:
         pass

 class Tokenizer:
-    """ Tokenizer
+    """Tokenizer

     A Tokenizer works as a pipeline, it processes some raw text as input and outputs
     an `Encoding`.
@@ -481,7 +481,7 @@ class Tokenizer:
     """

     def __new__(cls, model: models.Model) -> Tokenizer:
-        """ Instantiate a new Tokenizer using the given Model
+        """Instantiate a new Tokenizer using the given Model

         Args:
             model: models.Model:
@@ -493,7 +493,7 @@ class Tokenizer:
         pass
     @staticmethod
     def from_str(s: str) -> Tokenizer:
-        """ Instantiate a new Tokenizer from the given JSON string
+        """Instantiate a new Tokenizer from the given JSON string

         Args:
             s: str:
@@ -505,7 +505,7 @@ class Tokenizer:
         pass
     @staticmethod
     def from_file(path: str) -> Tokenizer:
-        """ Instantiate a new Tokenizer from the given file
+        """Instantiate a new Tokenizer from the given file

         Args:
             path: str:
@@ -517,7 +517,7 @@ class Tokenizer:
         pass
     @staticmethod
     def from_buffer(buffer: bytes) -> Tokenizer:
-        """ Instantiate a new Tokenizer from the given buffer
+        """Instantiate a new Tokenizer from the given buffer

         Args:
             buffer: bytes:
@@ -528,7 +528,7 @@ class Tokenizer:
         """
         pass
     def to_str(self, pretty: bool = False) -> str:
-        """ Get a serialized JSON version of the Tokenizer as a str
+        """Get a serialized JSON version of the Tokenizer as a str

         Args:
             pretty: bool:
@@ -539,7 +539,7 @@ class Tokenizer:
         """
         pass
     def save(self, path: str, pretty: bool = False):
-        """ Save the Tokenizer as JSON to the given path
+        """Save the Tokenizer as JSON to the given path

         Args:
             pretty: bool:
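The constructors and serialization helpers above round-trip cleanly. A short sketch, with the model choice and file name purely illustrative:

from tokenizers import Tokenizer, models

# Build a Tokenizer around a model (an empty BPE is enough for the round-trip).
tokenizer = Tokenizer(models.BPE())

# Serialize / deserialize.
tokenizer.save("tokenizer.json", pretty=True)
same = Tokenizer.from_file("tokenizer.json")
also_same = Tokenizer.from_str(tokenizer.to_str())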
@@ -592,7 +592,7 @@ class Tokenizer:
         """
         pass
     def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
-        """ Returns the vocabulary
+        """Returns the vocabulary

         Args:
             with_added_tokens: boolean:
@@ -603,7 +603,7 @@ class Tokenizer:
         """
         pass
     def get_vocab_size(self, with_added_tokens: bool = True) -> int:
-        """ Returns the size of the vocabulary
+        """Returns the size of the vocabulary

         Args:
             with_added_tokens: boolean:
@@ -614,7 +614,7 @@ class Tokenizer:
         """
         pass
     def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]):
-        """ Enable the truncation
+        """Enable the truncation

         Args:
             max_length: unsigned int:
@@ -633,7 +633,7 @@ class Tokenizer:
         pass
     @property
     def truncation(self) -> Optional[dict]:
-        """ Get the current truncation parameters
+        """Get the current truncation parameters

         Returns:
             None if truncation is disabled, a dict with the current truncation parameters if
@@ -649,7 +649,7 @@ class Tokenizer:
         pad_token: Optional[str] = "[PAD]",
         length: Optional[int] = None,
     ):
-        """ Enable the padding
+        """Enable the padding

         Args:
             direction: (`optional`) str:
@@ -679,7 +679,7 @@ class Tokenizer:
         pass
     @property
     def padding(self) -> Optional[dict]:
-        """ Get the current padding parameters
+        """Get the current padding parameters

         Returns:
             None if padding is disabled, a dict with the currently set parameters
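Truncation and padding are global switches on the tokenizer, and the matching properties expose whatever is currently configured. Continuing the sketch above (all values illustrative):

tokenizer.enable_truncation(max_length=512, stride=0, strategy="longest_first")
tokenizer.enable_padding(direction="right", pad_token="[PAD]", length=512)

print(tokenizer.truncation)  # e.g. a dict with max_length / stride / strategy
print(tokenizer.padding)     # the current padding parameters, or None when disabled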
@@ -693,7 +693,7 @@ class Tokenizer:
         is_pretokenized: bool = False,
         add_special_tokens: bool = True,
     ) -> Encoding:
-        """ Encode the given sequence and pair. This method can process raw text sequences as well
+        """Encode the given sequence and pair. This method can process raw text sequences as well
         as already pre-tokenized sequences.

         Args:
@@ -721,7 +721,7 @@ class Tokenizer:
         is_pretokenized: bool = False,
         add_special_tokens: bool = True,
     ) -> List[Encoding]:
-        """ Encode the given inputs. This method accept both raw text sequences as well as already
+        """Encode the given inputs. This method accept both raw text sequences as well as already
         pre-tokenized sequences.

         Args:
@@ -748,7 +748,7 @@ class Tokenizer:
         """
         pass
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
-        """ Decode the given list of ids to a string sequence
+        """Decode the given list of ids to a string sequence

         Args:
             ids: List[unsigned int]:
@@ -764,7 +764,7 @@ class Tokenizer:
     def decode_batch(
         self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
     ) -> str:
-        """ Decode the list of sequences to a list of string sequences
+        """Decode the list of sequences to a list of string sequences

         Args:
             sequences: List[List[unsigned int]]:
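A sketch of how these encode/decode entry points are typically called; the sentences are placeholders:

# Single sequence, sentence pair, and batch encoding.
enc = tokenizer.encode("How are you?")
pair = tokenizer.encode("How are you?", "Fine, thanks.")
batch = tokenizer.encode_batch(["How are you?", "Fine, thanks."])

# Pre-tokenized input: pass the words and set is_pretokenized=True.
words = tokenizer.encode(["How", "are", "you", "?"], is_pretokenized=True)

# Back to text; special tokens are skipped by default.
text = tokenizer.decode(enc.ids)
texts = tokenizer.decode_batch([e.ids for e in batch])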
@@ -778,7 +778,7 @@ class Tokenizer:
         """
         pass
     def token_to_id(self, token: str) -> Optional[int]:
-        """ Convert the given token to its corresponding id
+        """Convert the given token to its corresponding id

         Args:
             token: str:
@@ -789,7 +789,7 @@ class Tokenizer:
         """
         pass
     def id_to_token(self, id: int) -> Optional[str]:
-        """ Convert the given token id to its corresponding string
+        """Convert the given token id to its corresponding string

         Args:
             token: id:
@@ -800,7 +800,7 @@ class Tokenizer:
         """
         pass
     def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-        """ Add the given tokens to the vocabulary
+        """Add the given tokens to the vocabulary

         Args:
             tokens: List[Union[str, AddedToken]]:
@@ -812,7 +812,7 @@ class Tokenizer:
         """
         pass
     def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-        """ Add the given special tokens to the vocabulary, and treat them as special tokens.
+        """Add the given special tokens to the vocabulary, and treat them as special tokens.

         The special tokens will never be processed by the model, and will be
         removed while decoding.
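Added and special tokens can be given either as plain strings or as `AddedToken` instances with the options from the `AddedToken` stub above. A hedged sketch (the token strings are examples only):

from tokenizers import AddedToken

tokenizer.add_tokens(["juggernaut", AddedToken("multiword token", lstrip=True, rstrip=True)])
tokenizer.add_special_tokens(["[CLS]", "[SEP]", "[PAD]"])

print(tokenizer.token_to_id("[CLS]"))  # id of the added special token
print(tokenizer.id_to_token(0))        # inverse lookup, None if the id is unknown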
@@ -829,7 +829,7 @@ class Tokenizer:
     def post_process(
         self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
     ) -> Encoding:
-        """ Apply all the post-processing steps to the given encodings.
+        """Apply all the post-processing steps to the given encodings.

         The various steps are:
             1. Truncate according to global params (provided to `enable_truncation`)

@@ -1,7 +1,7 @@
 from typing import List

 class Decoder:
-    """ Base class for all decoders
+    """Base class for all decoders

     This class is not supposed to be instantiated directly. Instead, any implementation of
     a Decoder will return an instance of this class when instantiated.
@@ -23,7 +23,7 @@ class WordPiece(Decoder):

     @staticmethod
     def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
-        """ Instantiate a new WordPiece Decoder
+        """Instantiate a new WordPiece Decoder

         Args:
             prefix: str:
@@ -38,7 +38,7 @@ class Metaspace(Decoder):
     """ Metaspace decoder """

     def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
-        """ Instantiate a new Metaspace
+        """Instantiate a new Metaspace

         Args:
             replacement: str:
@@ -55,7 +55,7 @@ class BPEDecoder(Decoder):
     """ BPEDecoder """

     def __init__(self, suffix: str = "</w>") -> None:
-        """ Instantiate a new BPEDecoder
+        """Instantiate a new BPEDecoder

         Args:
             suffix: str:

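A decoder is attached to a tokenizer so that decoded ids come back as readable text. A small sketch, reusing the defaults shown in the stubs above:

from tokenizers import decoders

tokenizer.decoder = decoders.WordPiece(prefix="##")
# or: tokenizer.decoder = decoders.Metaspace(replacement="▁", add_prefix_space=True)
# or: tokenizer.decoder = decoders.BPEDecoder(suffix="</w>")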
@@ -25,7 +25,7 @@ class BaseTokenizer:
         return self._tokenizer.num_special_tokens_to_add(is_pair)

     def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]:
-        """ Returns the vocabulary
+        """Returns the vocabulary

         Args:
             with_added_tokens: boolean:
@@ -37,7 +37,7 @@ class BaseTokenizer:
         return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)

     def get_vocab_size(self, with_added_tokens: bool = True) -> int:
-        """ Return the size of vocabulary, with or without added tokens.
+        """Return the size of vocabulary, with or without added tokens.

         Args:
             with_added_tokens: (`optional`) bool:
@@ -57,7 +57,7 @@ class BaseTokenizer:
         pad_token: Optional[str] = "[PAD]",
         length: Optional[int] = None,
     ):
-        """ Change the padding strategy
+        """Change the padding strategy

         Args:
             direction: (`optional`) str:
@@ -96,7 +96,7 @@ class BaseTokenizer:

     @property
     def padding(self) -> Optional[dict]:
-        """ Get the current padding parameters
+        """Get the current padding parameters

         Returns:
             None if padding is disabled, a dict with the currently set parameters
@@ -107,7 +107,7 @@ class BaseTokenizer:
     def enable_truncation(
         self, max_length: int, stride: Optional[int] = 0, strategy: Optional[str] = "longest_first"
     ):
-        """ Change the truncation options
+        """Change the truncation options

         Args:
             max_length: unsigned int:
@@ -128,7 +128,7 @@ class BaseTokenizer:

     @property
     def truncation(self) -> Optional[dict]:
-        """ Get the current truncation parameters
+        """Get the current truncation parameters

         Returns:
             None if truncation is disabled, a dict with the current truncation parameters if
@@ -137,7 +137,7 @@ class BaseTokenizer:
         return self._tokenizer.truncation

     def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int:
-        """ Add the given tokens to the vocabulary
+        """Add the given tokens to the vocabulary

         Args:
             tokens: List[Union[str, AddedToken]]:
@@ -150,7 +150,7 @@ class BaseTokenizer:
         return self._tokenizer.add_tokens(tokens)

     def add_special_tokens(self, special_tokens: List[Union[str, AddedToken]]) -> int:
-        """ Add the given special tokens to the vocabulary, and treat them as special tokens.
+        """Add the given special tokens to the vocabulary, and treat them as special tokens.

         The special tokens will never be processed by the model, and will be
         removed while decoding.
@@ -166,7 +166,7 @@ class BaseTokenizer:
         return self._tokenizer.add_special_tokens(special_tokens)

     def normalize(self, sequence: str) -> str:
-        """ Normalize the given sequence
+        """Normalize the given sequence

         Args:
             sequence: str:
@@ -184,7 +184,7 @@ class BaseTokenizer:
         is_pretokenized: bool = False,
         add_special_tokens: bool = True,
     ) -> Encoding:
-        """ Encode the given sequence and pair. This method can process raw text sequences as well
+        """Encode the given sequence and pair. This method can process raw text sequences as well
         as already pre-tokenized sequences.

         Args:
@@ -216,7 +216,7 @@ class BaseTokenizer:
         is_pretokenized: bool = False,
         add_special_tokens: bool = True,
     ) -> List[Encoding]:
-        """ Encode the given inputs. This method accept both raw text sequences as well as already
+        """Encode the given inputs. This method accept both raw text sequences as well as already
         pre-tokenized sequences.

         Args:
@@ -248,7 +248,7 @@ class BaseTokenizer:
         return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)

     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
-        """ Decode the given list of ids to a string sequence
+        """Decode the given list of ids to a string sequence

         Args:
             ids: List[unsigned int]:
@@ -268,7 +268,7 @@ class BaseTokenizer:
     def decode_batch(
         self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True
     ) -> str:
-        """ Decode the list of sequences to a list of string sequences
+        """Decode the list of sequences to a list of string sequences

         Args:
             sequences: List[List[unsigned int]]:
@@ -286,7 +286,7 @@ class BaseTokenizer:
         return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)

     def token_to_id(self, token: str) -> Optional[int]:
-        """ Convert the given token to its corresponding id
+        """Convert the given token to its corresponding id

         Args:
             token: str:
@@ -298,7 +298,7 @@ class BaseTokenizer:
         return self._tokenizer.token_to_id(token)

     def id_to_token(self, id: int) -> Optional[str]:
-        """ Convert the given token id to its corresponding string
+        """Convert the given token id to its corresponding string

         Args:
             token: id:
@@ -310,7 +310,7 @@ class BaseTokenizer:
         return self._tokenizer.id_to_token(id)

     def save_model(self, directory: str, name: Optional[str] = None):
-        """ Save the current model to the given directory
+        """Save the current model to the given directory

         Args:
             directory: str:
@@ -322,7 +322,7 @@ class BaseTokenizer:
         return self._tokenizer.model.save(directory, name=name)

     def save(self, path: str, pretty: bool = False):
-        """ Save the current Tokenizer at the given path
+        """Save the current Tokenizer at the given path

         Args:
             path: str:
@@ -331,7 +331,7 @@ class BaseTokenizer:
         return self._tokenizer.save(path, pretty)

     def to_str(self, pretty: bool = False):
-        """ Get a serialized JSON version of the Tokenizer as a str
+        """Get a serialized JSON version of the Tokenizer as a str

         Args:
             pretty: bool:
@@ -345,7 +345,7 @@ class BaseTokenizer:
     def post_process(
         self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
     ) -> Encoding:
-        """ Apply all the post-processing steps to the given encodings.
+        """Apply all the post-processing steps to the given encodings.

         The various steps are:
             1. Truncate according to global params (provided to `enable_truncation`)

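`BaseTokenizer` is the Python wrapper that the bundled implementations build on; as the `return self._tokenizer...` context lines show, it mostly forwards to the underlying Rust tokenizer. A hedged sketch of one such implementation (the vocab file name is a placeholder):

from tokenizers import BertWordPieceTokenizer

bert_tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
encoding = bert_tokenizer.encode("How are you?")  # same BaseTokenizer API as above
print(bert_tokenizer.get_vocab_size(with_added_tokens=True))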
@@ -2,7 +2,7 @@ from .. import NormalizedString
 from typing import Optional, List

 class Normalizer:
-    """ Base class for all normalizers
+    """Base class for all normalizers

     This class is not supposed to be instantiated directly. Instead, any implementation of a
     Normalizer will return an instance of this class when instantiated.
@@ -16,7 +16,7 @@ class Normalizer:
         pass

 class BertNormalizer(Normalizer):
-    """ BertNormalizer
+    """BertNormalizer

     Takes care of normalizing raw text before giving it to a Bert model.
     This includes cleaning the text, handling accents, chinese chars and lowercasing
@@ -29,7 +29,7 @@ class BertNormalizer(Normalizer):
         strip_accents: Optional[bool] = None,
         lowercase: Optional[bool] = True,
     ) -> None:
-        """ Instantiate a BertNormalizer with the given options.
+        """Instantiate a BertNormalizer with the given options.

         Args:
             clean_text: (`optional`) boolean:
@@ -80,13 +80,13 @@ class NFKC(Normalizer):
         pass

 class Sequence(Normalizer):
-    """ Allows concatenating multiple other Normalizer as a Sequence.
+    """Allows concatenating multiple other Normalizer as a Sequence.

     All the normalizers run in sequence in the given order
     """

     def __init__(self, normalizers: List[Normalizer]) -> None:
-        """ Instantiate a new normalization Sequence using the given normalizers
+        """Instantiate a new normalization Sequence using the given normalizers

         Args:
             normalizers: List[Normalizer]:

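A normalizer is set on the tokenizer before any pre-tokenization happens; `Sequence` simply chains several of them in order. A short sketch using the classes from the stubs above:

from tokenizers.normalizers import BertNormalizer, NFKC, Sequence

# Either a single normalizer...
tokenizer.normalizer = BertNormalizer(clean_text=True, lowercase=True)
# ...or several chained in the given order.
tokenizer.normalizer = Sequence([NFKC(), BertNormalizer(lowercase=False)])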
@@ -2,7 +2,7 @@ from .. import Encoding
 from typing import Tuple, Union, List

 class PostProcessor:
-    """ Base class for all post-processors
+    """Base class for all post-processors

     This class is not supposed to be instantiated directly. Instead, any implementation of
     a PostProcessor will return an instance of this class when instantiated.
@@ -22,7 +22,7 @@ class PostProcessor:
         pass

 class BertProcessing(PostProcessor):
-    """ BertProcessing
+    """BertProcessing

     This post-processor takes care of adding the special tokens needed by
     a Bert model:
@@ -31,7 +31,7 @@ class BertProcessing(PostProcessor):
     """

     def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None:
-        """ Instantiate a new BertProcessing with the given tokens
+        """Instantiate a new BertProcessing with the given tokens

         Args:
             sep: Tuple[str, int]:
@@ -46,7 +46,7 @@ class BertProcessing(PostProcessor):
         pass

 class RobertaProcessing(PostProcessor):
-    """ RobertaProcessing
+    """RobertaProcessing

     This post-processor takes care of adding the special tokens needed by
     a Roberta model:
@@ -66,7 +66,7 @@ class RobertaProcessing(PostProcessor):
         trim_offsets: bool = True,
         add_prefix_space: bool = True,
     ) -> None:
-        """ Instantiate a new RobertaProcessing with the given tokens
+        """Instantiate a new RobertaProcessing with the given tokens

         Args:
             sep: Tuple[str, int]:
@@ -88,7 +88,7 @@ class RobertaProcessing(PostProcessor):
         pass

 class ByteLevel(PostProcessor):
-    """ ByteLevel Post processing
+    """ByteLevel Post processing

     This post-processor takes care of trimming the offsets.
     By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
@@ -96,7 +96,7 @@ class ByteLevel(PostProcessor):
     """

     def __init__(self, trim_offsets: bool = True) -> None:
-        """ Instantiate a new ByteLevel
+        """Instantiate a new ByteLevel

         Args:
             trim_offsets: bool:
@@ -108,7 +108,7 @@ Template = Union[str, List[str]]
 Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]]

 class TemplateProcessing(PostProcessor):
-    """ TemplateProcessing
+    """TemplateProcessing

     Provides a way to specify templates in order to add the special tokens to each
     input sequence as relevant.
@@ -143,7 +143,7 @@ class TemplateProcessing(PostProcessor):
     """

     def __init__(self, seq_a: Template, seq_b: Template, special_tokens: Tokens) -> None:
-        """ Instantiate a new TemplateProcessing
+        """Instantiate a new TemplateProcessing

         Args:
             seq_a: Template

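A hedged sketch of attaching a post-processor, passing the three arguments positionally in the (seq_a, seq_b, special_tokens) order shown in the stub above. The "$A"/"$B" placeholders stand for the first and second sequence, ":1" sets the type id, and the (token, id) pairs must match the tokenizer's actual vocabulary:

from tokenizers.processors import TemplateProcessing

tokenizer.post_processor = TemplateProcessing(
    "[CLS] $A [SEP]",
    "[CLS] $A [SEP] $B:1 [SEP]:1",
    [("[CLS]", tokenizer.token_to_id("[CLS]")), ("[SEP]", tokenizer.token_to_id("[SEP]"))],
)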
@@ -2,14 +2,14 @@ from .. import AddedToken
 from typing import Optional, List, Union

 class Trainer:
-    """ Base class for all trainers
+    """Base class for all trainers

     This class is not supposed to be instantiated directly. Instead, any implementation of a
     Trainer will return an instance of this class when instantiated.
     """

 class BpeTrainer(Trainer):
-    """ BpeTrainer
+    """BpeTrainer

     Capable of training a BPE model
     """
@@ -25,7 +25,7 @@ class BpeTrainer(Trainer):
         continuing_subword_prefix: Optional[str] = None,
         end_of_word_suffix: Optional[str] = None,
     ) -> None:
-        """ Instantiate a new BpeTrainer with the given options:
+        """Instantiate a new BpeTrainer with the given options:

         Args:
             vocab_size: unsigned int:
@@ -61,7 +61,7 @@ class BpeTrainer(Trainer):
         pass

 class WordPieceTrainer(Trainer):
-    """ WordPieceTrainer
+    """WordPieceTrainer

     Capable of training a WordPiece model
     """
@@ -77,7 +77,7 @@ class WordPieceTrainer(Trainer):
         continuing_subword_prefix: Optional[str] = "##",
         end_of_word_suffix: Optional[str] = None,
     ) -> Trainer:
-        """ Instantiate a new WordPieceTrainer with the given options:
+        """Instantiate a new WordPieceTrainer with the given options:

         Args:
             vocab_size: unsigned int:
@@ -113,7 +113,7 @@ class WordPieceTrainer(Trainer):
         pass

 class UnigramTrainer(Trainer):
-    """ UnigramTrainer
+    """UnigramTrainer

     Capable of training a Unigram model
     """
@@ -124,7 +124,7 @@ class UnigramTrainer(Trainer):
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
     ) -> Trainer:
-        """ Instantiate a new UnigramTrainer with the given options:
+        """Instantiate a new UnigramTrainer with the given options:

         Args:
             vocab_size: unsigned int:
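Finally, a trainer is what drives vocabulary learning end to end. A minimal, hedged sketch assuming a BPE model and a local corpus file; the corpus name and trainer options are illustrative:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(vocab_size=30000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]"])

# The exact train() signature has shifted between releases; recent versions take
# the file list first and the trainer as a keyword argument.
tokenizer.train(["corpus.txt"], trainer=trainer)
tokenizer.save("tokenizer.json")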