Python - Improve typings for new encode/encode_batch

Anthony MOI
2020-04-28 17:52:36 -04:00
parent 3fb8033770
commit 30216190e5
3 changed files with 101 additions and 49 deletions

bindings/python/tokenizers/__init__.py

@@ -3,8 +3,16 @@ __version__ = "0.7.0"
 from typing import Tuple, Union, List

 Offsets = Tuple[int, int]
-InputSequence = Union[str, List[str]]
-EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]
+TextInputSequence = str
+PreTokenizedInputSequence = Union[List[str], Tuple[str]]
+
+TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
+PreTokenizedEncodeInput = Union[
+    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence]
+]
+
+InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]

 from .tokenizers import Tokenizer, Encoding, AddedToken
 from .tokenizers import decoders
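For context, a minimal sketch of values that satisfy the new aliases (illustrative only, not part of the commit; it assumes the aliases are importable from the package root, as the diff above suggests):

from tokenizers import (
    TextInputSequence,
    PreTokenizedInputSequence,
    TextEncodeInput,
    PreTokenizedEncodeInput,
)

# A raw text sequence vs. an already pre-tokenized one
text: TextInputSequence = "Hello world"
pretok: PreTokenizedInputSequence = ["Hello", "world"]

# Pairs are plain 2-tuples of sequences of the same kind
text_pair: TextEncodeInput = ("Hello world", "How are you?")
pretok_pair: PreTokenizedEncodeInput = (["Hello", "world"], ["How", "are", "you?"])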

bindings/python/tokenizers/__init__.pyi

@@ -15,8 +15,16 @@ from .implementations import (
 from typing import Optional, Union, List, Tuple

 Offsets = Tuple[int, int]
-InputSequence = Union[str, List[str]]
-EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]
+TextInputSequence = str
+PreTokenizedInputSequence = Union[List[str], Tuple[str]]
+
+TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]]
+PreTokenizedEncodeInput = Union[
+    PreTokenizedInputSequence, Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence],
+]
+
+InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
+EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]

 class Encoding:
     """ An Encoding as returned by the Tokenizer """
@@ -192,7 +200,7 @@ class AddedToken:
     """

     def __new__(
-        cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False
+        cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
    ) -> AddedToken:
        """ Instantiate a new AddedToken
@@ -371,18 +379,27 @@ class Tokenizer:
             The normalized string
         """
         pass

-    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
-        """ Encode the given input. This method accept both string sequences and already
-        pre-tokenized sequences.
+    def encode(
+        self,
+        sequence: InputSequence,
+        pair: Optional[InputSequence],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> Encoding:
+        """ Encode the given sequence and pair. This method can process raw text sequences
+        as well as already pre-tokenized sequences.

         Args:
-            input: EncodeInput:
-                This content to encode. This can be either:
-                    - A single sequence: InputSequence
-                    - A pair of sequences: Tuple[InputSequence, InputSequence]
-                And a InputSequence can be either:
-                    - A string: str
-                    - A pre-tokenized string: List[str]
+            sequence: InputSequence:
+                The sequence we want to encode. This sequence can be either raw text or
+                pre-tokenized, according to the `is_pretokenized` argument:
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                  `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.

             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
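For illustration, a usage sketch of the new signature (not part of the commit; `tokenizer` is a hypothetical trained instance, and `pair` is assumed to default to `None` as it does in `BaseTokenizer` below):

# Raw text: sequence and optional pair are plain strings
encoding = tokenizer.encode("Hello world", "How are you?")

# Pre-tokenized: sequences are lists/tuples of tokens, flagged explicitly
encoding = tokenizer.encode(["Hello", "world"], is_pretokenized=True)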
@@ -392,19 +409,29 @@ class Tokenizer:
         """
         pass

     def encode_batch(
-        self, inputs: List[EncodeInput], add_special_tokens: bool = True
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
     ) -> List[Encoding]:
-        """ Encode the given inputs. This method accept both string sequences and already
+        """ Encode the given inputs. This method accepts both raw text sequences and already
         pre-tokenized sequences.

         Args:
             inputs: List[EncodeInput]:
-                A list of inputs to encode. Each input can be either:
-                    - A single sequence: InputSequence
-                    - A pair of sequences: Tuple[InputSequence, InputSequence]
-                And a InputSequence can be either:
-                    - A string: str
-                    - A pre-tokenized string: List[str]
+                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
+                expected to be of the following form:
+                `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+
+                Each `InputSequence` can either be raw text or pre-tokenized,
+                according to the `is_pretokenized` argument:
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                  `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.

             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
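Again as a hypothetical sketch: a batch can mix single sequences and pairs, but since `is_pretokenized` applies to the whole batch, raw text and pre-tokenized inputs cannot be mixed in one call:

# Raw text batch, mixing a single sequence and a pair
encodings = tokenizer.encode_batch(["Hello world", ("How are you?", "Fine")])

# Pre-tokenized batch: every sequence is a list/tuple of tokens
encodings = tokenizer.encode_batch(
    [["Hello", "world"], (["How", "are"], ["you", "?"])],
    is_pretokenized=True,
)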
@@ -493,7 +520,7 @@ class Tokenizer:
         """
         pass

     def post_process(
-        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True
+        self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True,
     ) -> Encoding:
         """ Apply all the post-processing steps to the given encodings.

bindings/python/tokenizers/implementations/base_tokenizer.py

@@ -151,18 +151,27 @@ class BaseTokenizer:
         """
         return self._tokenizer.normalize(sequence)

-    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
-        """ Encode the given input. This method accept both string sequences and already
-        pre-tokenized sequences.
+    def encode(
+        self,
+        sequence: InputSequence,
+        pair: Optional[InputSequence] = None,
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
+    ) -> Encoding:
+        """ Encode the given sequence and pair. This method can process raw text sequences
+        as well as already pre-tokenized sequences.

         Args:
-            input: EncodeInput:
-                This content to encode. This can be either:
-                    - A single sequence: InputSequence
-                    - A pair of sequences: Tuple[InputSequence, InputSequence]
-                And a InputSequence can be either:
-                    - A string: str
-                    - A pre-tokenized string: List[str]
+            sequence: InputSequence:
+                The sequence we want to encode. This sequence can be either raw text or
+                pre-tokenized, according to the `is_pretokenized` argument:
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                  `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.

             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
@@ -170,25 +179,35 @@ class BaseTokenizer:
         Returns:
             An Encoding
         """
-        if input is None:
-            raise ValueError("None input is not valid. Should be a string.")
+        if sequence is None:
+            raise ValueError("encode: `sequence` can't be `None`")

-        return self._tokenizer.encode(input, add_special_tokens)
+        return self._tokenizer.encode(sequence, pair, is_pretokenized, add_special_tokens)

     def encode_batch(
-        self, inputs: List[EncodeInput], add_special_tokens: bool = True
+        self,
+        inputs: List[EncodeInput],
+        is_pretokenized: bool = False,
+        add_special_tokens: bool = True,
     ) -> List[Encoding]:
-        """ Encode the given inputs. This method accept both string sequences and already
+        """ Encode the given inputs. This method accepts both raw text sequences and already
         pre-tokenized sequences.

         Args:
             inputs: List[EncodeInput]:
-                A list of inputs to encode. Each input can be either:
-                    - A single sequence: InputSequence
-                    - A pair of sequences: Tuple[InputSequence, InputSequence]
-                And a InputSequence can be either:
-                    - A string: str
-                    - A pre-tokenized string: List[str]
+                A list of single sequences or pair sequences to encode. Each `EncodeInput` is
+                expected to be of the following form:
+                `Union[InputSequence, Tuple[InputSequence, InputSequence]]`
+
+                Each `InputSequence` can either be raw text or pre-tokenized,
+                according to the `is_pretokenized` argument:
+                - If `is_pretokenized=False`: `InputSequence` is expected to be `str`
+                - If `is_pretokenized=True`: `InputSequence` is expected to be
+                  `Union[List[str], Tuple[str]]`
+
+            is_pretokenized: bool:
+                Whether the input is already pre-tokenized.

             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
@@ -198,11 +217,9 @@ class BaseTokenizer:
         """
         if inputs is None:
-            raise ValueError(
-                "None input is not valid. Should be a list of strings or a list of tuple of strings."
-            )
+            raise ValueError("encode_batch: `inputs` can't be `None`")

-        return self._tokenizer.encode_batch(inputs, add_special_tokens)
+        return self._tokenizer.encode_batch(inputs, is_pretokenized, add_special_tokens)

     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """ Decode the given list of ids to a string sequence