Python - Update typings for new encode

commit 2e105c4258
parent 835f08ab02
Author: Anthony MOI
Date:   2020-04-24 21:28:31 -04:00
3 changed files with 52 additions and 91 deletions

File 1 of 3

@@ -1,8 +1,10 @@
 __version__ = "0.7.0"

-from typing import Tuple
+from typing import Tuple, Union, List

 Offsets = Tuple[int, int]
+InputSequence = Union[str, List[str]]
+EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]

 from .tokenizers import Tokenizer, Encoding, AddedToken
 from .tokenizers import decoders
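
To make the new aliases concrete, here is a minimal sketch of the values each
one admits (the variable names are illustrative, not part of the commit):

    from typing import List, Tuple, Union

    InputSequence = Union[str, List[str]]
    EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]

    # All of the following satisfy EncodeInput: a raw string, a pre-tokenized
    # sequence, or a pair whose members are themselves either form.
    single: EncodeInput = "Hello world"
    pretokenized: EncodeInput = ["Hello", "world"]
    pair: EncodeInput = ("Hello world", ["How", "are", "you"])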

File 2 of 3

@@ -15,6 +15,8 @@ from .implementations import (
 from typing import Optional, Union, List, Tuple

 Offsets = Tuple[int, int]
+InputSequence = Union[str, List[str]]
+EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]

 class Encoding:
     """ An Encoding as returned by the Tokenizer """
@@ -369,37 +371,43 @@ class Tokenizer:
             The normalized string
         """
         pass

-    def encode(
-        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
-    ) -> Encoding:
-        """ Encode the given sequence
-
-        Args:
-            sequence: str:
-                The sequence to encode
-
-            pair: (`optional`) Optional[str]:
-                The optional pair sequence
+    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
+        """ Encode the given input. This method accepts both raw string sequences and
+        already pre-tokenized sequences.
+
+        Args:
+            input: EncodeInput:
+                The content to encode. This can be either:
+                    - a single sequence: InputSequence
+                    - a pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can in turn be either:
+                    - a raw string: str
+                    - a pre-tokenized string: List[str]

             add_special_tokens: bool:
-                Whether to add the special tokens while encoding
+                Whether to add the special tokens while encoding.

         Returns:
             An Encoding
         """
         pass
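
A usage sketch for the new signature (it assumes `tokenizer` is an already
loaded tokenizers.Tokenizer instance; the inputs are illustrative):

    # A single raw string
    encoding = tokenizer.encode("Hello world", add_special_tokens=True)

    # A pre-tokenized sequence: pre-tokenization is supplied by the caller
    encoding = tokenizer.encode(["Hello", "world"])

    # A pair of sequences, passed as a tuple instead of the old `pair` argument
    encoding = tokenizer.encode(("Hello world", "How are you?"))

Note that the old `pair` keyword is gone: a pair is now expressed structurally,
as the Tuple branch of EncodeInput.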
     def encode_batch(
-        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+        self, inputs: List[EncodeInput], add_special_tokens: bool = True
     ) -> List[Encoding]:
-        """ Encode the given sequences or pair of sequences
+        """ Encode the given inputs. This method accepts both raw string sequences and
+        already pre-tokenized sequences.

         Args:
-            sequences: List[Union[str, Tuple[str, str]]]:
-                A list of sequences or pair of sequences. The list can contain both
-                at the same time.
+            inputs: List[EncodeInput]:
+                A list of inputs to encode. Each input can be either:
+                    - a single sequence: InputSequence
+                    - a pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can in turn be either:
+                    - a raw string: str
+                    - a pre-tokenized string: List[str]

             add_special_tokens: bool:
-                Whether to add the special tokens while encoding
+                Whether to add the special tokens while encoding.

         Returns:
             A list of Encoding
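
Since each element of the batch is an independent EncodeInput, raw,
pre-tokenized, and paired inputs can be mixed in a single call (a sketch, with
the same hypothetical `tokenizer` as above):

    encodings = tokenizer.encode_batch(
        [
            "A single raw sequence",
            ["A", "pre-tokenized", "sequence"],
            ("A first sequence", "and its pair"),
        ]
    )
    assert len(encodings) == 3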

File 3 of 3

@@ -1,4 +1,4 @@
-from tokenizers import Tokenizer, Encoding, AddedToken
+from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
 from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets

 from typing import List, Union, Tuple, Optional, Dict
@@ -151,72 +151,18 @@ class BaseTokenizer:
         """
         return self._tokenizer.normalize(sequence)

-    def encode_tokenized(
-        self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
-    ) -> Encoding:
-        """ Encode the given sequence, skipping the Normalizer and PreTokenizer by providing
-        already tokenized substrings.
-
-        A sequence can either be:
-            - `TokenizedSequence`: (`List[str]`)
-            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
-              a Tuple[int, int].
-
-        If the Offsets are not provided, they will be automatically generated, assuming
-        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
-
-        Args:
-            sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
-                Either a TokenizedSequence or a TokenizedSequenceWithOffsets
-
-            type_id: int:
-                The type id of the given sequence
-
-        Returns:
-            An Encoding
-        """
-        return self._tokenizer.model.encode(sequence, type_id)
-
-    def encode_tokenized_batch(
-        self,
-        sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
-        type_id: int = 0,
-    ) -> List[Encoding]:
-        """ Encode the given batch of sequences, skipping the Normalizer and PreTokenizer by
-        providing already tokenized substrings.
-
-        A sequence can either be:
-            - `TokenizedSequence`: (`List[str]`)
-            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
-              a Tuple[int, int].
-
-        If the Offsets are not provided, they will be automatically generated, assuming
-        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
-
-        Args:
-            sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
-                A list of sequences. Each sequence is either a TokenizedSequence or a
-                TokenizedSequenceWithOffsets
-
-            type_id: int:
-                The type id of the given sequences
-
-        Returns:
-            A list of Encoding
-        """
-        return self._tokenizer.model.encode_batch(sequences, type_id)
-
-    def encode(
-        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
-    ) -> Encoding:
-        """ Encode the given sequence
-
-        Args:
-            sequence: str:
-                The sequence to encode
-
-            pair: (`optional`) Optional[str]:
-                The optional pair sequence
+    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
+        """ Encode the given input. This method accepts both raw string sequences and
+        already pre-tokenized sequences.
+
+        Args:
+            input: EncodeInput:
+                The content to encode. This can be either:
+                    - a single sequence: InputSequence
+                    - a pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can in turn be either:
+                    - a raw string: str
+                    - a pre-tokenized string: List[str]

             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
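
The removed encode_tokenized path appears to be subsumed by the new encode:
passing a List[str] hands the tokenizer caller-provided tokens directly, with
offsets derived internally instead of being supplied through
TokenizedSequenceWithOffsets. A hedged before/after sketch (note that the
explicit type_id parameter has no direct equivalent in the new signature):

    # Before this commit (method removed above):
    #   encoding = tokenizer.encode_tokenized(["Hello", "world"], type_id=0)

    # After this commit:
    encoding = tokenizer.encode(["Hello", "world"])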
@@ -224,20 +170,25 @@ class BaseTokenizer:

         Returns:
             An Encoding
         """
-        if sequence is None:
+        if input is None:
             raise ValueError("None input is not valid. Should be a string.")

-        return self._tokenizer.encode(sequence, pair, add_special_tokens)
+        return self._tokenizer.encode(input, add_special_tokens)

     def encode_batch(
-        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+        self, inputs: List[EncodeInput], add_special_tokens: bool = True
     ) -> List[Encoding]:
-        """ Encode the given sequences or pair of sequences
+        """ Encode the given inputs. This method accepts both raw string sequences and
+        already pre-tokenized sequences.

         Args:
-            sequences: List[Union[str, Tuple[str, str]]]:
-                A list of sequences or pair of sequences. The list can contain both
-                at the same time.
+            inputs: List[EncodeInput]:
+                A list of inputs to encode. Each input can be either:
+                    - a single sequence: InputSequence
+                    - a pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can in turn be either:
+                    - a raw string: str
+                    - a pre-tokenized string: List[str]

             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
@@ -246,12 +197,12 @@ class BaseTokenizer:

         Returns:
             A list of Encoding
         """
-        if sequences is None:
+        if inputs is None:
             raise ValueError(
                 "None input is not valid. Should be a list of strings or a list of tuple of strings."
             )

-        return self._tokenizer.encode_batch(sequences, add_special_tokens)
+        return self._tokenizer.encode_batch(inputs, add_special_tokens)

     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """ Decode the given list of ids to a string sequence