Mirror of https://github.com/mii443/tokenizers.git
Python - Update typings for new encode
@@ -1,8 +1,10 @@
 __version__ = "0.7.0"
 
-from typing import Tuple
+from typing import Tuple, Union, List
 
 Offsets = Tuple[int, int]
+InputSequence = Union[str, List[str]]
+EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]
 
 from .tokenizers import Tokenizer, Encoding, AddedToken
 from .tokenizers import decoders
@@ -15,6 +15,8 @@ from .implementations import (
 from typing import Optional, Union, List, Tuple
 
 Offsets = Tuple[int, int]
+InputSequence = Union[str, List[str]]
+EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]
 
 class Encoding:
     """ An Encoding as returned by the Tokenizer """
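As an aside, a minimal sketch of the values these new aliases admit; the example data is illustrative and not part of the commit:

from typing import List, Tuple, Union

# The same aliases as introduced above.
InputSequence = Union[str, List[str]]
EncodeInput = Union[InputSequence, Tuple[InputSequence, InputSequence]]

# A raw string is a valid InputSequence...
single: EncodeInput = "Hello, world!"
# ...and so is an already pre-tokenized list of words.
pre_tokenized: EncodeInput = ["Hello", ",", "world", "!"]
# A pair combines any two InputSequences, mixed forms included.
pair: EncodeInput = ("Hello, world!", ["How", "are", "you", "?"])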
@@ -369,37 +371,43 @@ class Tokenizer:
             The normalized string
         """
         pass
-    def encode(
-        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
-    ) -> Encoding:
-        """ Encode the given sequence
+    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
+        """ Encode the given input. This method accepts both string sequences and already
+        pre-tokenized sequences.
 
         Args:
-            sequence: str:
-                The sequence to encode
-
-            pair: (`optional`) Optional[str]:
-                The optional pair sequence
+            input: EncodeInput:
+                The content to encode. This can be either:
+                    - A single sequence: InputSequence
+                    - A pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can be either:
+                    - A string: str
+                    - A pre-tokenized string: List[str]
 
             add_special_tokens: bool:
-                Whether to add the special tokens while encoding
+                Whether to add the special tokens while encoding.
 
         Returns:
             An Encoding
         """
         pass
     def encode_batch(
-        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+        self, inputs: List[EncodeInput], add_special_tokens: bool = True
     ) -> List[Encoding]:
-        """ Encode the given sequences or pair of sequences
+        """ Encode the given inputs. This method accepts both string sequences and already
+        pre-tokenized sequences.
 
         Args:
-            sequences: List[Union[str, Tuple[str, str]]]:
-                A list of sequences or pair of sequences. The list can contain both
-                at the same time.
+            inputs: List[EncodeInput]:
+                A list of inputs to encode. Each input can be either:
+                    - A single sequence: InputSequence
+                    - A pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can be either:
+                    - A string: str
+                    - A pre-tokenized string: List[str]
 
             add_special_tokens: bool:
-                Whether to add the special tokens while encoding
+                Whether to add the special tokens while encoding.
 
         Returns:
             A list of Encoding
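To make the signature change concrete, here is a sketch of the call sites it enables. It assumes a `Tokenizer` instance built elsewhere (construction is untouched by this commit) and targets the API as defined in this diff, not necessarily any later release:

from tokenizers import Tokenizer, Encoding

def encode_examples(tokenizer: Tokenizer) -> Encoding:
    # Previously: tokenizer.encode("Hello, world!", pair="How are you?")
    # Now a single `input` argument covers every case.

    # 1. A plain string, exactly as before.
    enc: Encoding = tokenizer.encode("Hello, world!")

    # 2. A pre-tokenized sequence, passed as a list of words.
    enc = tokenizer.encode(["Hello", ",", "world", "!"])

    # 3. A pair of sequences, given as one tuple instead of a `pair`
    #    argument; the two members may mix raw and pre-tokenized forms.
    enc = tokenizer.encode(("Hello, world!", ["How", "are", "you", "?"]))
    return enc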
@ -1,4 +1,4 @@
|
|||||||
from tokenizers import Tokenizer, Encoding, AddedToken
|
from tokenizers import Tokenizer, Encoding, AddedToken, InputSequence, EncodeInput
|
||||||
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
|
from tokenizers.models import TokenizedSequence, TokenizedSequenceWithOffsets
|
||||||
|
|
||||||
from typing import List, Union, Tuple, Optional, Dict
|
from typing import List, Union, Tuple, Optional, Dict
|
||||||
@@ -151,72 +151,18 @@ class BaseTokenizer:
         """
         return self._tokenizer.normalize(sequence)
 
-    def encode_tokenized(
-        self, sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets], type_id: int = 0
-    ) -> Encoding:
-        """ Encode the given sequence. Let us skip the Normalizer and PreTokenizer by providing
-        already tokenized substrings.
-
-        A sequence can either be:
-            - `TokenizedSequence`: (`List[str]`)
-            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
-              a Tuple[int, int].
-
-        If the Offsets are not provided, they will be automatically generated, making the hypothesis
-        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
-
-        Args:
-            sequence: Union[TokenizedSequence, TokenizedSequenceWithOffsets]
-                Either a TokenizedSequence or a TokenizedSequenceWithOffsets
-
-            type_id: int:
-                The type id of the given sequence
-
-        Returns:
-            An Encoding
-        """
-        return self._tokenizer.model.encode(sequence, type_id)
-
-    def encode_tokenized_batch(
-        self,
-        sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]],
-        type_id: int = 0,
-    ) -> List[Encoding]:
-        """ Encode the given batch of sequences. Let us skip the Normalizer and PreTokenizer by
-        providing already tokenized substrings.
-
-        A sequence can either be:
-            - `TokenizedSequence`: (`List[str]`)
-            - `TokenizedSequenceWithOffsets`: (`List[Tuple[str, Offsets]]`) where Offsets is
-              a Tuple[int, int].
-
-        If the Offsets are not provided, they will be automatically generated, making the hypothesis
-        that all the tokens in the `TokenizedSequence` are contiguous in the original string.
-
-        Args:
-            sequences: Union[List[TokenizedSequence], List[TokenizedSequenceWithOffsets]]
-                A list of sequences. Each sequence is either a TokenizedSequence or a
-                TokenizedSequenceWithOffsets
-
-            type_id: int:
-                The type id of the given sequence
-
-        Returns:
-            A list of Encoding
-        """
-        return self._tokenizer.model.encode_batch(sequences, type_id)
-
-    def encode(
-        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
-    ) -> Encoding:
-        """ Encode the given sequence
-
-        Args:
-            sequence: str:
-                The sequence to encode
-
-            pair: (`optional`) Optional[str]:
-                The optional pair sequence
+    def encode(self, input: EncodeInput, add_special_tokens: bool = True) -> Encoding:
+        """ Encode the given input. This method accepts both string sequences and already
+        pre-tokenized sequences.
+
+        Args:
+            input: EncodeInput:
+                The content to encode. This can be either:
+                    - A single sequence: InputSequence
+                    - A pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can be either:
+                    - A string: str
+                    - A pre-tokenized string: List[str]
 
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
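Since `encode_tokenized` is deleted here, a short migration sketch may help. Note that the new `encode` has no `type_id` parameter, so that part of the old call has no direct counterpart in this diff; the function name below is hypothetical:

from typing import List
from tokenizers import Tokenizer, Encoding

def migrate_encode_tokenized(tokenizer: Tokenizer, words: List[str]) -> Encoding:
    # Before this commit (method removed above):
    #     return tokenizer.encode_tokenized(words, type_id=0)
    # After: pre-tokenized input is just another EncodeInput.
    return tokenizer.encode(words)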
@@ -224,20 +170,25 @@ class BaseTokenizer:
         Returns:
             An Encoding
         """
-        if sequence is None:
+        if input is None:
             raise ValueError("None input is not valid. Should be a string.")
 
-        return self._tokenizer.encode(sequence, pair, add_special_tokens)
+        return self._tokenizer.encode(input, add_special_tokens)
 
     def encode_batch(
-        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+        self, inputs: List[EncodeInput], add_special_tokens: bool = True
     ) -> List[Encoding]:
-        """ Encode the given sequences or pair of sequences
+        """ Encode the given inputs. This method accepts both string sequences and already
+        pre-tokenized sequences.
 
         Args:
-            sequences: List[Union[str, Tuple[str, str]]]:
-                A list of sequences or pair of sequences. The list can contain both
-                at the same time.
+            inputs: List[EncodeInput]:
+                A list of inputs to encode. Each input can be either:
+                    - A single sequence: InputSequence
+                    - A pair of sequences: Tuple[InputSequence, InputSequence]
+                An InputSequence can be either:
+                    - A string: str
+                    - A pre-tokenized string: List[str]
 
             add_special_tokens: bool:
                 Whether to add the special tokens while encoding.
@@ -246,12 +197,12 @@ class BaseTokenizer:
             A list of Encoding
         """
 
-        if sequences is None:
+        if inputs is None:
             raise ValueError(
                 "None input is not valid. Should be a list of strings or a list of tuple of strings."
             )
 
-        return self._tokenizer.encode_batch(sequences, add_special_tokens)
+        return self._tokenizer.encode_batch(inputs, add_special_tokens)
 
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """ Decode the given list of ids to a string sequence
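Finally, a sketch of `encode_batch` under the new typing, again assuming an existing `Tokenizer` and the API exactly as defined in this diff:

from typing import List
from tokenizers import Tokenizer, Encoding

def batch_examples(tokenizer: Tokenizer) -> List[Encoding]:
    # A single batch may now mix raw, pre-tokenized, and paired inputs.
    return tokenizer.encode_batch(
        [
            "Hello, world!",                    # single raw sequence
            ["Hello", ",", "world", "!"],       # pre-tokenized sequence
            ("Hello, world!", "How are you?"),  # pair of sequences
        ]
    )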