Python - Update typings for PreTokenizedString

Anthony MOI
2020-09-21 16:26:16 -04:00
committed by Anthony MOI
parent b1097a988f
commit 0b448f46d4
5 changed files with 154 additions and 3 deletions

View File

@@ -1,6 +1,7 @@
__version__ = "0.9.0.dev1"

from typing import Tuple, Union, Tuple, List
+from enum import Enum

Offsets = Tuple[int, int]
@@ -14,7 +15,26 @@ PreTokenizedEncodeInput = Union[
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]

-from .tokenizers import Tokenizer, Encoding, AddedToken, Regex, NormalizedString, PreTokenizedString
+class OffsetReferential(Enum):
+    ORIGINAL = "original"
+    NORMALIZED = "normalized"
+
+class OffsetType(Enum):
+    BYTE = "byte"
+    CHAR = "char"
+
+from .tokenizers import (
+    Tokenizer,
+    Encoding,
+    AddedToken,
+    Regex,
+    NormalizedString,
+    PreTokenizedString,
+    Token,
+)
from .tokenizers import decoders
from .tokenizers import models
from .tokenizers import normalizers
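
With these exports in place, the new names become importable from the package root. A quick sketch (not part of the commit, assuming a build that includes this change):

from tokenizers import Token, PreTokenizedString, OffsetReferential, OffsetType

# Values as defined in the enums above.
print(OffsetReferential.ORIGINAL.value)  # "original"
print(OffsetType.CHAR.value)             # "char"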

View File

@@ -12,7 +12,8 @@ from .implementations import (
    BertWordPieceTokenizer as BertWordPieceTokenizer,
)

-from typing import Optional, Union, List, Tuple
+from typing import Optional, Union, List, Tuple, Callable
+from enum import Enum

Offsets = Tuple[int, int]
@@ -26,6 +27,114 @@ PreTokenizedEncodeInput = Union[
InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]

+class OffsetReferential(Enum):
+    ORIGINAL = "original"
+    NORMALIZED = "normalized"
+
+class OffsetType(Enum):
+    BYTE = "byte"
+    CHAR = "char"
+
+class Token:
+    id: int
+    token: str
+    offsets: Offsets
+
+Split = Tuple[str, Offsets, List[Token]]
+
+class PreTokenizedString:
+    """ PreTokenizedString
+
+    Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
+    underlying string, while keeping track of the alignment information (offsets).
+
+    The PreTokenizedString manages what we call `splits`. Each split represents a substring
+    which is a subpart of the original string, with the relevant offsets and tokens.
+
+    When calling one of the methods used to modify the PreTokenizedString (namely one of
+    `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
+    tokens will get modified.
+    """
+
+    def __new__(sequence: str) -> PreTokenizedString:
+        """ Instantiate a new PreTokenizedString using the given str
+
+        Args:
+            sequence: str:
+                The string sequence used to initialize this PreTokenizedString
+        """
+        pass
+
+    def split(self, func: Callable[[NormalizedString], List[NormalizedString]]):
+        """ Split the PreTokenizedString using the given `func`
+
+        Args:
+            func: Callable[[NormalizedString], List[NormalizedString]]:
+                The function used to split each underlying split.
+                It is expected to return a list of `NormalizedString`, that represent the new
+                splits. If the given `NormalizedString` does not need any splitting, we can
+                just return it directly.
+                In order for the offsets to be tracked accurately, any returned `NormalizedString`
+                should come from calling either `.split` or `.slice` on the received one.
+        """
+        pass
+
+    def normalize(self, func: Callable[[NormalizedString], None]):
+        """ Normalize each split of the `PreTokenizedString` using the given `func`
+
+        Args:
+            func: Callable[[NormalizedString], None]:
+                The function used to normalize each underlying split. This function
+                does not need to return anything, just calling the methods on the provided
+                NormalizedString allows its modification.
+        """
+        pass
+
+    def tokenize(self, func: Callable[[str], List[Token]]):
+        """ Tokenize each split of the `PreTokenizedString` using the given `func`
+
+        Args:
+            func: Callable[[str], List[Token]]:
+                The function used to tokenize each underlying split. This function must return
+                a list of Token generated from the input str.
+        """
+        pass
+
+    def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
+        """ Return an Encoding generated from this PreTokenizedString
+
+        Args:
+            type_id: int = 0:
+                The type_id to be used on the generated Encoding.
+
+            word_idx: Optional[int] = None:
+                An optional word index to be used for each token of this Encoding. If provided,
+                all the word indices in the generated Encoding will use this value, instead
+                of the one automatically tracked during pre-tokenization.
+
+        Returns:
+            An Encoding
+        """
+        pass
+
+    def get_splits(
+        self,
+        offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
+        offset_type: OffsetType = OffsetType.CHAR,
+    ) -> List[Split]:
+        """ Get the splits currently managed by the PreTokenizedString
+
+        Args:
+            offset_referential: OffsetReferential:
+                Whether the returned splits should have offsets expressed relative
+                to the original string, or the normalized one.
+
+            offset_type: OffsetType:
+                Whether the returned splits should have offsets expressed in bytes or chars.
+                When slicing an str, we usually want to use chars, which is the default value.
+                Now in some cases it might be interesting to get these offsets expressed in bytes,
+                so it is possible to change this here.
+
+        Returns:
+            A list of splits
+        """
+        pass
+
class Regex:
    """ A Regex """

View File

@@ -1,3 +1,4 @@
+from .. import PreTokenizedString
from typing import Optional, List, Tuple

Offsets = Tuple[int, int]
@@ -9,7 +10,10 @@ class PreTokenizer:
    PreTokenizer will return an instance of this class when instantiated.
    """

-    def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
+    def pre_tokenize(self, pretokenized: PreTokenizedString):
+        """ Pre tokenize the given PreTokenizedString in-place """
+        pass
+
+    def pre_tokenize_str(self, sequence: str) -> List[Tuple[str, Offsets]]:
        """ Pre tokenize the given sequence """
        pass
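
So `pre_tokenize` now mutates a PreTokenizedString in place, while `pre_tokenize_str` keeps the previous string-in, pairs-out behaviour. A short sketch (not part of the commit), using the existing `Whitespace` pre-tokenizer and assuming a build with this change:

from tokenizers import PreTokenizedString
from tokenizers.pre_tokenizers import Whitespace

pre_tokenizer = Whitespace()

# Convenience path: plain str in, list of (piece, offsets) out.
print(pre_tokenizer.pre_tokenize_str("Hello World"))
# roughly: [('Hello', (0, 5)), ('World', (6, 11))]

# In-place path: the PreTokenizedString is modified and can be inspected afterwards.
pretok = PreTokenizedString("Hello World")
pre_tokenizer.pre_tokenize(pretok)
print(pretok.get_splits())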

View File

@@ -132,6 +132,7 @@ fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<tokenizer::PyTokenizer>()?;
    m.add_class::<tokenizer::PyAddedToken>()?;
+    m.add_class::<token::PyToken>()?;
    m.add_class::<encoding::PyEncoding>()?;
    m.add_class::<utils::PyRegex>()?;
    m.add_class::<utils::PyNormalizedString>()?;

View File

@@ -136,8 +136,25 @@ pub struct PyPreTokenizedString {
    pub(crate) pretok: tk::PreTokenizedString,
}

+impl From<PreTokenizedString> for PyPreTokenizedString {
+    fn from(pretok: PreTokenizedString) -> Self {
+        Self { pretok }
+    }
+}
+
+impl From<PyPreTokenizedString> for PreTokenizedString {
+    fn from(pretok: PyPreTokenizedString) -> Self {
+        pretok.pretok
+    }
+}
+
#[pymethods]
impl PyPreTokenizedString {
+    #[new]
+    fn new(s: &str) -> Self {
+        PreTokenizedString::from(s).into()
+    }
+
    fn split(&mut self, func: &PyAny) -> PyResult<()> {
        split(&mut self.pretok, func)
    }