Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-04 19:58:21 +00:00)

Commit: Python - Update typings for PreTokenizedString

@@ -1,6 +1,7 @@
 __version__ = "0.9.0.dev1"

 from typing import Tuple, Union, Tuple, List
+from enum import Enum

 Offsets = Tuple[int, int]

@@ -14,7 +15,26 @@ PreTokenizedEncodeInput = Union[
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
 EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]

-from .tokenizers import Tokenizer, Encoding, AddedToken, Regex, NormalizedString, PreTokenizedString
+class OffsetReferential(Enum):
+    ORIGINAL = "original"
+    NORMALIZED = "normalized"
+
+
+class OffsetType(Enum):
+    BYTE = "byte"
+    CHAR = "char"
+
+
+from .tokenizers import (
+    Tokenizer,
+    Encoding,
+    AddedToken,
+    Regex,
+    NormalizedString,
+    PreTokenizedString,
+    Token,
+)
 from .tokenizers import decoders
 from .tokenizers import models
 from .tokenizers import normalizers

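The two enums introduced above are plain Python Enum subclasses with string values, so they can be inspected directly from the top level of the package. A minimal check, assuming a build that includes this commit:

    from tokenizers import OffsetReferential, OffsetType

    print(OffsetReferential.ORIGINAL.value)  # "original"
    print(OffsetType.CHAR.value)             # "char"
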
@@ -12,7 +12,8 @@ from .implementations import (
     BertWordPieceTokenizer as BertWordPieceTokenizer,
 )

-from typing import Optional, Union, List, Tuple
+from typing import Optional, Union, List, Tuple, Callable
+from enum import Enum

 Offsets = Tuple[int, int]

@@ -26,6 +27,114 @@ PreTokenizedEncodeInput = Union[
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
 EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]

+class OffsetReferential(Enum):
+    ORIGINAL = "original"
+    NORMALIZED = "normalized"
+
+class OffsetType(Enum):
+    BYTE = "byte"
+    CHAR = "char"
+
+class Token:
+    id: int
+    token: str
+    offsets: Offsets
+
+Split = Tuple[str, Offsets, List[Token]]
+
+class PreTokenizedString:
+    """ PreTokenizedString
+
+    Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
+    underlying string, while keeping track of the alignment information (offsets).
+
+    The PreTokenizedString manages what we call `splits`. Each split represents a substring
+    which is a subpart of the original string, with the relevant offsets and tokens.
+
+    When calling one of the methods used to modify the PreTokenizedString (namely one of
+    `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
+    tokens will get modified.
+    """
+
+    def __new__(sequence: str) -> PreTokenizedString:
+        """ Instantiate a new PreTokenizedString using the given str
+
+        Args:
+            sequence: str:
+                The string sequence used to initialize this PreTokenizedString
+        """
+        pass
+    def split(self, func: Callable[[NormalizedString], List[NormalizedString]]):
+        """ Split the PreTokenizedString using the given `func`
+
+        Args:
+            func: Callable[[NormalizedString], List[NormalizedString]]:
+                The function used to split each underlying split.
+                It is expected to return a list of `NormalizedString`, that represent the new
+                splits. If the given `NormalizedString` does not need any splitting, we can
+                just return it directly.
+                In order for the offsets to be tracked accurately, any returned `NormalizedString`
+                should come from calling either `.split` or `.slice` on the received one.
+        """
+        pass
+    def normalize(self, func: Callable[[NormalizedString], None]):
+        """ Normalize each split of the `PreTokenizedString` using the given `func`
+
+        Args:
+            func: Callable[[NormalizedString], None]:
+                The function used to normalize each underlying split. This function
+                does not need to return anything, just calling the methods on the provided
+                NormalizedString allows its modification.
+        """
+        pass
+    def tokenize(self, func: Callable[[str], List[Token]]):
+        """ Tokenize each split of the `PreTokenizedString` using the given `func`
+
+        Args:
+            func: Callable[[str], List[Token]]:
+                The function used to tokenize each underlying split. This function must return
+                a list of Token generated from the input str.
+        """
+        pass
+    def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
+        """ Return an Encoding generated from this PreTokenizedString
+
+        Args:
+            type_id: int = 0:
+                The type_id to be used on the generated Encoding.
+
+            word_idx: Optional[int] = None:
+                An optional word index to be used for each token of this Encoding. If provided,
+                all the word indices in the generated Encoding will use this value, instead
+                of the one automatically tracked during pre-tokenization.
+
+        Returns:
+            An Encoding
+        """
+        pass
+    def get_splits(
+        self,
+        offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
+        offset_type: OffsetType = OffsetType.CHAR,
+    ) -> List[Split]:
+        """ Get the splits currently managed by the PreTokenizedString
+
+        Args:
+            offset_referential: OffsetReferential:
+                Whether the returned splits should have offsets expressed relative
+                to the original string, or the normalized one.
+
+            offset_type: OffsetType:
+                Whether the returned splits should have offsets expressed in bytes or chars.
+                When slicing an str, we usually want to use chars, which is the default value.
+                Now in some cases it might be interesting to get these offsets expressed in bytes,
+                so it is possible to change this here.
+
+        Returns:
+            A list of splits
+        """
+        pass
+
 class Regex:
     """ A Regex """

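To make these stubs concrete, here is a minimal sketch of driving a PreTokenizedString by hand with custom callbacks, assuming a package built from this commit. The callback names, the placeholder token id, and the Token(id, value, offsets) constructor call are illustrative assumptions, not part of the diff above:

    from tokenizers import NormalizedString, PreTokenizedString, Token

    def keep_whole(normalized: NormalizedString):
        # A split() callback may return the received NormalizedString as-is when no
        # further splitting is needed; a real pre-tokenizer would derive new pieces
        # from it via .split or .slice so that offsets stay aligned.
        return [normalized]

    def lowercase(normalized: NormalizedString):
        # normalize() callbacks mutate the NormalizedString in place.
        normalized.lowercase()

    def one_token(text: str):
        # tokenize() callbacks return a list of Token for each split; id 0 and the
        # full-span offsets are placeholders, not a real vocabulary lookup.
        return [Token(0, text, (0, len(text)))]

    pretok = PreTokenizedString("Hello World")
    pretok.split(keep_whole)
    pretok.normalize(lowercase)
    pretok.tokenize(one_token)

    # Defaults: offsets relative to the original string, expressed in chars.
    print(pretok.get_splits())
    print(pretok.to_encoding(type_id=0).tokens)
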
@@ -1,3 +1,4 @@
+from .. import PreTokenizedString
 from typing import Optional, List, Tuple

 Offsets = Tuple[int, int]
@@ -9,7 +10,10 @@ class PreTokenizer:
     PreTokenizer will return an instance of this class when instantiated.
     """

-    def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
+    def pre_tokenize(self, pretokenized: PreTokenizedString):
+        """ Pre tokenize the given PreTokenizedString in-place """
+        pass
+    def pre_tokenize_str(self, sequence: str) -> List[Tuple[str, Offsets]]:
         """ Pre tokenize the given sequence """
         pass

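With this signature change, pre_tokenize mutates a PreTokenizedString in place, while the new pre_tokenize_str keeps the old list-of-(piece, offsets) behaviour. A short sketch of the intended usage, assuming the bindings match these stubs; the Whitespace pre-tokenizer and the sample output are illustrative only:

    from tokenizers import PreTokenizedString
    from tokenizers.pre_tokenizers import Whitespace

    pre_tok = Whitespace()

    # Convenience entry point: returns the pieces and their offsets directly.
    print(pre_tok.pre_tokenize_str("Hello world"))
    # e.g. [("Hello", (0, 5)), ("world", (6, 11))]

    # In-place entry point: refines a PreTokenizedString that keeps tracking offsets.
    pretok = PreTokenizedString("Hello world")
    pre_tok.pre_tokenize(pretok)
    print(pretok.get_splits())
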
@@ -132,6 +132,7 @@ fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {

     m.add_class::<tokenizer::PyTokenizer>()?;
     m.add_class::<tokenizer::PyAddedToken>()?;
+    m.add_class::<token::PyToken>()?;
     m.add_class::<encoding::PyEncoding>()?;
     m.add_class::<utils::PyRegex>()?;
     m.add_class::<utils::PyNormalizedString>()?;

@@ -136,8 +136,25 @@ pub struct PyPreTokenizedString {
     pub(crate) pretok: tk::PreTokenizedString,
 }

+impl From<PreTokenizedString> for PyPreTokenizedString {
+    fn from(pretok: PreTokenizedString) -> Self {
+        Self { pretok }
+    }
+}
+
+impl From<PyPreTokenizedString> for PreTokenizedString {
+    fn from(pretok: PyPreTokenizedString) -> Self {
+        pretok.pretok
+    }
+}
+
 #[pymethods]
 impl PyPreTokenizedString {
+    #[new]
+    fn new(s: &str) -> Self {
+        PreTokenizedString::from(s).into()
+    }
+
     fn split(&mut self, func: &PyAny) -> PyResult<()> {
         split(&mut self.pretok, func)
     }

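The From conversions above mean the Python-facing object wraps the Rust PreTokenizedString directly, and the new #[new] makes it constructible straight from Python. As a consequence, built-in components and custom Python callbacks can refine the same instance in place; a hedged sketch, with Whitespace used only as an example component:

    from tokenizers import NormalizedString, PreTokenizedString
    from tokenizers.pre_tokenizers import Whitespace

    pretok = PreTokenizedString("Hello Brave New World")
    # A built-in pre-tokenizer refines the wrapped Rust PreTokenizedString in place...
    Whitespace().pre_tokenize(pretok)
    # ...and a custom Python callback can keep working on the very same object.
    pretok.normalize(lambda normalized: normalized.lowercase())
    print(pretok.get_splits())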