Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-04 03:38:23 +00:00)
Python - Update typings for PreTokenizedString
@@ -1,6 +1,7 @@
 __version__ = "0.9.0.dev1"
 
 from typing import Tuple, Union, Tuple, List
+from enum import Enum
 
 Offsets = Tuple[int, int]
 
@@ -14,7 +15,26 @@ PreTokenizedEncodeInput = Union[
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
 EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
 
-from .tokenizers import Tokenizer, Encoding, AddedToken, Regex, NormalizedString, PreTokenizedString
+
+class OffsetReferential(Enum):
+    ORIGINAL = "original"
+    NORMALIZED = "normalized"
+
+
+class OffsetType(Enum):
+    BYTE = "byte"
+    CHAR = "char"
+
+
+from .tokenizers import (
+    Tokenizer,
+    Encoding,
+    AddedToken,
+    Regex,
+    NormalizedString,
+    PreTokenizedString,
+    Token,
+)
 from .tokenizers import decoders
 from .tokenizers import models
 from .tokenizers import normalizers
@@ -12,7 +12,8 @@ from .implementations import (
     BertWordPieceTokenizer as BertWordPieceTokenizer,
 )
 
-from typing import Optional, Union, List, Tuple
+from typing import Optional, Union, List, Tuple, Callable
+from enum import Enum
 
 Offsets = Tuple[int, int]
 
@@ -26,6 +27,114 @@ PreTokenizedEncodeInput = Union[
 InputSequence = Union[TextInputSequence, PreTokenizedInputSequence]
 EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput]
 
+class OffsetReferential(Enum):
+    ORIGINAL = "original"
+    NORMALIZED = "normalized"
+
+class OffsetType(Enum):
+    BYTE = "byte"
+    CHAR = "char"
+
+class Token:
+    id: int
+    token: str
+    offsets: Offsets
+
+Split = Tuple[str, Offsets, List[Token]]
+
+class PreTokenizedString:
+    """ PreTokenizedString
+
+    Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the
+    underlying string, while keeping track of the alignment information (offsets).
+
+    The PreTokenizedString manages what we call `splits`. Each split represents a substring
+    which is a subpart of the original string, with the relevant offsets and tokens.
+
+    When calling one of the methods used to modify the PreTokenizedString (namely one of
+    `split`, `normalize` or `tokenize`), only the `splits` that don't have any associated
+    tokens will get modified.
+    """
+
+    def __new__(sequence: str) -> PreTokenizedString:
+        """ Instantiate a new PreTokenizedString using the given str
+
+        Args:
+            sequence: str:
+                The string sequence used to initialize this PreTokenizedString
+        """
+        pass
+    def split(self, func: Callable[[NormalizedString], List[NormalizedString]]):
+        """ Split the PreTokenizedString using the given `func`
+
+        Args:
+            func: Callable[[NormalizedString], List[NormalizedString]]:
+                The function used to split each underlying split.
+                It is expected to return a list of `NormalizedString`, that represent the new
+                splits. If the given `NormalizedString` does not need any splitting, we can
+                just return it directly.
+                In order for the offsets to be tracked accurately, any returned `NormalizedString`
+                should come from calling either `.split` or `.slice` on the received one.
+        """
+        pass
+    def normalize(self, func: Callable[[NormalizedString], None]):
+        """ Normalize each split of the `PreTokenizedString` using the given `func`
+
+        Args:
+            func: Callable[[NormalizedString], None]:
+                The function used to normalize each underlying split. This function
+                does not need to return anything, just calling the methods on the provided
+                NormalizedString allows its modification.
+        """
+        pass
+    def tokenize(self, func: Callable[[str], List[Token]]):
+        """ Tokenize each split of the `PreTokenizedString` using the given `func`
+
+        Args:
+            func: Callable[[str], List[Token]]:
+                The function used to tokenize each underlying split. This function must return
+                a list of Token generated from the input str.
+        """
+        pass
+    def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding:
+        """ Return an Encoding generated from this PreTokenizedString
+
+        Args:
+            type_id: int = 0:
+                The type_id to be used on the generated Encoding.
+
+            word_idx: Optional[int] = None:
+                An optional word index to be used for each token of this Encoding. If provided,
+                all the word indices in the generated Encoding will use this value, instead
+                of the one automatically tracked during pre-tokenization.
+
+        Returns:
+            An Encoding
+        """
+        pass
+    def get_splits(
+        self,
+        offset_referential: OffsetReferential = OffsetReferential.ORIGINAL,
+        offset_type: OffsetType = OffsetType.CHAR,
+    ) -> List[Split]:
+        """ Get the splits currently managed by the PreTokenizedString
+
+        Args:
+            offset_referential: OffsetReferential:
+                Whether the returned splits should have offsets expressed relative
+                to the original string, or the normalized one.
+
+            offset_type: OffsetType:
+                Whether the returned splits should have offsets expressed in bytes or chars.
+                When slicing a str, we usually want to use chars, which is the default value.
+                Now in some cases it might be interesting to get these offsets expressed in bytes,
+                so it is possible to change this here.
+
+        Returns:
+            A list of splits
+        """
+        pass
 
 class Regex:
     """ A Regex """
@@ -1,3 +1,4 @@
+from .. import PreTokenizedString
 from typing import Optional, List, Tuple
 
 Offsets = Tuple[int, int]
@@ -9,7 +10,10 @@ class PreTokenizer:
     PreTokenizer will return an instance of this class when instantiated.
     """
 
-    def pre_tokenize(self, sequence: str) -> List[Tuple[str, Offsets]]:
+    def pre_tokenize(self, pretokenized: PreTokenizedString):
+        """ Pre tokenize the given PreTokenizedString in-place """
+        pass
+    def pre_tokenize_str(self, sequence: str) -> List[Tuple[str, Offsets]]:
         """ Pre tokenize the given sequence """
         pass
 
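The old `pre_tokenize(sequence: str)` behaviour now lives in `pre_tokenize_str`, while `pre_tokenize` itself works in place on a `PreTokenizedString`. A short sketch of the difference (not part of this diff; it assumes the built-in `Whitespace` pre-tokenizer and that its output looks roughly as shown):

from tokenizers import PreTokenizedString
from tokenizers.pre_tokenizers import Whitespace

pre_tokenizer = Whitespace()

# pre_tokenize_str works on a plain str and returns (substring, offsets) pairs.
print(pre_tokenizer.pre_tokenize_str("Hello there"))
# e.g. [('Hello', (0, 5)), ('there', (6, 11))]

# pre_tokenize modifies a PreTokenizedString in place, so offset tracking
# continues through the rest of the pipeline.
pretok = PreTokenizedString("Hello there")
pre_tokenizer.pre_tokenize(pretok)
print(pretok.get_splits())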
@@ -132,6 +132,7 @@ fn tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
 
     m.add_class::<tokenizer::PyTokenizer>()?;
    m.add_class::<tokenizer::PyAddedToken>()?;
+    m.add_class::<token::PyToken>()?;
     m.add_class::<encoding::PyEncoding>()?;
     m.add_class::<utils::PyRegex>()?;
     m.add_class::<utils::PyNormalizedString>()?;
@@ -136,8 +136,25 @@ pub struct PyPreTokenizedString {
    pub(crate) pretok: tk::PreTokenizedString,
}

impl From<PreTokenizedString> for PyPreTokenizedString {
    fn from(pretok: PreTokenizedString) -> Self {
        Self { pretok }
    }
}

impl From<PyPreTokenizedString> for PreTokenizedString {
    fn from(pretok: PyPreTokenizedString) -> Self {
        pretok.pretok
    }
}

#[pymethods]
impl PyPreTokenizedString {
    #[new]
    fn new(s: &str) -> Self {
        PreTokenizedString::from(s).into()
    }

    fn split(&mut self, func: &PyAny) -> PyResult<()> {
        split(&mut self.pretok, func)
    }