Python - Improve typings

2025-12-07 13:18:31 +00:00 · 2020-02-10 13:53:07 -05:00
parent dd9270a406
commit 43a989775e
4 changed files with 22 additions and 10 deletions
--- a/bindings/python/tokenizers/init.pyi
+++ b/bindings/python/tokenizers/init.pyi
@@ -21,6 +21,14 @@ class IndexableString:
    Works almost like a `str`, but allows indexing on offsets
    provided on an `Encoding`
    """
+
+    def offsets(self, offsets: Tuple[int, int]) -> Optional[Tuple[int, int]]:
+        """ Convert the Encoding's offsets to the current string.
+
+        `Encoding` provides a list of offsets that actually refer to the normalized
+        version of the text. Calling this method with the offsets provided by `Encoding`
+        ensures that the returned offsets can be used to index the `str` directly.
+        """
        pass

 class Encoding:
@@ -53,7 +61,11 @@ class Encoding:

    @property
    def offsets(self) -> List[Offsets]:
-        """ The offsets """
+        """ The offsets.
+        These offsets can be used to index any `IndexableString` directly. If you want to
+        index the original `str`, first convert them by calling the `.offsets` method
+        on the `original_str`.
+        """
        pass

    @property
--- a/bindings/python/tokenizers/decoders/init.pyi
+++ b/bindings/python/tokenizers/decoders/init.pyi
@@ -11,14 +11,14 @@ class Decoder:
        """ Decode the given list of string to a final string """
        pass

-class ByteLevel:
+class ByteLevel(Decoder):
    """ ByteLevel Decoder """

    def __init__(self) -> None:
        """ Instantiate a new ByteLevel Decoder """
        pass

-class WordPiece:
+class WordPiece(Decoder):
    """ WordPiece Decoder """

    @staticmethod
@@ -31,7 +31,7 @@ class WordPiece:
        """
        pass

-class Metaspace:
+class Metaspace(Decoder):
    """ Metaspace decoder """

    def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
@@ -48,7 +48,7 @@ class Metaspace:
        """
        pass

-class BPEDecoder:
+class BPEDecoder(Decoder):
    """ BPEDecoder """

    def __init__(self, suffix: str = "</w>") -> None:
--- a/bindings/python/tokenizers/processors/init.pyi
+++ b/bindings/python/tokenizers/processors/init.pyi
@@ -7,7 +7,7 @@ class PostProcessor:
    a PostProcessor will return an instance of this class when instantiated.
    """

-class BertProcessing:
+class BertProcessing(PostProcessor):
    """ BertProcessing

    This post-processor takes care of adding the special tokens needed by
@@ -31,7 +31,7 @@ class BertProcessing:
        """
        pass

-class RobertaProcessing:
+class RobertaProcessing(PostProcessor):
    """ RobertaProcessing

    This post-processor takes care of adding the special tokens needed by
--- a/bindings/python/tokenizers/trainers/init.pyi
+++ b/bindings/python/tokenizers/trainers/init.pyi
@@ -7,7 +7,7 @@ class Trainer:
    Trainer will return an instance of this class when instantiated.
    """

-class BpeTrainer:
+class BpeTrainer(Trainer):
    """ BpeTrainer

    Capable of training a BPE model
@@ -59,7 +59,7 @@ class BpeTrainer:
        """
        pass

-class WordPieceTrainer:
+class WordPieceTrainer(Trainer):
    """ WordPieceTrainer

    Capable of training a WordPiece model