diff --git a/bindings/python/tokenizers/__init__.pyi b/bindings/python/tokenizers/__init__.pyi index ef5612da..95332e51 100644 --- a/bindings/python/tokenizers/__init__.pyi +++ b/bindings/python/tokenizers/__init__.pyi @@ -21,7 +21,15 @@ class IndexableString: Works almost like a `str`, but allows indexing on offsets provided on an `Encoding` """ - pass + + def offsets(self, offsets: Tuple[int, int]) -> Optional[Tuple[int, int]]: + """ Convert the Encoding's offsets to the current string. + + `Encoding` provides a list of offsets that are actually offsets into the normalized + version of the text. Calling this method with the offsets provided by `Encoding` will make + sure that said offsets can be used to index the `str` directly. + """ + pass class Encoding: """ An Encoding as returned by the Tokenizer """ @@ -53,7 +61,11 @@ class Encoding: @property def offsets(self) -> List[Offsets]: - """ The offsets """ + """ The offsets. + These offsets can be used to index any `IndexableString` directly. If you want to + index the original `str`, make sure to retrieve the converted offsets using the `.offsets` + method on the `original_str`. 
+ """ pass @property diff --git a/bindings/python/tokenizers/decoders/__init__.pyi b/bindings/python/tokenizers/decoders/__init__.pyi index 4b04c3e1..e101aa25 100644 --- a/bindings/python/tokenizers/decoders/__init__.pyi +++ b/bindings/python/tokenizers/decoders/__init__.pyi @@ -11,14 +11,14 @@ class Decoder: """ Decode the given list of string to a final string """ pass -class ByteLevel: +class ByteLevel(Decoder): """ ByteLevel Decoder """ def __init__(self) -> None: """ Instantiate a new ByteLevel Decoder """ pass -class WordPiece: +class WordPiece(Decoder): """ WordPiece Decoder """ @staticmethod @@ -31,7 +31,7 @@ class WordPiece: """ pass -class Metaspace: +class Metaspace(Decoder): """ Metaspace decoder """ def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None: @@ -48,7 +48,7 @@ class Metaspace: """ pass -class BPEDecoder: +class BPEDecoder(Decoder): """ BPEDecoder """ def __init__(self, suffix: str = "") -> None: diff --git a/bindings/python/tokenizers/processors/__init__.pyi b/bindings/python/tokenizers/processors/__init__.pyi index 4c037eae..bf7f9f03 100644 --- a/bindings/python/tokenizers/processors/__init__.pyi +++ b/bindings/python/tokenizers/processors/__init__.pyi @@ -7,7 +7,7 @@ class PostProcessor: a PostProcessor will return an instance of this class when instantiated. 
""" -class BertProcessing: +class BertProcessing(PostProcessor): """ BertProcessing This post-processor takes care of adding the special tokens needed by @@ -31,7 +31,7 @@ class BertProcessing: """ pass -class RobertaProcessing: +class RobertaProcessing(PostProcessor): """ RobertaProcessing This post-processor takes care of adding the special tokens needed by diff --git a/bindings/python/tokenizers/trainers/__init__.pyi b/bindings/python/tokenizers/trainers/__init__.pyi index dae564ef..688ca85d 100644 --- a/bindings/python/tokenizers/trainers/__init__.pyi +++ b/bindings/python/tokenizers/trainers/__init__.pyi @@ -7,7 +7,7 @@ class Trainer: Trainer will return an instance of this class when instantiated. """ -class BpeTrainer: +class BpeTrainer(Trainer): """ BpeTrainer Capable of training a BPE model @@ -59,7 +59,7 @@ class BpeTrainer: """ pass -class WordPieceTrainer: +class WordPieceTrainer(Trainer): """ WordPieceTrainer Capable of training a WordPiece model