From 352c92ad33a3dad0eafde3c45b09657e22daa9e9 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 17 Nov 2020 21:13:00 +0100 Subject: [PATCH] Automatically stubbing the `pyi` files while keeping inspecting ability (#509) * First pass on automatic stubbing our python files. * And now modifying all rust docs to be visible in Pyi files. * Better assert fail message. * Fixing github workflow. * Removing types not exported anymore. * Fixing `Tokenizer` signature. * Disabling auto __init__.py. * Re-enabling some types. * Don't overwrite non automated __init__.py * Automated most __init__.py * Restubbing after rebase. * Fixing env for tests. * Install blakc in the env. * Use PY35 target in stub.py Co-authored-by: Anthony MOI --- .github/workflows/python.yml | 34 +- bindings/python/Makefile | 2 + .../python/py_src/tokenizers/__init__.pyi | 1604 ++++++++--------- .../py_src/tokenizers/decoders/__init__.pyi | 122 +- .../py_src/tokenizers/models/__init__.py | 9 +- .../py_src/tokenizers/models/__init__.pyi | 246 ++- .../tokenizers/normalizers/__init__.pyi | 308 +++- .../tokenizers/pre_tokenizers/__init__.py | 13 +- .../tokenizers/pre_tokenizers/__init__.pyi | 293 +-- .../py_src/tokenizers/processors/__init__.py | 3 +- .../py_src/tokenizers/processors/__init__.pyi | 208 ++- .../py_src/tokenizers/trainers/__init__.py | 3 +- .../py_src/tokenizers/trainers/__init__.pyi | 230 ++- bindings/python/src/decoders.rs | 35 + bindings/python/src/encoding.rs | 20 +- bindings/python/src/models.rs | 95 +- bindings/python/src/normalizers.rs | 57 + bindings/python/src/pre_tokenizers.rs | 68 + bindings/python/src/processors.rs | 119 ++ bindings/python/src/tokenizer.rs | 42 +- bindings/python/src/trainers.rs | 90 + bindings/python/src/utils/normalization.rs | 68 + bindings/python/src/utils/pretokenization.rs | 74 + bindings/python/src/utils/regex.rs | 2 + bindings/python/stub.py | 192 ++ 25 files changed, 2511 insertions(+), 1426 deletions(-) create mode 100644 bindings/python/stub.py diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index b7baee2f..5bc55889 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -11,26 +11,6 @@ on: - bindings/node/** jobs: - code_quality: - name: Check Code Quality - runs-on: ubuntu-latest - steps: - - name: Checkout repository - uses: actions/checkout@v1 - - - name: Install Python - uses: actions/setup-python@v1 - with: - python-version: 3.6 - architecture: "x64" - - - name: Install dependencies - run: pip install black==20.8b1 - - - name: Check style - working-directory: ./bindings/python - run: make check-style - build_win_32: name: Check it builds for Windows 32-bit runs-on: windows-latest @@ -115,11 +95,23 @@ jobs: python-version: 3.6 architecture: "x64" - - name: Run tests + - name: Install working-directory: ./bindings/python run: | python -m venv .env source .env/bin/activate pip install pytest requests setuptools_rust numpy python setup.py develop + + - name: Check style + working-directory: ./bindings/python + run: | + source .env/bin/activate + pip install black==20.8b1 + make check-style + + - name: Run tests + working-directory: ./bindings/python + run: | + source .env/bin/activate make test diff --git a/bindings/python/Makefile b/bindings/python/Makefile index f9d4e04c..7ad70943 100644 --- a/bindings/python/Makefile +++ b/bindings/python/Makefile @@ -6,10 +6,12 @@ dir_guard=@mkdir -p $(@D) # Format source code automatically style: + python stub.py black --line-length 100 --target-version py35 examples py_src/tokenizers 
tests # Check the source code is formatted correctly check-style: + python stub.py --check black --check --line-length 100 --target-version py35 examples py_src/tokenizers tests TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi index 4331b572..118f42e0 100644 --- a/bindings/python/py_src/tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/__init__.pyi @@ -1,541 +1,6 @@ -from .decoders import * -from .models import * -from .normalizers import * -from .pre_tokenizers import * -from .processors import * -from .trainers import * - -from .implementations import ( - ByteLevelBPETokenizer as ByteLevelBPETokenizer, - CharBPETokenizer as CharBPETokenizer, - SentencePieceBPETokenizer as SentencePieceBPETokenizer, - BertWordPieceTokenizer as BertWordPieceTokenizer, -) - -from typing import Optional, Union, List, Tuple, Callable -from enum import Enum - -Offsets = Tuple[int, int] - -TextInputSequence = str -PreTokenizedInputSequence = Union[List[str], Tuple[str]] -TextEncodeInput = Union[TextInputSequence, Tuple[TextInputSequence, TextInputSequence]] -PreTokenizedEncodeInput = Union[ - PreTokenizedInputSequence, - Tuple[PreTokenizedInputSequence, PreTokenizedInputSequence], -] - -InputSequence = Union[TextInputSequence, PreTokenizedInputSequence] -EncodeInput = Union[TextEncodeInput, PreTokenizedEncodeInput] - -class OffsetReferential(Enum): - ORIGINAL = "original" - NORMALIZED = "normalized" - -class OffsetType(Enum): - BYTE = "byte" - CHAR = "char" - -class SplitDelimiterBehavior(Enum): - REMOVED = "removed" - ISOLATED = "isolated" - MERGED_WITH_PREVIOUS = "merged_with_previous" - MERGED_WITH_NEXT = "merged_with_next" - CONTIGUOUS = "contiguous" - -class Token: - id: int - token: str - offsets: Offsets - -Split = Tuple[str, Offsets, List[Token]] -Range = Union[int, Tuple[int, int], slice] -Pattern = Union[str, Regex] - -class PreTokenizedString: - """PreTokenizedString - - Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the - underlying string, while keeping track of the alignment information (offsets). - - The PreTokenizedString manages what we call `splits`. Each split represents a substring - which is a subpart of the original string, with the relevant offsets and tokens. - - When calling one of the methods used to modify the PreTokenizedString (namely one of - `split`, `normalize` or `tokenize), only the `splits` that don't have any associated - tokens will get modified. - """ - - def __new__(sequence: str) -> PreTokenizedString: - """Instantiate a new PreTokenizedString using the given str - - Args: - sequence: str: - The string sequence used to initialize this PreTokenizedString - """ - pass - def split(self, func: Callable[[index, NormalizedString], List[NormalizedString]]): - """Split the PreTokenizedString using the given `func` - - Args: - func: Callable[[index, NormalizedString], List[NormalizedString]]: - The function used to split each underlying split. - It is expected to return a list of `NormalizedString`, that represent the new - splits. If the given `NormalizedString` does not need any splitting, we can - just return it directly. - In order for the offsets to be tracked accurately, any returned `NormalizedString` - should come from calling either `.split` or `.slice` on the received one. 
- """ - pass - def normalize(self, func: Callable[[NormalizedString], None]): - """Normalize each split of the `PreTokenizedString` using the given `func` - - Args: - func: Callable[[NormalizedString], None]: - The function used to normalize each underlying split. This function - does not need to return anything, just calling the methods on the provided - NormalizedString allow its modification. - """ - pass - def tokenize(self, func: Callable[[str], List[Token]]): - """Tokenize each split of the `PreTokenizedString` using the given `func` - - Args: - func: Callable[[str], List[Token]]: - The function used to tokenize each underlying split. This function must return - a list of Token generated from the input str. - """ - pass - def to_encoding(self, type_id: int = 0, word_idx: Optional[int] = None) -> Encoding: - """Return an Encoding generated from this PreTokenizedString - - Args: - type_id: int = 0: - The type_id to be used on the generated Encoding. - - word_idx: Optional[int] = None: - An optional word index to be used for each token of this Encoding. If provided, - all the word indices in the generated Encoding will use this value, instead - of the one automatically tracked during pre-tokenization. - - Returns: - An Encoding - """ - pass - def get_splits( - self, - offset_referential: OffsetReferential = OffsetReferential.ORIGINAL, - offset_type: OffsetType = OffsetType.CHAR, - ) -> List[Split]: - """Get the splits currently managed by the PreTokenizedString - - Args: - offset_referential: OffsetReferential: - Whether the returned splits should have offsets expressed relative - to the original string, or the normalized one. - - offset_type: OffsetType: - Whether the returned splits should have offsets expressed in bytes or chars. - When slicing an str, we usually want to use chars, which is the default value. - Now in some cases it might be interesting to get these offsets expressed in bytes, - so it is possible to change this here. - - Returns - A list of splits - """ - pass - -class NormalizedString: - """NormalizedString - - A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one. - While making all the requested modifications, it keeps track of the alignment information - between the two versions of the string. 
- """ - - def __new__(sequence: str) -> NormalizedString: - """Instantiate a new NormalizedString using the given str - - Args: - sequence: str: - The string sequence used to initialize this NormalizedString - """ - pass - @property - def normalized(self) -> str: - """ The normalized part of the string """ - pass - @property - def original(self) -> str: - """ The original part of the string """ - pass - def nfd(self): - """ Runs the NFD normalization """ - pass - def nfkd(self): - """ Runs the NFKD normalization """ - pass - def nfc(self): - """ Runs the NFC normalization """ - pass - def nfkc(self): - """ Runs the NFKC normalization """ - pass - def lowercase(self): - """ Lowercase the string """ - pass - def uppercase(self): - """ Uppercase the string """ - pass - def prepend(self, s: str): - """ Prepend the given sequence to the string """ - pass - def append(self, s: str): - """ Append the given sequence to the string """ - pass - def lstrip(self): - """ Strip the left of the string """ - pass - def rstrip(self): - """ Strip the right of the string """ - pass - def strip(self): - """ Strip both ends of the string """ - pass - def clear(self): - """ Clear the string """ - pass - def slice(self, range: Range) -> Optional[NormalizedString]: - """ Slice the string using the given range """ - pass - def filter(self, func: Callable[[str], bool]): - """ Filter each character of the string using the given func """ - pass - def for_each(self, func: Callable[[str], None]): - """ Calls the given function for each character of the string """ - pass - def map(self, func: Callable[[str], str]): - """Calls the given function for each character of the string - - Replaces each character of the string using the returned value. Each - returned value **must** be a str of length 1 (ie a character). - """ - pass - def split(self, pattern: Pattern, behavior: SplitDelimiterBehavior) -> List[NormalizedString]: - """Split the NormalizedString using the given pattern and the specified behavior - - Args: - pattern: Pattern: - A pattern used to split the string. Usually a string or a Regex - - behavior: SplitDelimiterBehavior: - The behavior to use when splitting - - Returns: - A list of NormalizedString, representing each split - """ - pass - def replace(self, pattern: Pattern, content: str): - """Replace the content of the given pattern with the provided content - - Args: - pattern: Pattern: - A pattern used to match the string. Usually a string or a Regex - - content: str: - The content to be used as replacement - """ - pass - -class Regex: - """ A Regex """ - - def __new__(pattern: str) -> Regex: - """ Instantiate a new Regex with the given pattern """ - pass - -class Encoding: - """ - The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`. 
- """ - - @staticmethod - def merge(encodings: List[Encoding], growing_offsets: bool = True) -> Encoding: - """Merge the list of encodings into one final :class:`~tokenizers.Encoding` - - Args: - encodings (A :obj:`List` of :class:`~tokenizers.Encoding`): - The list of encodings that should be merged in one - - growing_offsets (:obj:`bool`, defaults to :obj:`True`): - Whether the offsets should accumulate while merging - - Returns: - :class:`~tokenizers.Encoding`: The resulting Encoding - """ - pass - @property - def n_sequences(self) -> int: - """The number of sequences represented - - Returns: - :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding` - """ - pass - def set_sequence_id(self, sequence_index: int): - """Set the given sequence index - - Set the given sequence index for the whole range of tokens contained in this - :class:`~tokenizers.Encoding`. - """ - pass - @property - def ids(self) -> List[int]: - """The generated IDs - - The IDs are the main input to a Language Model. They are the token indices, - the numerical representations that a LM understands. - - Returns: - :obj:`List[int]`: The list of IDs - """ - pass - @property - def tokens(self) -> List[str]: - """The generated tokens - - They are the string representation of the IDs. - - Returns: - :obj:`List[str]`: The list of tokens - """ - pass - @property - def words(self) -> List[Optional[int]]: - """The generated word indices. - - They represent the index of the word associated to each token. - When the input is pre-tokenized, they correspond to the ID of the given input label, - otherwise they correspond to the words indices as defined by the - :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used. - - For special tokens and such (any token that was generated from something that was - not part of the input), the output is :obj:`None` - - Returns: - A :obj:`List` of :obj:`Optional[int]`: A list of optional word index. - """ - pass - @property - def sequences(self) -> List[Optional[int]]: - """The generated sequence indices. - - They represent the index of the input sequence associated to each token. - The sequence id can be None if the token is not related to any input sequence, - like for example with special tokens. - - Returns: - A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index. - """ - @property - def type_ids(self) -> List[int]: - """The generated type IDs - - Generally used for tasks like sequence classification or question answering, - these tokens let the LM know which input sequence corresponds to each tokens. - - Returns: - :obj:`List[int]`: The list of type ids - """ - pass - @property - def offsets(self) -> List[Offsets]: - """The offsets associated to each token - - These offsets let's you slice the input string, and thus retrieve the original - part that led to producing the corresponding token. - - Returns: - A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets - """ - pass - @property - def special_tokens_mask(self) -> List[int]: - """The special token mask - - This indicates which tokens are special tokens, and which are not. - - Returns: - :obj:`List[int]`: The special tokens mask - """ - pass - @property - def attention_mask(self) -> List[int]: - """The attention mask - - This indicates to the LM which tokens should be attended to, and which should not. - This is especially important when batching sequences, where we need to applying - padding. 
- - Returns: - :obj:`List[int]`: The attention mask - """ - pass - @property - def overflowing(self) -> Optional[Encoding]: - """A :obj:`List` of overflowing :class:`~tokenizers.Encoding` - - When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting - the output into as many pieces as required to match the specified maximum length. - This field lets you retrieve all the subsequent pieces. - - When you use pairs of sequences, the overflowing pieces will contain enough - variations to cover all the possible combinations, while respecting the provided - maximum length. - """ - pass - def word_to_tokens(self, word_index: int, sequence_index: int = 0) -> Optional[Tuple[int, int]]: - """Get the encoded tokens corresponding to the word at the given index - in one of the input sequences. - - Args: - word_index (:obj:`int`): - The index of a word in one of the input sequences. - sequence_index (:obj:`int`, defaults to :obj:`0`): - The index of the sequence that contains the target word - - Returns: - :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` - """ - pass - def word_to_chars(self, word_index: int, sequence_index: int = 0) -> Optional[Offsets]: - """Get the offsets of the word at the given index in one of the input sequences. - - Args: - word_index (:obj:`int`): - The index of a word in one of the input sequences. - sequence_index (:obj:`int`, defaults to :obj:`0`): - The index of the sequence that contains the target word - - Returns: - :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` - """ - pass - def token_to_sequence(self, token_index: int) -> Optional[int]: - """Get the index of the sequence represented by the given token. - - In the general use case, this method returns :obj:`0` for a single sequence or - the first sequence of a pair, and :obj:`1` for the second sequence of a pair - - Args: - token_index (:obj:`int`): - The index of a token in the encoded sequence. - - Returns: - :obj:`int`: The sequence id of the given token - """ - pass - def token_to_chars(self, token_index: int) -> Optional[Offsets]: - """Get the offsets of the token at the given index. - - The returned offsets are related to the input sequence that contains the - token. In order to determine in which input sequence it belongs, you - must call :meth:`~tokenizers.Encoding.token_to_sequence()`. - - Args: - token_index (:obj:`int`): - The index of a token in the encoded sequence. - - Returns: - :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` - """ - pass - def token_to_word(self, token_index: int) -> Optional[int]: - """Get the index of the word that contains the token in one of the input sequences. - - The returned word index is related to the input sequence that contains - the token. In order to determine in which input sequence it belongs, you - must call :meth:`~tokenizers.Encoding.token_to_sequence()`. - - Args: - token_index (:obj:`int`): - The index of a token in the encoded sequence. - - Returns: - :obj:`int`: The index of the word in the relevant input sequence. - """ - pass - def char_to_token(self, pos: int, sequence_index: int = 0) -> Optional[int]: - """Get the token that contains the char at the given position in the input sequence. 
- - Args: - char_pos (:obj:`int`): - The position of a char in the input string - sequence_index (:obj:`int`, defaults to :obj:`0`): - The index of the sequence that contains the target char - - Returns: - :obj:`int`: The index of the token that contains this char in the encoded sequence - """ - pass - def char_to_word(self, pos: int, sequence_index: int = 0) -> Optional[int]: - """Get the word that contains the char at the given position in the input sequence. - - Args: - char_pos (:obj:`int`): - The position of a char in the input string - sequence_index (:obj:`int`, defaults to :obj:`0`): - The index of the sequence that contains the target char - - Returns: - :obj:`int`: The index of the word that contains this char in the input sequence - """ - pass - def pad( - self, - length: int, - pad_id: Optional[int] = 0, - pad_type_id: Optional[int] = 0, - pad_token: Optional[str] = "[PAD]", - direction: Optional[str] = "right", - ): - """Pad the :class:`~tokenizers.Encoding` at the given length - - Args: - length (:obj:`int`): - The desired length - - direction: (:obj:`str`, defaults to :obj:`right`): - The expected padding direction. Can be either :obj:`right` or :obj:`left` - - pad_id (:obj:`int`, defaults to :obj:`0`): - The ID corresponding to the padding token - - pad_type_id (:obj:`int`, defaults to :obj:`0`): - The type ID corresponding to the padding token - - pad_token (:obj:`str`, defaults to `[PAD]`): - The pad token to use - """ - pass - def truncate(self, max_length: int, stride: Optional[int] = 0): - """Truncate the :class:`~tokenizers.Encoding` at the given length - - If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating - this information is lost. It will be considered as representing a single sequence. - - Args: - max_length (:obj:`int`): - The desired length - - stride (:obj:`int`, defaults to :obj:`0`): - The length of previous content to be included in each overflowing piece - """ - pass - +# Generated content DO NOT EDIT class AddedToken: - """AddedToken - + """ Represents a token that can be be added to a :class:`~tokenizers.Tokenizer`. It can have special options that defines the way it should behave. @@ -565,239 +30,671 @@ class AddedToken: text. For example, with the added token ``"yesterday"``, and a normalizer in charge of lowercasing the text, the token could be extract from the input ``"I saw a lion Yesterday"``. + """ - def __new__( - cls, - content: str = "", - single_word: bool = False, - lstrip: bool = False, - rstrip: bool = False, - normalized: bool = True, - ) -> AddedToken: - """Instantiate a new AddedToken - - Args: - content (:obj:`str`): The content of the token - - single_word (:obj:`bool`, defaults to :obj:`False`): - Defines whether this token should only match single words. If :obj:`True`, this - token will never match inside of a word. For example the token ``ing`` would match - on ``tokenizing`` if this option is :obj:`False`, but not if it is :obj:`True`. - The notion of "`inside of a word`" is defined by the word boundaries pattern in - regular expressions (ie. the token should start and end with word boundaries). - - lstrip (:obj:`bool`, defaults to :obj:`False`): - Defines whether this token should strip all potential whitespaces on its left side. - If :obj:`True`, this token will greedily match any whitespace on its left. For - example if we try to match the token ``[MASK]`` with ``lstrip=True``, in the text - ``"I saw a [MASK]"``, we would match on ``" [MASK]"``. (Note the space on the left). 
- - rstrip (:obj:`bool`, defaults to :obj:`False`): - Defines whether this token should strip all potential whitespaces on its right - side. If :obj:`True`, this token will greedily match any whitespace on its right. - It works just like :obj:`lstrip` but on the right. - - normalized (:obj:`bool`, defaults to :obj:`True` with :meth:`~tokenizers.Tokenizer.add_tokens` and :obj:`False` with :meth:`~tokenizers.Tokenizer.add_special_tokens`): - Defines whether this token should match against the normalized version of the input - text. For example, with the added token ``"yesterday"``, and a normalizer in charge of - lowercasing the text, the token could be extract from the input ``"I saw a lion - Yesterday"``. + def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True): + pass + @property + def content(self): + """ + Get the content of this :obj:`AddedToken` + """ + pass + @property + def lstrip(self): + """ + Get the value of the :obj:`lstrip` option + """ + pass + @property + def normalized(self): + """ + Get the value of the :obj:`normalized` option + """ + pass + @property + def rstrip(self): + """ + Get the value of the :obj:`rstrip` option + """ + pass + @property + def single_word(self): + """ + Get the value of the :obj:`single_word` option """ pass -class Tokenizer: - """Tokenizer +class Encoding: + """ + The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`. + """ + @property + def attention_mask(self): + """ + The attention mask + + This indicates to the LM which tokens should be attended to, and which should not. + This is especially important when batching sequences, where we need to applying + padding. + + Returns: + :obj:`List[int]`: The attention mask + """ + pass + def char_to_token(self, char_pos, sequence_index=0): + """ + Get the token that contains the char at the given position in the input sequence. + + Args: + char_pos (:obj:`int`): + The position of a char in the input string + sequence_index (:obj:`int`, defaults to :obj:`0`): + The index of the sequence that contains the target char + + Returns: + :obj:`int`: The index of the token that contains this char in the encoded sequence + """ + pass + def char_to_word(self, char_pos, sequence_index=0): + """ + Get the word that contains the char at the given position in the input sequence. + + Args: + char_pos (:obj:`int`): + The position of a char in the input string + sequence_index (:obj:`int`, defaults to :obj:`0`): + The index of the sequence that contains the target char + + Returns: + :obj:`int`: The index of the word that contains this char in the input sequence + """ + pass + @property + def ids(self): + """ + The generated IDs + + The IDs are the main input to a Language Model. They are the token indices, + the numerical representations that a LM understands. 
+ + Returns: + :obj:`List[int]`: The list of IDs + """ + pass + @staticmethod + def merge(encodings, growing_offsets=True): + """ + Merge the list of encodings into one final :class:`~tokenizers.Encoding` + + Args: + encodings (A :obj:`List` of :class:`~tokenizers.Encoding`): + The list of encodings that should be merged in one + + growing_offsets (:obj:`bool`, defaults to :obj:`True`): + Whether the offsets should accumulate while merging + + Returns: + :class:`~tokenizers.Encoding`: The resulting Encoding + """ + pass + @property + def n_sequences(self): + """ + The number of sequences represented + + Returns: + :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding` + """ + pass + @property + def offsets(self): + """ + The offsets associated to each token + + These offsets let's you slice the input string, and thus retrieve the original + part that led to producing the corresponding token. + + Returns: + A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets + """ + pass + @property + def overflowing(self): + """ + A :obj:`List` of overflowing :class:`~tokenizers.Encoding` + + When using truncation, the :class:`~tokenizers.Tokenizer` takes care of splitting + the output into as many pieces as required to match the specified maximum length. + This field lets you retrieve all the subsequent pieces. + + When you use pairs of sequences, the overflowing pieces will contain enough + variations to cover all the possible combinations, while respecting the provided + maximum length. + """ + pass + def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"): + """ + Pad the :class:`~tokenizers.Encoding` at the given length + + Args: + length (:obj:`int`): + The desired length + + direction: (:obj:`str`, defaults to :obj:`right`): + The expected padding direction. Can be either :obj:`right` or :obj:`left` + + pad_id (:obj:`int`, defaults to :obj:`0`): + The ID corresponding to the padding token + + pad_type_id (:obj:`int`, defaults to :obj:`0`): + The type ID corresponding to the padding token + + pad_token (:obj:`str`, defaults to `[PAD]`): + The pad token to use + """ + pass + @property + def sequences(self): + """ + The generated sequence indices. + + They represent the index of the input sequence associated to each token. + The sequence id can be None if the token is not related to any input sequence, + like for example with special tokens. + + Returns: + A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index. + """ + pass + def set_sequence_id(self, sequence_id): + """ + Set the given sequence index + + Set the given sequence index for the whole range of tokens contained in this + :class:`~tokenizers.Encoding`. + """ + pass + @property + def special_tokens_mask(self): + """ + The special token mask + + This indicates which tokens are special tokens, and which are not. + + Returns: + :obj:`List[int]`: The special tokens mask + """ + pass + def token_to_chars(self, token_index): + """ + Get the offsets of the token at the given index. + + The returned offsets are related to the input sequence that contains the + token. In order to determine in which input sequence it belongs, you + must call :meth:`~tokenizers.Encoding.token_to_sequence()`. + + Args: + token_index (:obj:`int`): + The index of a token in the encoded sequence. 
+ + Returns: + :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` + """ + pass + def token_to_sequence(self, token_index): + """ + Get the index of the sequence represented by the given token. + + In the general use case, this method returns :obj:`0` for a single sequence or + the first sequence of a pair, and :obj:`1` for the second sequence of a pair + + Args: + token_index (:obj:`int`): + The index of a token in the encoded sequence. + + Returns: + :obj:`int`: The sequence id of the given token + """ + pass + def token_to_word(self, token_index): + """ + Get the index of the word that contains the token in one of the input sequences. + + The returned word index is related to the input sequence that contains + the token. In order to determine in which input sequence it belongs, you + must call :meth:`~tokenizers.Encoding.token_to_sequence()`. + + Args: + token_index (:obj:`int`): + The index of a token in the encoded sequence. + + Returns: + :obj:`int`: The index of the word in the relevant input sequence. + """ + pass + @property + def tokens(self): + """ + The generated tokens + + They are the string representation of the IDs. + + Returns: + :obj:`List[str]`: The list of tokens + """ + pass + def truncate(self, max_length, stride=0): + """ + Truncate the :class:`~tokenizers.Encoding` at the given length + + If this :class:`~tokenizers.Encoding` represents multiple sequences, when truncating + this information is lost. It will be considered as representing a single sequence. + + Args: + max_length (:obj:`int`): + The desired length + + stride (:obj:`int`, defaults to :obj:`0`): + The length of previous content to be included in each overflowing piece + """ + pass + @property + def type_ids(self): + """ + The generated type IDs + + Generally used for tasks like sequence classification or question answering, + these tokens let the LM know which input sequence corresponds to each tokens. + + Returns: + :obj:`List[int]`: The list of type ids + """ + pass + def word_to_chars(self, word_index, sequence_index=0): + """ + Get the offsets of the word at the given index in one of the input sequences. + + Args: + word_index (:obj:`int`): + The index of a word in one of the input sequences. + sequence_index (:obj:`int`, defaults to :obj:`0`): + The index of the sequence that contains the target word + + Returns: + :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` + """ + pass + def word_to_tokens(self, word_index, sequence_index=0): + """ + Get the encoded tokens corresponding to the word at the given index + in one of the input sequences. + + Args: + word_index (:obj:`int`): + The index of a word in one of the input sequences. + sequence_index (:obj:`int`, defaults to :obj:`0`): + The index of the sequence that contains the target word + + Returns: + :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` + """ + pass + @property + def words(self): + """ + The generated word indices. + + They represent the index of the word associated to each token. + When the input is pre-tokenized, they correspond to the ID of the given input label, + otherwise they correspond to the words indices as defined by the + :class:`~tokenizers.pre_tokenizers.PreTokenizer` that was used. + + For special tokens and such (any token that was generated from something that was + not part of the input), the output is :obj:`None` + + Returns: + A :obj:`List` of :obj:`Optional[int]`: A list of optional word index. 
+ """ + pass + +class NormalizedString: + """ + NormalizedString + + A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one. + While making all the requested modifications, it keeps track of the alignment information + between the two versions of the string. + + Args: + sequence: str: + The string sequence used to initialize this NormalizedString + """ + + def append(self, s): + """ + Append the given sequence to the string + """ + pass + def clear(self): + """ + Clears the string + """ + pass + def filter(self, func): + """ + Filter each character of the string using the given func + """ + pass + def for_each(self, func): + """ + Calls the given function for each character of the string + """ + pass + def lowercase(self): + """ + Lowercase the string + """ + pass + def lstrip(self): + """ + Strip the left of the string + """ + pass + def map(self, func): + """ + Calls the given function for each character of the string + + Replaces each character of the string using the returned value. Each + returned value **must** be a str of length 1 (ie a character). + """ + pass + def nfc(self): + """ + Runs the NFC normalization + """ + pass + def nfd(self): + """ + Runs the NFD normalization + """ + pass + def nfkc(self): + """ + Runs the NFKC normalization + """ + pass + def nfkd(self): + """ + Runs the NFKD normalization + """ + pass + @property + def normalized(self): + """ + The normalized part of the string + """ + pass + def prepend(self, s): + """ + Prepend the given sequence to the string + """ + pass + def replace(self, pattern, content): + """ + Replace the content of the given pattern with the provided content + + Args: + pattern: Pattern: + A pattern used to match the string. Usually a string or a Regex + + content: str: + The content to be used as replacement + """ + pass + def rstrip(self): + """ + Strip the right of the string + """ + pass + def slice(self, range): + """ + Slice the string using the given range + """ + pass + def split(self, pattern, behavior): + """ + Split the NormalizedString using the given pattern and the specified behavior + + Args: + pattern: Pattern: + A pattern used to split the string. Usually a string or a Regex + + behavior: SplitDelimiterBehavior: + The behavior to use when splitting. + Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", + "contiguous" + + Returns: + A list of NormalizedString, representing each split + """ + pass + def strip(self): + """ + Strip both ends of the string + """ + pass + def uppercase(self): + """ + Uppercase the string + """ + pass + +class PreTokenizedString: + """ + PreTokenizedString + + Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the + underlying string, while keeping track of the alignment information (offsets). + + The PreTokenizedString manages what we call `splits`. Each split represents a substring + which is a subpart of the original string, with the relevant offsets and tokens. + + When calling one of the methods used to modify the PreTokenizedString (namely one of + `split`, `normalize` or `tokenize), only the `splits` that don't have any associated + tokens will get modified. 
+ + Args: + sequence: str: + The string sequence used to initialize this PreTokenizedString + """ + + def __init__(self, sequence): + pass + def get_splits(self, offset_referential="original", offset_type="char"): + """ + Get the splits currently managed by the PreTokenizedString + + Args: + offset_referential: :obj:`str` + Whether the returned splits should have offsets expressed relative + to the original string, or the normalized one. choices: "original", "normalized". + + offset_type: :obj:`str` + Whether the returned splits should have offsets expressed in bytes or chars. + When slicing an str, we usually want to use chars, which is the default value. + Now in some cases it might be interesting to get these offsets expressed in bytes, + so it is possible to change this here. + choices: "char", "bytes" + + Returns + A list of splits + """ + pass + def normalize(self, func): + """ + Normalize each split of the `PreTokenizedString` using the given `func` + + Args: + func: Callable[[NormalizedString], None]: + The function used to normalize each underlying split. This function + does not need to return anything, just calling the methods on the provided + NormalizedString allow its modification. + """ + pass + def split(self, func): + """ + Split the PreTokenizedString using the given `func` + + Args: + func: Callable[[index, NormalizedString], List[NormalizedString]]: + The function used to split each underlying split. + It is expected to return a list of `NormalizedString`, that represent the new + splits. If the given `NormalizedString` does not need any splitting, we can + just return it directly. + In order for the offsets to be tracked accurately, any returned `NormalizedString` + should come from calling either `.split` or `.slice` on the received one. + """ + pass + def to_encoding(self, type_id=0, word_idx=None): + """ + Return an Encoding generated from this PreTokenizedString + + Args: + type_id: int = 0: + The type_id to be used on the generated Encoding. + + word_idx: Optional[int] = None: + An optional word index to be used for each token of this Encoding. If provided, + all the word indices in the generated Encoding will use this value, instead + of the one automatically tracked during pre-tokenization. + + Returns: + An Encoding + """ + pass + def tokenize(self, func): + """ + Tokenize each split of the `PreTokenizedString` using the given `func` + + Args: + func: Callable[[str], List[Token]]: + The function used to tokenize each underlying split. This function must return + a list of Token generated from the input str. + """ + pass + +class Regex: + """ + Instantiate a new Regex with the given pattern + """ + + def __init__(self, pattern): + pass + +class Token: + pass + +class Tokenizer: + """ A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input and outputs an :class:`~tokenizers.Encoding`. Args: model (:class:`~tokenizers.models.Model`): The core algorithm that this :obj:`Tokenizer` should be using. + """ - def __new__(cls, model: models.Model) -> Tokenizer: - """Instantiate a new Tokenizer using the given Model + def __init__(self, model): + pass + def add_special_tokens(self, tokens): + """ + Add the given special tokens to the Tokenizer. - A :obj:`Tokenizer` works as a pipeline. It processes some raw text as input - and outputs an :class:`~tokenizers.Encoding`. + If these tokens are already part of the vocabulary, it just let the Tokenizer know about + them. If they don't exist, the Tokenizer creates them, giving them a new id. 
+ + These special tokens will never be processed by the model (ie won't be split into + multiple tokens), and they can be removed from the output when decoding. Args: - model (:class:`~tokenizers.models.Model`): - The core algorithm that this :obj:`Tokenizer` should be using. + tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`): + The list of special tokens we want to add to the vocabulary. Each token can either + be a string or an instance of :class:`~tokenizers.AddedToken` for more + customization. Returns: - Tokenizer + :obj:`int`: The number of tokens that were created in the vocabulary """ pass - @staticmethod - def from_str(s: str) -> Tokenizer: - """Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string. + def add_tokens(self, tokens): + """ + Add the given tokens to the vocabulary + + The given tokens are added only if they don't already exist in the vocabulary. + Each token then gets a new attributed id. Args: - json (:obj:`str`): - A valid JSON string representing a previously serialized - :class:`~tokenizers.Tokenizer` + tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`): + The list of tokens we want to add to the vocabulary. Each token can be either a + string or an instance of :class:`~tokenizers.AddedToken` for more customization. Returns: - :class:`~tokenizers.Tokenizer`: The new tokenizer + :obj:`int`: The number of tokens that were created in the vocabulary """ pass - @staticmethod - def from_file(path: str) -> Tokenizer: - """Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path. + def decode(self, ids, skip_special_tokens=True): + """ + Decode the given list of ids back to a string + + This is used to decode anything coming back from a Language Model Args: - path (:obj:`str`): - A path to a local JSON file representing a previously serialized - :class:`~tokenizers.Tokenizer` + ids (A :obj:`List/Tuple` of :obj:`int`): + The list of ids that we want to decode + + skip_special_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether the special tokens should be removed from the decoded string Returns: - :class:`~tokenizers.Tokenizer`: The new tokenizer + :obj:`str`: The decoded string """ pass - @staticmethod - def from_buffer(buffer: bytes) -> Tokenizer: - """Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer. + def decode_batch(self, sequences, skip_special_tokens=True): + """ + Decode a batch of ids back to their corresponding string Args: - buffer (:obj:`bytes`): - A buffer containing a previously serialized :class:`~tokenizers.Tokenizer` + sequences (:obj:`List` of :obj:`List[int]`): + The batch of sequences we want to decode + + skip_special_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether the special tokens should be removed from the decoded strings Returns: - :class:`~tokenizers.Tokenizer`: The new tokenizer - """ - pass - def to_str(self, pretty: bool = False) -> str: - """Gets a serialized string representing this :class:`~tokenizers.Tokenizer`. - - Args: - pretty (:obj:`bool`, defaults to :obj:`False`): - Whether the JSON string should be pretty formatted. - - Returns: - :obj:`str`: A string representing the serialized Tokenizer - """ - pass - def save(self, path: str, pretty: bool = False): - """Save the :class:`~tokenizers.Tokenizer` to the file at the given path. - - Args: - path (:obj:`str`): - A path to a file in which to save the serialized tokenizer. 
- - pretty (:obj:`bool`, defaults to :obj:`False`): - Whether the JSON file should be pretty formatted. + :obj:`List[str]`: A list of decoded strings """ pass @property - def model(self) -> Model: - """ Get the model in use with this Tokenizer """ - pass - @model.setter - def model(self, model: models.Model): - """ Change the model to use with this Tokenizer """ - pass - @property - def pre_tokenizer(self) -> Optional[PreTokenizer]: - """ Get the pre-tokenizer in use with this model """ - pass - @pre_tokenizer.setter - def pre_tokenizer(self, pre_tokenizer: pre_tokenizers.PreTokenizer): - """ Change the pre tokenizer to use with this Tokenizer """ - pass - @property - def decoder(self) -> Optional[Decoder]: - """ Get the decoder in use with this model """ - pass - @decoder.setter - def decoder(self, decoder: decoders.Decoder): - """ Change the decoder to use with this Tokenizer """ - pass - @property - def post_processor(self) -> Optional[PostProcessor]: - """ Get the post-processor in use with this Tokenizer """ - pass - @post_processor.setter - def post_processor(self, processor: processors.PostProcessor): - """ Change the post processor to use with this Tokenizer """ - @property - def normalizer(self) -> Optional[Normalizer]: - """ Get the normalizer in use with this Tokenizer """ - pass - @normalizer.setter - def normalizer(self, normalizer: normalizers.Normalizer): - """ Change the normalizer to use with this Tokenizer """ - def num_special_tokens_to_add(self, is_pair: bool) -> int: + def decoder(self): """ - Return the number of special tokens that would be added for single/pair sentences. - :param is_pair: Boolean indicating if the input would be a single sentence or a pair - :return: - """ - pass - def get_vocab(self, with_added_tokens: bool = True) -> Dict[str, int]: - """Get the underlying vocabulary - - Args: - with_added_tokens (:obj:`bool`, defaults to :obj:`True`): - Whether to include the added tokens - - Returns: - :obj:`Dict[str, int]`: The vocabulary - """ - pass - def get_vocab_size(self, with_added_tokens: bool = True) -> int: - """Get the size of the underlying vocabulary - - Args: - with_added_tokens (:obj:`bool`, defaults to :obj:`True`): - Whether to include the added tokens - - Returns: - :obj:`int`: The size of the vocabulary - """ - pass - def enable_truncation(self, max_length: int, stride: Optional[int], strategy: Optional[str]): - """Enable truncation - - Args: - max_length (:obj:`int`): - The max length at which to truncate - - stride (:obj:`int`, `optional`): - The length of the previous first sequence to be included in the overflowing - sequence - - strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`): - The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or - ``only_second``. 
- """ - pass - def no_truncation(self): - """ Disable truncation """ - pass - @property - def truncation(self) -> Optional[dict]: - """Get the currently set truncation parameters - - `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead` - - Returns: - (:obj:`dict`, `optional`): - A dict with the current truncation parameters if truncation is enabled + The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer """ pass def enable_padding( self, - direction: Optional[str] = "right", - pad_to_multiple_of: Optional[int] = None, - pad_id: Optional[int] = 0, - pad_type_id: Optional[int] = 0, - pad_token: Optional[str] = "[PAD]", - length: Optional[int] = None, + direction="right", + pad_id=0, + pad_type_id=0, + pad_token="[PAD]", + length=None, + pad_to_multiple_of=None, ): - """Enable the padding + """ + Enable the padding Args: direction (:obj:`str`, `optional`, defaults to :obj:`right`): @@ -822,28 +719,26 @@ class Tokenizer: the longest sequence in a batch. """ pass - def no_padding(self): - """ Disable padding """ - pass - @property - def padding(self) -> Optional[dict]: - """Get the current padding parameters + def enable_truncation(self, max_length, stride=0, strategy="longest_first"): + """ + Enable truncation - `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead` + Args: + max_length (:obj:`int`): + The max length at which to truncate - Returns: - (:obj:`dict`, `optional`): - A dict with the current padding parameters if padding is enabled + stride (:obj:`int`, `optional`): + The length of the previous first sequence to be included in the overflowing + sequence + + strategy (:obj:`str`, `optional`, defaults to :obj:`longest_first`): + The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or + ``only_second``. """ pass - def encode( - self, - sequence: InputSequence, - pair: Optional[InputSequence], - is_pretokenized: bool = False, - add_special_tokens: bool = True, - ) -> Encoding: - """Encode the given sequence and pair. This method can process raw text sequences + def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True): + """ + Encode the given sequence and pair. This method can process raw text sequences as well as already pre-tokenized sequences. Example: @@ -876,15 +771,12 @@ class Tokenizer: Returns: :class:`~tokenizers.Encoding`: The encoded result + """ pass - def encode_batch( - self, - inputs: List[EncodeInput], - is_pretokenized: bool = False, - add_special_tokens: bool = True, - ) -> List[Encoding]: - """Encode the given batch of inputs. This method accept both raw text sequences + def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True): + """ + Encode the given batch of inputs. This method accept both raw text sequences as well as already pre-tokenized sequences. Example: @@ -917,51 +809,74 @@ class Tokenizer: """ pass - def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str: - """Decode the given list of ids back to a string - - This is used to decode anything coming back from a Language Model + @staticmethod + def from_buffer(buffer): + """ + Instantiate a new :class:`~tokenizers.Tokenizer` from the given buffer. 
Args: - ids (A :obj:`List/Tuple` of :obj:`int`): - The list of ids that we want to decode - - skip_special_tokens (:obj:`bool`, defaults to :obj:`True`): - Whether the special tokens should be removed from the decoded string + buffer (:obj:`bytes`): + A buffer containing a previously serialized :class:`~tokenizers.Tokenizer` Returns: - :obj:`str`: The decoded string + :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass - def decode_batch( - self, sequences: List[List[int]], skip_special_tokens: Optional[bool] = True - ) -> str: - """Decode a batch of ids back to their corresponding string + @staticmethod + def from_file(path): + """ + Instantiate a new :class:`~tokenizers.Tokenizer` from the file at the given path. Args: - sequences (:obj:`List` of :obj:`List[int]`): - The batch of sequences we want to decode - - skip_special_tokens (:obj:`bool`, defaults to :obj:`True`): - Whether the special tokens should be removed from the decoded strings + path (:obj:`str`): + A path to a local JSON file representing a previously serialized + :class:`~tokenizers.Tokenizer` Returns: - :obj:`List[str]`: A list of decoded strings + :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass - def token_to_id(self, token: str) -> Optional[int]: - """Convert the given token to its corresponding id if it exists + @staticmethod + def from_str(json): + """ + Instantiate a new :class:`~tokenizers.Tokenizer` from the given JSON string. Args: - token (:obj:`str`): - The token to convert + json (:obj:`str`): + A valid JSON string representing a previously serialized + :class:`~tokenizers.Tokenizer` Returns: - :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary + :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass - def id_to_token(self, id: int) -> Optional[str]: - """Convert the given id to its corresponding token if it exists + def get_vocab(self, with_added_tokens=True): + """ + Get the underlying vocabulary + + Args: + with_added_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether to include the added tokens + + Returns: + :obj:`Dict[str, int]`: The vocabulary + """ + pass + def get_vocab_size(self, with_added_tokens=True): + """ + Get the size of the underlying vocabulary + + Args: + with_added_tokens (:obj:`bool`, defaults to :obj:`True`): + Whether to include the added tokens + + Returns: + :obj:`int`: The size of the vocabulary + """ + pass + def id_to_token(self, id): + """ + Convert the given id to its corresponding token if it exists Args: id (:obj:`int`): @@ -971,47 +886,50 @@ class Tokenizer: :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary """ pass - def add_tokens(self, tokens: List[Union[str, AddedToken]]) -> int: - """Add the given tokens to the vocabulary - - The given tokens are added only if they don't already exist in the vocabulary. - Each token then gets a new attributed id. - - Args: - tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`): - The list of tokens we want to add to the vocabulary. Each token can be either a - string or an instance of :class:`~tokenizers.AddedToken` for more customization. - - Returns: - :obj:`int`: The number of tokens that were created in the vocabulary + @property + def model(self): + """ + The :class:`~tokenizers.models.Model` in use by the Tokenizer """ pass - def add_special_tokens(self, tokens: List[Union[str, AddedToken]]) -> int: - """Add the given special tokens to the Tokenizer. 
- - If these tokens are already part of the vocabulary, it just let the Tokenizer know about - them. If they don't exist, the Tokenizer creates them, giving them a new id. - - These special tokens will never be processed by the model (ie won't be split into - multiple tokens), and they can be removed from the output when decoding. - - Args: - tokens (A :obj:`List` of :class:`~tokenizers.AddedToken` or :obj:`str`): - The list of special tokens we want to add to the vocabulary. Each token can either - be a string or an instance of :class:`~tokenizers.AddedToken` for more - customization. - - Returns: - :obj:`int`: The number of tokens that were created in the vocabulary + def no_padding(self): + """ + Disable padding """ pass - def post_process( - self, - encoding: Encoding, - pair: Optional[Encoding] = None, - add_special_tokens: bool = True, - ) -> Encoding: - """Apply all the post-processing steps to the given encodings. + def no_truncation(self): + """ + Disable truncation + """ + pass + @property + def normalizer(self): + """ + The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer + """ + pass + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + :param is_pair: Boolean indicating if the input would be a single sentence or a pair + :return: + """ + pass + @property + def padding(self): + """ + Get the current padding parameters + + `Cannot be set, use` :meth:`~tokenizers.Tokenizer.enable_padding` `instead` + + Returns: + (:obj:`dict`, `optional`): + A dict with the current padding parameters if padding is enabled + """ + pass + def post_process(self, encoding, pair=None, add_special_tokens=True): + """ + Apply all the post-processing steps to the given encodings. The various steps are: @@ -1035,3 +953,63 @@ class Tokenizer: :class:`~tokenizers.Encoding`: The final post-processed encoding """ pass + @property + def post_processor(self): + """ + The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer + """ + pass + @property + def pre_tokenizer(self): + """ + The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer + """ + pass + def save(self, pretty=False): + """ + Save the :class:`~tokenizers.Tokenizer` to the file at the given path. + + Args: + path (:obj:`str`): + A path to a file in which to save the serialized tokenizer. + + pretty (:obj:`bool`, defaults to :obj:`False`): + Whether the JSON file should be pretty formatted. + """ + pass + def to_str(self, pretty=False): + """ + Gets a serialized string representing this :class:`~tokenizers.Tokenizer`. + + Args: + pretty (:obj:`bool`, defaults to :obj:`False`): + Whether the JSON string should be pretty formatted. 
+ + Returns: + :obj:`str`: A string representing the serialized Tokenizer + """ + pass + def token_to_id(self, token): + """ + Convert the given token to its corresponding id if it exists + + Args: + token (:obj:`str`): + The token to convert + + Returns: + :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary + """ + pass + @property + def truncation(self): + """ + Get the currently set truncation parameters + + `Cannot set, use` :meth:`~tokenizers.Tokenizer.enable_truncation` `instead` + + Returns: + (:obj:`dict`, `optional`): + A dict with the current truncation parameters if truncation is enabled + """ + pass diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi index 8b0e1f34..84f1b352 100644 --- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi +++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi @@ -1,65 +1,87 @@ -from typing import List - +# Generated content DO NOT EDIT class Decoder: - """Base class for all decoders + """ + Base class for all decoders This class is not supposed to be instantiated directly. Instead, any implementation of a Decoder will return an instance of this class when instantiated. """ - def decode(self, tokens: List[str]) -> str: - """ Decode the given list of string to a final string """ - pass - -class ByteLevel(Decoder): - """ ByteLevel Decoder """ - - def __init__(self) -> None: - """ Instantiate a new ByteLevel Decoder """ - pass - -class WordPiece(Decoder): - """ WordPiece Decoder """ - - @staticmethod - def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder: - """Instantiate a new WordPiece Decoder - - Args: - prefix: str: - The prefix to use for subwords that are not a beginning-of-word - cleanup: bool: - Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, - and some abbreviated english forms. + def decode(self, tokens): """ - pass - -class Metaspace(Decoder): - """ Metaspace decoder """ - - def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None: - """Instantiate a new Metaspace - - Args: - replacement: str: - The replacement character. Must be exactly one character. By default we - use the `▁` (U+2581) meta symbol (Same as in SentencePiece). - - add_prefix_space: boolean: - Whether to add a space to the first word if there isn't already one. This - lets us treat `hello` exactly like `say hello`. + Decode the given list of string to a final string """ pass class BPEDecoder(Decoder): - """ BPEDecoder """ + """ + Instantiate a new BPEDecoder - def __init__(self, suffix: str = "") -> None: - """Instantiate a new BPEDecoder + Args: + suffix: str: + The suffix that was used to caracterize an end-of-word. This suffix will + be replaced by whitespaces during the decoding + """ - Args: - suffix: str: - The suffix that was used to caracterize an end-of-word. This suffix will - be replaced by whitespaces during the decoding + def __init__(self, suffix=""): + pass + def decode(self, tokens): + """ + Decode the given list of string to a final string + """ + pass + +class ByteLevel(Decoder): + """ + ByteLevel Decoder + """ + + def __init__(self): + pass + def decode(self, tokens): + """ + Decode the given list of string to a final string + """ + pass + +class Metaspace(Decoder): + """ + Instantiate a new Metaspace + + Args: + replacement: str: + The replacement character. Must be exactly one character. By default we + use the `▁` (U+2581) meta symbol (Same as in SentencePiece). 
+ + add_prefix_space: boolean: + Whether to add a space to the first word if there isn't already one. This + lets us treat `hello` exactly like `say hello`. + """ + + def __init__(self, replacement="▁", add_prefix_space=True): + pass + def decode(self, tokens): + """ + Decode the given list of string to a final string + """ + pass + +class WordPiece(Decoder): + """ + Instantiate a new WordPiece Decoder + + Args: + prefix: str: + The prefix to use for subwords that are not a beginning-of-word + cleanup: bool: + Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, + and some abbreviated english forms. + """ + + def __init__(self, prefix="##", cleanup=True): + pass + def decode(self, tokens): + """ + Decode the given list of string to a final string """ pass diff --git a/bindings/python/py_src/tokenizers/models/__init__.py b/bindings/python/py_src/tokenizers/models/__init__.py index ea74edfb..68ac211a 100644 --- a/bindings/python/py_src/tokenizers/models/__init__.py +++ b/bindings/python/py_src/tokenizers/models/__init__.py @@ -1,9 +1,8 @@ -from typing import List, Tuple - -from .. import models, Offsets +# Generated content DO NOT EDIT +from .. import models Model = models.Model BPE = models.BPE -WordPiece = models.WordPiece -WordLevel = models.WordLevel Unigram = models.Unigram +WordLevel = models.WordLevel +WordPiece = models.WordPiece diff --git a/bindings/python/py_src/tokenizers/models/__init__.pyi b/bindings/python/py_src/tokenizers/models/__init__.pyi index 2b49496c..3aead6f1 100644 --- a/bindings/python/py_src/tokenizers/models/__init__.pyi +++ b/bindings/python/py_src/tokenizers/models/__init__.pyi @@ -1,34 +1,37 @@ -from .. import Encoding, Offsets, Token -from typing import List, Optional, Union, Tuple, Dict - +# Generated content DO NOT EDIT class Model: - """Base class for all models - - This class is not supposed to be instantiated directly. Instead, any implementation of - a Model will return a instance of this class when instantiated. + """ + A Model represents some tokenization algorithm like BPE or Word + This class cannot be constructed directly. Please use one of the concrete models. """ - def tokenize(self, sequence: str) -> List[Token]: - """ Tokenize the given sequence """ + def id_to_token(self, id): + """ + Returns the token associated with the given id + """ pass - def token_to_id(self, token: str) -> Optional[int]: - """ Returns the id associated with the given token """ - pass - def id_to_token(self, id: int) -> Optional[str]: - """ Returns the token associated with the given id """ - pass - def save(self, folder: str, name: Optional[str] = None) -> List[str]: - """Save the current model + def save(self, folder, name): + """ + Save the current model Save the current model in the given folder, using the given name for the various files that will get created. Any file with the same name that already exist in this folder will be overwritten. """ pass + def token_to_id(self, tokens): + """ + Returns the id associated with the given token + """ + pass + def tokenize(self, tokens): + """ + Tokenize the given sequence + """ + pass class BPE(Model): - """BytePairEncoding model class - + """ Instantiate a BPE Model from the given vocab and merges. 
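Before moving on to the models stub, here is a tiny illustrative example (not part of the diff) of the `Decoder.decode` API listed above; the expected outputs are approximate:

# Illustrative sketch only.
from tokenizers.decoders import WordPiece, Metaspace

wp = WordPiece(prefix="##", cleanup=True)
print(wp.decode(["un", "##affa", "##ble"]))    # roughly "unaffable"

ms = Metaspace(replacement="▁", add_prefix_space=True)
print(ms.decode(["▁Hello", "▁world"]))         # roughly "Hello world"
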
Args: @@ -61,21 +64,18 @@ class BPE(Model): def __init__( self, - vocab: Optional[Union[str, Dict[str, int]]], - merges: Optional[Union[str, List[Tuple[str, str]]]], - cache_capacity: Optional[int], - dropout: Optional[float], - unk_token: Optional[str], - continuing_subword_prefix: Optional[str], - end_of_word_suffix: Optional[str], - fuse_unk: Optional[bool], + vocab=None, + merges=None, + cache_capacity=None, + dropout=None, + unk_token=None, + continuing_subword_prefix=None, + end_of_word_suffix=None, + fuse_unk=None, ): pass @staticmethod - def read_file(vocab_filename: str, merges_filename: str) -> Tuple[Vocab, Merges]: - pass - @staticmethod - def from_file(vocab_filename: str, merges_filename: str, **kwargs) -> BPE: + def from_file(vocab_filename, merge_filename, **kwargs): """ Convenient method to intialize a BPE from files Roughly equivalent to @@ -85,42 +85,73 @@ class BPE(Model): return BPE(vocab, merges, **kwargs) """ pass + def id_to_token(self, id): + """ + Returns the token associated with the given id + """ + pass + @staticmethod + def read_file(self, vocab_filename, merges_filename): + """ + Read a vocab_filename and merge_filename and stores result in memory + """ + pass + def save(self, folder, name): + """ + Save the current model -class WordPiece(Model): - """WordPiece model class + Save the current model in the given folder, using the given name for the various + files that will get created. + Any file with the same name that already exist in this folder will be overwritten. + """ + pass + def token_to_id(self, tokens): + """ + Returns the id associated with the given token + """ + pass + def tokenize(self, tokens): + """ + Tokenize the given sequence + """ + pass - Instantiate a WordPiece Model from the given vocab file. +class Unigram(Model): + """ + UnigramEncoding model class - Args: - vocab: (`optional`) string: - A dictionnary of string keys and their ids {"am": 0,...} + Instantiate a Unigram Model from the given model file. - unk_token: (`optional`) str: - The unknown token to be used by the model. + Args: + vocab: ('`optional`) string: + A list of vocabulary items and their relative score [("am", -0.2442),...] - max_input_chars_per_word: (`optional`) int: - The maximum number of characters to authorize in a single word. """ - def __init__( - self, - vocab: Optional[Union[str, Dict[str, int]]], - unk_token: Optional[str], - max_input_chars_per_word: Optional[int], - ): + def __init__(self, vocab): pass - @staticmethod - def read_file(vocab_filename: str) -> Vocab: - pass - @staticmethod - def from_file(vocab_filename: str, **kwargs) -> WordPiece: + def id_to_token(self, id): """ - Convenient method to intialize a WordPiece from file - Roughly equivalent to + Returns the token associated with the given id + """ + pass + def save(self, folder, name): + """ + Save the current model - def from_file(vocab_filename, **kwargs): - vocab, merges = WordPiece.read_file(vocab_filename) - return WordPiece(vocab, **kwargs) + Save the current model in the given folder, using the given name for the various + files that will get created. + Any file with the same name that already exist in this folder will be overwritten. + """ + pass + def token_to_id(self, tokens): + """ + Returns the id associated with the given token + """ + pass + def tokenize(self, tokens): + """ + Tokenize the given sequence """ pass @@ -138,34 +169,89 @@ class WordLevel(Model): The unknown token to be used by the model. 
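To ground the `BPE.from_file` helper documented above, here is an illustrative sketch (not part of the diff); "vocab.json" and "merges.txt" are placeholder paths to existing BPE files:

# Illustrative sketch only; the vocab/merges paths are placeholders.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace

# Equivalent to: vocab, merges = BPE.read_file("vocab.json", "merges.txt"); BPE(vocab, merges, ...)
bpe = BPE.from_file("vocab.json", "merges.txt", unk_token="[UNK]")

print(bpe.token_to_id("hello"))    # id for the token, or None if it is out of vocabulary
print(bpe.id_to_token(0))          # token string stored at id 0

tokenizer = Tokenizer(bpe)
tokenizer.pre_tokenizer = Whitespace()
print(tokenizer.encode("hello world").tokens)
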
""" - def __init__(self, vocab: Optional[Union[str, Dict[str, int]]], unk_token: Optional[str]): + def __init__(self, vocab, unk_token): pass - @staticmethod - def read_file(vocab_filename: str) -> Vocab: - pass - @staticmethod - def from_file(vocab_filename: str, **kwargs) -> WordLevelg: + def id_to_token(self, id): """ - Convenient method to intialize a WordLevelg from file + Returns the token associated with the given id + """ + pass + def save(self, folder, name): + """ + Save the current model + + Save the current model in the given folder, using the given name for the various + files that will get created. + Any file with the same name that already exist in this folder will be overwritten. + """ + pass + def token_to_id(self, tokens): + """ + Returns the id associated with the given token + """ + pass + def tokenize(self, tokens): + """ + Tokenize the given sequence + """ + pass + +class WordPiece(Model): + """ + WordPiece model + Instantiate a WordPiece Model from the given vocab file. + + Args: + vocab: (`optional`) string: + A dictionnary of string keys and their ids {"am": 0,...} + + unk_token: (`optional`) str: + The unknown token to be used by the model. + + max_input_chars_per_word: (`optional`) int: + The maximum number of characters to authorize in a single word. + """ + + def __init__(self, vocab, unk_token, max_input_chars_per_word): + pass + @staticmethod + def from_file(vocab_filename, merge_filename, **kwargs): + """ + Convenient method to intialize a WordPiece from files Roughly equivalent to def from_file(vocab_filename, **kwargs): - vocab, merges = WordLevelg.read_file(vocab_filename) - return WordLevelg(vocab, **kwargs) + vocab = WordPiece.read_file(vocab_filename) + return WordPiece(vocab, **kwargs) """ pass - -class Unigram(Model): - """UnigramEncoding model class - - Instantiate a Unigram Model from the given model file. - - Args: - vocab: ('`optional`) string: - A list of vocabulary items and their relative score [("am", -0.2442),...] - - """ - - @staticmethod - def __init__(self, vocab: Optional[List[Tuple[str, float]]]): + def id_to_token(self, id): + """ + Returns the token associated with the given id + """ + pass + @staticmethod + def read_file(vocab_filename): + """ + Read a vocab_filename and stores result in memory + """ + pass + def save(self, folder, name): + """ + Save the current model + + Save the current model in the given folder, using the given name for the various + files that will get created. + Any file with the same name that already exist in this folder will be overwritten. + """ + pass + def token_to_id(self, tokens): + """ + Returns the id associated with the given token + """ + pass + def tokenize(self, tokens): + """ + Tokenize the given sequence + """ pass diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi index 19d43889..1b9f8169 100644 --- a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi @@ -1,140 +1,258 @@ -from .. import NormalizedString -from typing import Optional, List - +# Generated content DO NOT EDIT class Normalizer: - """Base class for all normalizers + """ + Base class for all normalizers This class is not supposed to be instantiated directly. Instead, any implementation of a Normalizer will return an instance of this class when instantiated. 
""" - def normalize(self, normalized: NormalizedString): - """ Normalize the given NormalizedString in-place """ + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ pass - def normalize_str(self, sequence: str) -> str: - """ Normalize the given str """ - pass - -class BertNormalizer(Normalizer): - """BertNormalizer - - Takes care of normalizing raw text before giving it to a Bert model. - This includes cleaning the text, handling accents, chinese chars and lowercasing - """ - - def __init__( - self, - clean_text: Optional[bool] = True, - handle_chinese_chars: Optional[bool] = True, - strip_accents: Optional[bool] = None, - lowercase: Optional[bool] = True, - ) -> None: - """Instantiate a BertNormalizer with the given options. - - Args: - clean_text: (`optional`) boolean: - Whether to clean the text, by removing any control characters - and replacing all whitespaces by the classic one. - - handle_chinese_chars: (`optional`) boolean: - Whether to handle chinese chars by putting spaces around them. - - strip_accents: (`optional`) boolean: - Whether to strip all accents. If this option is not specified (ie == None), - then it will be determined by the value for `lowercase` (as in the original Bert). - - lowercase: (`optional`) boolean: - Whether to lowercase. - - Returns: - Normalizer + def normalize_str(self, sequence): + """ + Normalize the given str """ pass -class NFD(Normalizer): - """ NFD Unicode Normalizer """ +class BertNormalizer(Normalizer): + """ + BertNormalizer - def __init__(self) -> None: - """ Instantiate a new NFD Normalizer """ - pass + Takes care of normalizing raw text before giving it to a Bert model. + This includes cleaning the text, handling accents, chinese chars and lowercasing -class NFKD(Normalizer): - """ NFKD Unicode Normalizer """ + Args: + clean_text: (`optional`) boolean: + Whether to clean the text, by removing any control characters + and replacing all whitespaces by the classic one. - def __init__(self) -> None: - """ Instantiate a new NFKD Normalizer """ - pass + handle_chinese_chars: (`optional`) boolean: + Whether to handle chinese chars by putting spaces around them. -class NFC(Normalizer): - """ NFC Unicode Normalizer """ + strip_accents: (`optional`) boolean: + Whether to strip all accents. If this option is not specified (ie == None), + then it will be determined by the value for `lowercase` (as in the original Bert). - def __init__(self) -> None: - """ Instantiate a new NFC Normalizer """ - pass + lowercase: (`optional`) boolean: + Whether to lowercase. -class NFKC(Normalizer): - """ NFKC Unicode Normalizer """ - - def __init__(self) -> None: - """ Instantiate a new NFKC Normalizer """ - pass - -class Sequence(Normalizer): - """Allows concatenating multiple other Normalizer as a Sequence. 
- - All the normalizers run in sequence in the given order + Returns: + Normalizer """ - def __init__(self, normalizers: List[Normalizer]) -> None: - """Instantiate a new normalization Sequence using the given normalizers - - Args: - normalizers: List[Normalizer]: - A list of Normalizer to be run as a sequence + def __init__( + self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True + ): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str """ pass class Lowercase(Normalizer): - """ Lowercase Normalizer """ + """ + Lowercase Normalizer + """ - def __init__(self) -> None: - """ Instantiate a new Lowercase Normalizer """ + def __init__(self): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ pass -class Strip(Normalizer): - """ Strip normalizer """ +class NFC(Normalizer): + """ + NFC Unicode Normalizer + """ - def __init__(self, left: bool = True, right: bool = True) -> Normalizer: + def __init__(self): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ pass -class StripAccents(Normalizer): - """ StripAccents normalizer """ +class NFD(Normalizer): + """ + NFD Unicode Normalizer + """ - def __init__(self) -> Normalizer: + def __init__(self): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ + pass + +class NFKC(Normalizer): + """ + NFKC Unicode Normalizer + """ + + def __init__(self): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ + pass + +class NFKD(Normalizer): + """ + NFKD Unicode Normalizer + """ + + def __init__(self): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ pass class Nmt(Normalizer): - """ Nmt normalizer """ + """ + Nmt normalizer + """ - def __init__(self) -> Normalizer: + def __init__(self): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ pass class Precompiled(Normalizer): - """ Precompiled normalizer """ + """ + Precompiled normalizer + Don't use manually it is used for compatiblity for SentencePiece. 
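The normalizer classes above compose naturally; an illustrative sketch (not part of the diff):

# Illustrative sketch only.
from tokenizers.normalizers import Sequence, NFD, StripAccents, Lowercase

normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
print(normalizer.normalize_str("Héllò Wörld"))   # -> "hello world"
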
+ """ - def __init__(self, precompiled_charsmap: bytes) -> Normalizer: + def __init__(self, precompiled_charsmap): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ pass class Replace(Normalizer): - """ Replace normalizer """ + """ + Replace normalizer + """ - def __init__(self, pattern: str, content: str) -> Normalizer: + def __init__(self, pattern, content): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ pass -def unicode_normalizer_from_str(normalizer: str) -> Normalizer: +class Sequence(Normalizer): """ - Instanciate unicode normalizer from the normalizer name - :param normalizer: Name of the normalizer - :return: + Allows concatenating multiple other Normalizer as a Sequence. + All the normalizers run in sequence in the given order + + Args: + normalizers: List[Normalizer]: + A list of Normalizer to be run as a sequence """ - pass + + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ + pass + +class Strip(Normalizer): + """ + Strip normalizer + """ + + def __init__(self, left=True, right=True): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ + pass + +class StripAccents(Normalizer): + def __init__(self): + pass + def normalize(self, normalized): + """ + Normalize the given NormalizedString in-place + """ + pass + def normalize_str(self, sequence): + """ + Normalize the given str + """ + pass diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py index 2a4e22a1..10a5eb1d 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py @@ -1,13 +1,14 @@ +# Generated content DO NOT EDIT from .. import pre_tokenizers PreTokenizer = pre_tokenizers.PreTokenizer -ByteLevel = pre_tokenizers.ByteLevel -Whitespace = pre_tokenizers.Whitespace -Punctuation = pre_tokenizers.Punctuation -Sequence = pre_tokenizers.Sequence -WhitespaceSplit = pre_tokenizers.WhitespaceSplit BertPreTokenizer = pre_tokenizers.BertPreTokenizer -Metaspace = pre_tokenizers.Metaspace +ByteLevel = pre_tokenizers.ByteLevel CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit Digits = pre_tokenizers.Digits +Metaspace = pre_tokenizers.Metaspace +Punctuation = pre_tokenizers.Punctuation +Sequence = pre_tokenizers.Sequence UnicodeScripts = pre_tokenizers.UnicodeScripts +Whitespace = pre_tokenizers.Whitespace +WhitespaceSplit = pre_tokenizers.WhitespaceSplit diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index 931e2d3a..bda0ab33 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -1,163 +1,242 @@ -from .. 
import PreTokenizedString -from typing import Optional, List, Tuple - -Offsets = Tuple[int, int] - +# Generated content DO NOT EDIT class PreTokenizer: - """Base class for all pre-tokenizers + """ + Base class for all pre-tokenizers This class is not supposed to be instantiated directly. Instead, any implementation of a PreTokenizer will return an instance of this class when instantiated. """ - def pre_tokenize(self, pretokenized: PreTokenizedString): - """ Pre tokenize the given PreTokenizedString in-place """ + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ pass - def pre_tokenize_str(self, sequence: str) -> List[Tuple[str, Offsets]]: - """ Pre tokenize the given sequence """ + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ + pass + +class BertPreTokenizer(PreTokenizer): + """ + BertPreTokenizer + + This pre-tokenizer splits tokens on spaces, and also on punctuation. + Each occurence of a punctuation character will be treated separately. + """ + + def __init__(self): + pass + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ pass class ByteLevel(PreTokenizer): - """ByteLevel PreTokenizer + """ + ByteLevel PreTokenizer This pre-tokenizer takes care of replacing all bytes of the given string with a corresponding representation, as well as splitting into words. + + Args: + add_prefix_space: (`optional`) boolean: + Whether to add a space to the first word if there isn't already one. This + lets us treat `hello` exactly like `say hello`. + Returns: + PreTokenizer """ - def __init__(self, add_prefix_space: bool = True) -> None: - """Instantiate a new ByteLevel PreTokenizer - Args: - add_prefix_space: (`optional`) boolean: - Whether to add a space to the first word if there isn't already one. This - lets us treat `hello` exactly like `say hello`. - Returns: - PreTokenizer - """ + def __init__(self, add_prefix_space=True): pass @staticmethod - def alphabet() -> List[str]: - """Returns the alphabet used by this PreTokenizer. + def alphabet(): + """ + Returns the alphabet used by this PreTokenizer. Since the ByteLevel works as its name suggests, at the byte level, it encodes any byte to one visible character. This means that there is a total of 256 different characters composing this alphabet. """ pass - -class Whitespace(PreTokenizer): - """Whitespace PreTokenizer - - This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` - """ - - def __init__(self) -> None: - """ Instantiate a new Whitespace PreTokenizer """ + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ pass - -class WhitespaceSplit(PreTokenizer): - """Whitespace PreTokenizer - - This pre-tokenizer simply splits on the whitespace. Works like `.split()` - """ - - def __init__(self) -> None: - """ Instantiate a new WhitespaceSplit PreTokenizer """ - pass - -class BertPreTokenizer(PreTokenizer): - """BertPreTokenizer - - This pre-tokenizer splits tokens on spaces, and also on punctuation. - Each occurence of a punctuation character will be treated separately. - """ - - def __init__(self) -> None: - """ Instantiate a new BertPreTokenizer """ - pass - -class Metaspace(PreTokenizer): - """Metaspace pre-tokenizer - - This pre-tokenizer replaces any whitespace by the provided replacement character. - It then tries to split on these spaces. 
- """ - - def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None: - """Instantiate a new Metaspace - - Args: - replacement: str: - The replacement character. Must be exactly one character. By default we - use the `▁` (U+2581) meta symbol (Same as in SentencePiece). - - add_prefix_space: boolean: - Whether to add a space to the first word if there isn't already one. This - lets us treat `hello` exactly like `say hello`. + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence """ pass class CharDelimiterSplit(PreTokenizer): - """CharDelimiterSplit PreTokenizer - + """ This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)` + + Args: + delimiter: str: + The delimiter char that will be used to split input """ - @staticmethod - def __init__(self, delimiter: str) -> None: - """Instantiate a new CharDelimiterSplit PreTokenizer + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ + pass - Args: - delimiter: str: - The delimiter char that will be used to split input +class Digits(PreTokenizer): + """ + This pre-tokenizer simply splits using the digits in separate tokens + Args: + individual_digits: bool: + If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please" + If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please" + """ + + def __init__(self, individual_digits=False): + pass + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ + pass + +class Metaspace(PreTokenizer): + """ + Metaspace pre-tokenizer + + This pre-tokenizer replaces any whitespace by the provided replacement character. + It then tries to split on these spaces. + Args: + replacement: str: + The replacement character. Must be exactly one character. By default we + use the `▁` (U+2581) meta symbol (Same as in SentencePiece). + + add_prefix_space: boolean: + Whether to add a space to the first word if there isn't already one. This + lets us treat `hello` exactly like `say hello`. 
+ """ + + def __init__(self, replacement="▁", add_prefix_space=True): + pass + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence """ pass class Punctuation(PreTokenizer): - """Punctuation PreTokenizer - + """ This pre-tokenizer simply splits on punctuation as individual characters.` """ - def __init__(self) -> None: - """ Instantiate a new Punctuation PreTokenizer """ + def __init__(self): + pass + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ pass class Sequence(PreTokenizer): - """Sequence PreTokenizer - - This pre-tokenizer composes other pre_tokenizers and applies them in sequence` + """ + This pre-tokenizer composes other pre_tokenizers and applies them in sequence """ - def __init__(self) -> None: - """ Instantiate a new Sequence PreTokenizer """ + def __init__(self, pretokenizers): pass - -class Digits(PreTokenizer): - """Digits PreTokenizer - - This pre-tokenizer simply splits using the digits in separate tokens - """ - - def __init__(self, individual_digits: bool) -> None: - """Instantiate a new Digits - - Args: - individual_digits: bool: - If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please" - If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please" - + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence """ pass class UnicodeScripts(PreTokenizer): - """UnicodeScripts PreTokenizer - + """ This pre-tokenizer splits on characters that belong to different language family It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too. This mimicks SentencePiece Unigram implementation. """ - def __init__(self) -> None: - """ Instantiate a new UnicodeScripts """ + def __init__(self): + pass + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ + pass + +class Whitespace(PreTokenizer): + """ + This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` + """ + + def __init__(self): + pass + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ + pass + +class WhitespaceSplit(PreTokenizer): + """ + This pre-tokenizer simply splits on the whitespace. Works like `.split()` + """ + + def __init__(self): + pass + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ pass diff --git a/bindings/python/py_src/tokenizers/processors/__init__.py b/bindings/python/py_src/tokenizers/processors/__init__.py index ec00a6a0..a9897c46 100644 --- a/bindings/python/py_src/tokenizers/processors/__init__.py +++ b/bindings/python/py_src/tokenizers/processors/__init__.py @@ -1,7 +1,8 @@ +# Generated content DO NOT EDIT from .. 
import processors PostProcessor = processors.PostProcessor BertProcessing = processors.BertProcessing -RobertaProcessing = processors.RobertaProcessing ByteLevel = processors.ByteLevel +RobertaProcessing = processors.RobertaProcessing TemplateProcessing = processors.TemplateProcessing diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi index 77edae46..d31167a9 100644 --- a/bindings/python/py_src/tokenizers/processors/__init__.pyi +++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi @@ -1,53 +1,85 @@ -from .. import Encoding -from typing import Tuple, Union, List - +# Generated content DO NOT EDIT class PostProcessor: - """Base class for all post-processors + """ + Base class for all post-processors This class is not supposed to be instantiated directly. Instead, any implementation of a PostProcessor will return an instance of this class when instantiated. """ - def num_special_tokens_to_add(self, is_pair: bool) -> int: + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. :param is_pair: Boolean indicating if the input would be a single sentence or a pair :return: """ pass - def process( - self, encoding: Encoding, pair: Optional[Encoding] = None, add_special_tokens: bool = True - ) -> Encoding: - """ Post-process the given encodings, generating the final one """ + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one + """ pass class BertProcessing(PostProcessor): - """BertProcessing - + """ This post-processor takes care of adding the special tokens needed by a Bert model: - a SEP token - a CLS token + Args: + sep: Tuple[str, int]: + A tuple with the string representation of the SEP token, and its id + + cls: Tuple[str, int]: + A tuple with the string representation of the CLS token, and its id + + Returns: + PostProcessor """ - def __init__(self, sep: Tuple[str, int], cls: Tuple[str, int]) -> None: - """Instantiate a new BertProcessing with the given tokens + def __init__(self, sep, cls): + pass + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + :param is_pair: Boolean indicating if the input would be a single sentence or a pair + :return: + """ + pass + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one + """ + pass - Args: - sep: Tuple[str, int]: - A tuple with the string representation of the SEP token, and its id +class ByteLevel(PostProcessor): + """ + This post-processor takes care of trimming the offsets. + By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't + want the offsets to include these whitespaces, then this PostProcessor must be used. - cls: Tuple[str, int]: - A tuple with the string representation of the CLS token, and its id + Args: + trim_offsets: bool: + Whether to trim the whitespaces from the produced offsets. + """ - Returns: - PostProcessor + def __init__(self, trim_offsets=True): + pass + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. 
+ :param is_pair: Boolean indicating if the input would be a single sentence or a pair + :return: + """ + pass + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one """ pass class RobertaProcessing(PostProcessor): - """RobertaProcessing - + """ This post-processor takes care of adding the special tokens needed by a Roberta model: - a SEP token @@ -57,59 +89,41 @@ class RobertaProcessing(PostProcessor): By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't want the offsets to include these whitespaces, then this PostProcessor should be initialized with `trim_offsets=True` + Args: + sep: Tuple[str, int]: + A tuple with the string representation of the SEP token, and its id + + cls: Tuple[str, int]: + A tuple with the string representation of the CLS token, and its id + + trim_offsets: bool: + Whether to trim the whitespaces from the produced offsets. + + add_prefix_space: bool: + Whether the add_prefix_space option was enabled during pre-tokenization. This + is relevant because it defines the way the offsets are trimmed out. + + Returns: + PostProcessor """ - def __init__( - self, - sep: Tuple[str, int], - cls: Tuple[str, int], - trim_offsets: bool = True, - add_prefix_space: bool = True, - ) -> None: - """Instantiate a new RobertaProcessing with the given tokens - - Args: - sep: Tuple[str, int]: - A tuple with the string representation of the SEP token, and its id - - cls: Tuple[str, int]: - A tuple with the string representation of the CLS token, and its id - - trim_offsets: bool: - Whether to trim the whitespaces from the produced offsets. - - add_prefix_space: bool: - Whether the add_prefix_space option was enabled during pre-tokenization. This - is relevant because it defines the way the offsets are trimmed out. - - Returns: - PostProcessor + def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True): + pass + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + :param is_pair: Boolean indicating if the input would be a single sentence or a pair + :return: """ pass - -class ByteLevel(PostProcessor): - """ByteLevel Post processing - - This post-processor takes care of trimming the offsets. - By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't - want the offsets to include these whitespaces, then this PostProcessor must be used. - """ - - def __init__(self, trim_offsets: bool = True) -> None: - """Instantiate a new ByteLevel - - Args: - trim_offsets: bool: - Whether to trim the whitespaces from the produced offsets. + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one """ pass -Template = Union[str, List[str]] -Tokens = List[Union[Tuple[int, str], Tuple[str, int], dict]] - class TemplateProcessing(PostProcessor): - """TemplateProcessing - + """ Provides a way to specify templates in order to add the special tokens to each input sequence as relevant. @@ -147,32 +161,42 @@ class TemplateProcessing(PostProcessor): will be added to the Encoding without any further check. If the given ids correspond to something totally different in a `Tokenizer` using this `PostProcessor`, it might lead to unexpected results. 
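Since the template syntax is easier to read than to describe, here is an illustrative sketch (not part of the diff); the special-token ids 1 and 2 are placeholders that must match the tokenizer's actual vocabulary:

# Illustrative sketch only; the ids 1 and 2 are placeholders.
from tokenizers.processors import TemplateProcessing

post = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
print(post.num_special_tokens_to_add(True))   # 3: one [CLS] plus two [SEP] for a pair
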
+ + Args: + single: Template + The template used for single sequences + + pair: Template: + The template used when both sequences are specified + + special_tokens: Tokens: + The list of special tokens used in each sequences + + Template: Union[str, List[str]]: + - If a `str` is provided, the whitespace is used as delimiter between tokens + - If a `List[str]` is provided, a list of tokens + + Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]: + - A Tuple with both a token and its associated ID, in any order + - A dict with the following keys: + - "id": str => The special token id, as specified in the Template + - "ids": List[int] => The associated IDs + - "tokens": List[str] => The associated tokens + The given dict expects the provided `ids` and `tokens` lists to have + the same length. """ - def __init__(self, single: Template, pair: Template, special_tokens: Tokens) -> None: - """Instantiate a new TemplateProcessing - - Args: - single: Template - The template used for single sequences - - pair: Template: - The template used when both sequences are specified - - special_tokens: Tokens: - The list of special tokens used in each sequences - - Template: Union[str, List[str]]: - - If a `str` is provided, the whitespace is used as delimiter between tokens - - If a `List[str]` is provided, a list of tokens - - Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]: - - A Tuple with both a token and its associated ID, in any order - - A dict with the following keys: - - "id": str => The special token id, as specified in the Template - - "ids": List[int] => The associated IDs - - "tokens": List[str] => The associated tokens - The given dict expects the provided `ids` and `tokens` lists to have - the same length. + def __init__(self, single, pair, special_tokens): + pass + def num_special_tokens_to_add(self, is_pair): + """ + Return the number of special tokens that would be added for single/pair sentences. + :param is_pair: Boolean indicating if the input would be a single sentence or a pair + :return: + """ + pass + def process(self, encoding, pair=None, add_special_tokens=True): + """ + Post-process the given encodings, generating the final one """ pass diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.py b/bindings/python/py_src/tokenizers/trainers/__init__.py index 9ddbff2b..05243aa5 100644 --- a/bindings/python/py_src/tokenizers/trainers/__init__.py +++ b/bindings/python/py_src/tokenizers/trainers/__init__.py @@ -1,6 +1,7 @@ +# Generated content DO NOT EDIT from .. import trainers Trainer = trainers.Trainer BpeTrainer = trainers.BpeTrainer -WordPieceTrainer = trainers.WordPieceTrainer UnigramTrainer = trainers.UnigramTrainer +WordPieceTrainer = trainers.WordPieceTrainer diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi index e1ddd6f7..6d0ad304 100644 --- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi @@ -1,148 +1,132 @@ -from .. import AddedToken -from typing import Optional, List, Union - +# Generated content DO NOT EDIT class Trainer: - """Base class for all trainers + """ + Base class for all trainers This class is not supposed to be instantiated directly. Instead, any implementation of a Trainer will return an instance of this class when instantiated. + + Args: + vocab_size: unsigned int: + The size of the final vocabulary, including all tokens and alphabet. 
+ + min_frequency: unsigned int: + The minimum frequency a pair should have in order to be merged. + + show_progress: boolean: + Whether to show progress bars while training. + + special_tokens: List[Union[str, AddedToken]]: + A list of special tokens the model should know of. + + limit_alphabet: unsigned int: + The maximum different characters to keep in the alphabet. + + initial_alphabet: List[str]: + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + + continuing_subword_prefix: Optional[str]: + A prefix to be used for every subword that is not a beginning-of-word. + + end_of_word_suffix: Optional[str]: + A suffix to be used for every subword that is a end-of-word. + + Returns: + Trainer """ -class BpeTrainer(Trainer): - """BpeTrainer + def __init__( + self, + vocab_size=30000, + min_frequency=0, + show_progress=True, + special_tokens=[], + limit_alphabet=None, + initial_alphabet=[], + continuing_subword_prefix=None, + end_of_word_suffix=None, + ): + pass +class BpeTrainer(Trainer): + """ Capable of training a BPE model """ - def __init__( - self, - vocab_size: int = 30000, - min_frequency: int = 0, - show_progress: bool = True, - special_tokens: List[Union[str, AddedToken]] = [], - limit_alphabet: Optional[int] = None, - initial_alphabet: List[str] = [], - continuing_subword_prefix: Optional[str] = None, - end_of_word_suffix: Optional[str] = None, - ) -> None: - """Instantiate a new BpeTrainer with the given options: +class UnigramTrainer(Trainer): + """ + Capable of training a Unigram model - Args: - vocab_size: unsigned int: - The size of the final vocabulary, including all tokens and alphabet. + Args: + vocab_size: unsigned int: + The size of the final vocabulary, including all tokens and alphabet. - min_frequency: unsigned int: - The minimum frequency a pair should have in order to be merged. + show_progress: boolean: + Whether to show progress bars while training. - show_progress: boolean: - Whether to show progress bars while training. + special_tokens: List[Union[str, AddedToken]]: + A list of special tokens the model should know of. - special_tokens: List[Union[str, AddedToken]]: - A list of special tokens the model should know of. + initial_alphabet: List[str]: + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. - limit_alphabet: unsigned int: - The maximum different characters to keep in the alphabet. + Returns: + Trainer + """ - initial_alphabet: List[str]: - A list of characters to include in the initial alphabet, even - if not seen in the training dataset. - If the strings contain more than one character, only the first one - is kept. - - continuing_subword_prefix: Optional[str]: - A prefix to be used for every subword that is not a beginning-of-word. - - end_of_word_suffix: Optional[str]: - A suffix to be used for every subword that is a end-of-word. - - Returns: - Trainer - """ + def __init__(self, vocab_size=8000, show_progress=True, special_tokens=[]): pass class WordPieceTrainer(Trainer): - """WordPieceTrainer - + """ Capable of training a WordPiece model + Args: + vocab_size: unsigned int: + The size of the final vocabulary, including all tokens and alphabet. + + min_frequency: unsigned int: + The minimum frequency a pair should have in order to be merged. 
+ + show_progress: boolean: + Whether to show progress bars while training. + + special_tokens: List[Union[str, AddedToken]]: + A list of special tokens the model should know of. + + limit_alphabet: unsigned int: + The maximum different characters to keep in the alphabet. + + initial_alphabet: List[str]: + A list of characters to include in the initial alphabet, even + if not seen in the training dataset. + If the strings contain more than one character, only the first one + is kept. + + continuing_subword_prefix: Optional[str]: + A prefix to be used for every subword that is not a beginning-of-word. + + end_of_word_suffix: Optional[str]: + A suffix to be used for every subword that is a end-of-word. + + Returns: + Trainer """ def __init__( self, - vocab_size: int = 30000, - min_frequency: int = 0, - show_progress: bool = True, - special_tokens: List[Union[str, AddedToken]] = [], - limit_alphabet: Optional[int] = None, - initial_alphabet: List[str] = [], - continuing_subword_prefix: Optional[str] = "##", - end_of_word_suffix: Optional[str] = None, - ) -> Trainer: - """Instantiate a new WordPieceTrainer with the given options: - - Args: - vocab_size: unsigned int: - The size of the final vocabulary, including all tokens and alphabet. - - min_frequency: unsigned int: - The minimum frequency a pair should have in order to be merged. - - show_progress: boolean: - Whether to show progress bars while training. - - special_tokens: List[Union[str, AddedToken]]: - A list of special tokens the model should know of. - - limit_alphabet: unsigned int: - The maximum different characters to keep in the alphabet. - - initial_alphabet: List[str]: - A list of characters to include in the initial alphabet, even - if not seen in the training dataset. - If the strings contain more than one character, only the first one - is kept. - - continuing_subword_prefix: Optional[str]: - A prefix to be used for every subword that is not a beginning-of-word. - - end_of_word_suffix: Optional[str]: - A suffix to be used for every subword that is a end-of-word. - - Returns: - Trainer - """ - pass - -class UnigramTrainer(Trainer): - """UnigramTrainer - - Capable of training a Unigram model - """ - - def __init__( - self, - vocab_size: int = 8000, - show_progress: bool = True, - special_tokens: List[Union[str, AddedToken]] = [], - ) -> Trainer: - """Instantiate a new UnigramTrainer with the given options: - - Args: - vocab_size: unsigned int: - The size of the final vocabulary, including all tokens and alphabet. - - show_progress: boolean: - Whether to show progress bars while training. - - special_tokens: List[Union[str, AddedToken]]: - A list of special tokens the model should know of. - - initial_alphabet: List[str]: - A list of characters to include in the initial alphabet, even - if not seen in the training dataset. - If the strings contain more than one character, only the first one - is kept. - - Returns: - Trainer - """ + vocab_size=30000, + min_frequency=0, + show_progress=True, + special_tokens=[], + limit_alphabet=None, + initial_alphabet=[], + continuing_subword_prefix="##", + end_of_word_suffix=None, + ): pass diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs index 46a4981c..e053db10 100644 --- a/bindings/python/src/decoders.rs +++ b/bindings/python/src/decoders.rs @@ -15,6 +15,10 @@ use tokenizers as tk; use super::error::ToPyResult; +/// Base class for all decoders +/// +/// This class is not supposed to be instantiated directly. 
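An illustrative sketch of configuring one of the trainers above (not part of the diff); "data.txt" is a placeholder corpus, and the `Tokenizer.train` argument order has changed between library versions, so it is only hinted at in a comment:

# Illustrative sketch only.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(
    vocab_size=30000,
    min_frequency=2,
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"],
)
# Around this release the call was roughly: tokenizer.train(trainer, ["data.txt"])
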
Instead, any implementation of +/// a Decoder will return an instance of this class when instantiated. #[pyclass(dict, module = "tokenizers.decoders", name=Decoder)] #[derive(Clone, Deserialize, Serialize)] pub struct PyDecoder { @@ -82,12 +86,16 @@ impl PyDecoder { } } + /// Decode the given list of string to a final string + #[text_signature = "(self, tokens)"] fn decode(&self, tokens: Vec) -> PyResult { ToPyResult(self.decoder.decode(tokens)).into() } } +/// ByteLevel Decoder #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=ByteLevel)] +#[text_signature = "(self)"] pub struct PyByteLevelDec {} #[pymethods] impl PyByteLevelDec { @@ -97,7 +105,16 @@ impl PyByteLevelDec { } } +/// Instantiate a new WordPiece Decoder +/// +/// Args: +/// prefix: str: +/// The prefix to use for subwords that are not a beginning-of-word +/// cleanup: bool: +/// Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, +/// and some abbreviated english forms. #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=WordPiece)] +#[text_signature = "(self, prefix=\"##\", cleanup=True)"] pub struct PyWordPieceDec {} #[pymethods] impl PyWordPieceDec { @@ -120,7 +137,18 @@ impl PyWordPieceDec { } } +/// Instantiate a new Metaspace +/// +/// Args: +/// replacement: str: +/// The replacement character. Must be exactly one character. By default we +/// use the `▁` (U+2581) meta symbol (Same as in SentencePiece). +/// +/// add_prefix_space: boolean: +/// Whether to add a space to the first word if there isn't already one. This +/// lets us treat `hello` exactly like `say hello`. #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=Metaspace)] +#[text_signature = "(self, replacement = \"▁\", add_prefix_space = True)"] pub struct PyMetaspaceDec {} #[pymethods] impl PyMetaspaceDec { @@ -153,7 +181,14 @@ impl PyMetaspaceDec { } } +/// Instantiate a new BPEDecoder +/// +/// Args: +/// suffix: str: +/// The suffix that was used to caracterize an end-of-word. This suffix will +/// be replaced by whitespaces during the decoding #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name=BPEDecoder)] +#[text_signature = "(self, suffix=\"\")"] pub struct PyBPEDecoder {} #[pymethods] impl PyBPEDecoder { diff --git a/bindings/python/src/encoding.rs b/bindings/python/src/encoding.rs index dc9ebab9..fec483b1 100644 --- a/bindings/python/src/encoding.rs +++ b/bindings/python/src/encoding.rs @@ -107,7 +107,7 @@ impl PyEncoding { /// /// Set the given sequence index for the whole range of tokens contained in this /// :class:`~tokenizers.Encoding`. 
- #[text_signature = "($self, sequence_id)"] + #[text_signature = "(self, sequence_id)"] fn set_sequence_id(&mut self, sequence_id: usize) { self.encoding.set_sequence_id(sequence_id); } @@ -269,7 +269,7 @@ impl PyEncoding { /// Returns: /// :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` #[args(sequence_index = 0)] - #[text_signature = "($self, word_index, sequence_index=0)"] + #[text_signature = "(self, word_index, sequence_index=0)"] fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> { self.encoding.word_to_tokens(word_index, sequence_index) } @@ -285,7 +285,7 @@ impl PyEncoding { /// Returns: /// :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` #[args(sequence_index = 0)] - #[text_signature = "($self, word_index, sequence_index=0)"] + #[text_signature = "(self, word_index, sequence_index=0)"] fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option { self.encoding.word_to_chars(word_index, sequence_index) } @@ -301,7 +301,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`int`: The sequence id of the given token - #[text_signature = "($self, token_index)"] + #[text_signature = "(self, token_index)"] fn token_to_sequence(&self, token_index: usize) -> Option { self.encoding.token_to_sequence(token_index) } @@ -318,7 +318,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` - #[text_signature = "($self, token_index)"] + #[text_signature = "(self, token_index)"] fn token_to_chars(&self, token_index: usize) -> Option { let (_, offsets) = self.encoding.token_to_chars(token_index)?; Some(offsets) @@ -336,7 +336,7 @@ impl PyEncoding { /// /// Returns: /// :obj:`int`: The index of the word in the relevant input sequence. 
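The `Encoding` alignment helpers whose `text_signature` is adjusted here (`word_to_tokens`, `token_to_chars`, `char_to_token`, ...) are typically used as in this illustrative sketch (not part of the diff; "tokenizer.json" is a placeholder for a trained tokenizer):

# Illustrative sketch only; assumes a trained tokenizer saved at the placeholder path.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizer.json")
enc = tokenizer.encode("Hello there, world")

print(enc.tokens)
print(enc.token_to_chars(1))   # (start, end) character span of token 1, if it maps to the input
print(enc.token_to_word(1))    # index of the word containing token 1, or None for special tokens
print(enc.char_to_token(6))    # token covering the character at position 6
print(enc.word_to_tokens(0))   # (first, last + 1) token range for word 0
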
- #[text_signature = "($self, token_index)"] + #[text_signature = "(self, token_index)"] fn token_to_word(&self, token_index: usize) -> Option { let (_, word_idx) = self.encoding.token_to_word(token_index)?; Some(word_idx) @@ -353,7 +353,7 @@ impl PyEncoding { /// Returns: /// :obj:`int`: The index of the token that contains this char in the encoded sequence #[args(sequence_index = 0)] - #[text_signature = "($self, char_pos, sequence_index=0)"] + #[text_signature = "(self, char_pos, sequence_index=0)"] fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option { self.encoding.char_to_token(char_pos, sequence_index) } @@ -369,7 +369,7 @@ impl PyEncoding { /// Returns: /// :obj:`int`: The index of the word that contains this char in the input sequence #[args(sequence_index = 0)] - #[text_signature = "($self, char_pos, sequence_index=0)"] + #[text_signature = "(self, char_pos, sequence_index=0)"] fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option { self.encoding.char_to_word(char_pos, sequence_index) } @@ -392,7 +392,7 @@ impl PyEncoding { /// pad_token (:obj:`str`, defaults to `[PAD]`): /// The pad token to use #[args(kwargs = "**")] - #[text_signature = "($self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"] + #[text_signature = "(self, length, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]')"] fn pad(&mut self, length: usize, kwargs: Option<&PyDict>) -> PyResult<()> { let mut pad_id = 0; let mut pad_type_id = 0; @@ -440,7 +440,7 @@ impl PyEncoding { /// stride (:obj:`int`, defaults to :obj:`0`): /// The length of previous content to be included in each overflowing piece #[args(stride = "0")] - #[text_signature = "($self, max_length, stride=0)"] + #[text_signature = "(self, max_length, stride=0)"] fn truncate(&mut self, max_length: usize, stride: usize) -> PyResult<()> { self.encoding.truncate(max_length, stride); Ok(()) diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs index 3c196946..b38e3033 100644 --- a/bindings/python/src/models.rs +++ b/bindings/python/src/models.rs @@ -106,6 +106,8 @@ impl PyModel { } } + /// Tokenize the given sequence + #[text_signature = "(self, tokens)"] fn tokenize(&self, tokens: &str) -> PyResult> { Ok(ToPyResult(self.model.tokenize(tokens)) .into_py()? @@ -114,14 +116,24 @@ impl PyModel { .collect()) } + /// Returns the id associated with the given token + #[text_signature = "(self, tokens)"] fn token_to_id(&self, token: &str) -> Option { self.model.token_to_id(token) } + /// Returns the token associated with the given id + #[text_signature = "(self, id)"] fn id_to_token(&self, id: u32) -> Option<&str> { self.model.id_to_token(id) } + /// Save the current model + /// + /// Save the current model in the given folder, using the given name for the various + /// files that will get created. + /// Any file with the same name that already exist in this folder will be overwritten. + #[text_signature = "(self, folder, name)"] fn save(&self, folder: &str, name: Option<&str>) -> PyResult> { let saved: PyResult> = ToPyResult(self.model.save(Path::new(folder), name)).into(); @@ -132,9 +144,36 @@ impl PyModel { } } -/// BPE Model -/// Allows the creation of a BPE Model to be used with a Tokenizer +/// Instantiate a BPE Model from the given vocab and merges. +/// +/// Args: +/// vocab: ('`optional`) Dict[str, int]: +/// A dictionnary of string keys and their ids {"am": 0,...} +/// +/// merges: (`optional`) string: +/// A list of pairs of tokens [("a", "b"),...] 
+/// +/// cache_capacity: (`optional`) int: +/// The number of words that the BPE cache can contain. The cache allows +/// to speed-up the process by keeping the result of the merge operations +/// for a number of words. +/// +/// dropout: (`optional`) Optional[float] [0, 1]: +/// The BPE dropout to use. Must be an float between 0 and 1 +/// +/// unk_token: (`optional`) str: +/// The unknown token to be used by the model. +/// +/// continuing_subword_prefix: (`optional`) str: +/// The prefix to attach to subword units that don't represent a beginning of word. +/// +/// end_of_word_suffix: (`optional`) str: +/// The suffix to attach to subword units that represent an end of word. +/// +/// fuse_unk: (`optional`) bool: +/// Multiple unk tokens get fused into only 1 #[pyclass(extends=PyModel, module = "tokenizers.models", name=BPE)] +#[text_signature = "(self, vocab=None, merges=None, cache_capacity=None, dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=None)"] pub struct PyBPE {} impl PyBPE { @@ -225,7 +264,9 @@ impl PyBPE { PyBPE::with_builder(builder, kwargs) } + /// Read a vocab_filename and merge_filename and stores result in memory #[staticmethod] + #[text_signature = "(self, vocab_filename, merges_filename)"] fn read_file(vocab_filename: &str, merges_filename: &str) -> PyResult<(Vocab, Merges)> { BPE::read_file(vocab_filename, merges_filename).map_err(|e| { exceptions::PyValueError::new_err(format!( @@ -235,8 +276,15 @@ impl PyBPE { }) } + /// Convenient method to intialize a BPE from files + /// Roughly equivalent to + /// + /// def from_file(vocab_filename, merges_filenames, **kwargs): + /// vocab, merges = BPE.read_file(vocab_filename, merges_filename) + /// return BPE(vocab, merges, **kwargs) #[staticmethod] #[args(kwargs = "**")] + #[text_signature = "(vocab_filename, merge_filename, **kwargs)"] fn from_file( py: Python, vocab_filename: &str, @@ -257,8 +305,20 @@ impl PyBPE { } } -/// WordPiece Model +/// WordPiece model +/// Instantiate a WordPiece Model from the given vocab file. +/// +/// Args: +/// vocab: (`optional`) string: +/// A dictionnary of string keys and their ids {"am": 0,...} +/// +/// unk_token: (`optional`) str: +/// The unknown token to be used by the model. +/// +/// max_input_chars_per_word: (`optional`) int: +/// The maximum number of characters to authorize in a single word. 
#[pyclass(extends=PyModel, module = "tokenizers.models", name=WordPiece)] +#[text_signature = "(self, vocab, unk_token, max_input_chars_per_word)"] pub struct PyWordPiece {} impl PyWordPiece { @@ -319,15 +379,24 @@ impl PyWordPiece { PyWordPiece::with_builder(builder, kwargs) } + /// Read a vocab_filename and stores result in memory #[staticmethod] + #[text_signature = "(vocab_filename)"] fn read_file(vocab_filename: &str) -> PyResult { WordPiece::read_file(vocab_filename).map_err(|e| { exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e)) }) } + /// Convenient method to intialize a WordPiece from files + /// Roughly equivalent to + /// + /// def from_file(vocab_filename, **kwargs): + /// vocab = WordPiece.read_file(vocab_filename) + /// return WordPiece(vocab, **kwargs) #[staticmethod] #[args(kwargs = "**")] + #[text_signature = "(vocab_filename, merge_filename, **kwargs)"] fn from_file(py: Python, vocab_filename: &str, kwargs: Option<&PyDict>) -> PyResult> { let vocab = WordPiece::read_file(vocab_filename).map_err(|e| { exceptions::PyValueError::new_err(format!("Error while reading WordPiece file: {}", e)) @@ -336,7 +405,18 @@ impl PyWordPiece { } } +/// Most simple tokenizer model based on mapping token from a vocab file to their corresponding id. +/// +/// Instantiate a WordLevel Model from the given vocab file. +/// +/// Args: +/// vocab: (`optional`) string: +/// A dictionnary of string keys and their ids {"am": 0,...} +/// +/// unk_token: str: +/// The unknown token to be used by the model. #[pyclass(extends=PyModel, module = "tokenizers.models", name=WordLevel)] +#[text_signature = "(self, vocab, unk_token)"] pub struct PyWordLevel {} impl PyWordLevel { @@ -411,7 +491,16 @@ impl PyWordLevel { } } +/// UnigramEncoding model class +/// +/// Instantiate a Unigram Model from the given model file. +/// +/// Args: +/// vocab: ('`optional`) string: +/// A list of vocabulary items and their relative score [("am", -0.2442),...] +/// #[pyclass(extends=PyModel, module = "tokenizers.models", name=Unigram)] +#[text_signature = "(self, vocab)"] pub struct PyUnigram {} #[pymethods] diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs index c6fd6509..b888abbd 100644 --- a/bindings/python/src/normalizers.rs +++ b/bindings/python/src/normalizers.rs @@ -15,6 +15,10 @@ use tk::normalizers::{ use tk::{NormalizedString, Normalizer}; use tokenizers as tk; +/// Base class for all normalizers +/// +/// This class is not supposed to be instantiated directly. Instead, any implementation of a +/// Normalizer will return an instance of this class when instantiated. #[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)] #[derive(Clone, Serialize, Deserialize)] pub struct PyNormalizer { @@ -105,10 +109,14 @@ impl PyNormalizer { } } + /// Normalize the given NormalizedString in-place + #[text_signature = "(self, normalized)"] fn normalize(&self, normalized: &mut PyNormalizedString) -> PyResult<()> { ToPyResult(self.normalizer.normalize(&mut normalized.normalized)).into() } + /// Normalize the given str + #[text_signature = "(self, sequence)"] fn normalize_str(&self, sequence: &str) -> PyResult { let mut normalized = NormalizedString::from(sequence); ToPyResult(self.normalizer.normalize(&mut normalized)).into_py()?; @@ -116,7 +124,30 @@ impl PyNormalizer { } } +/// BertNormalizer +/// +/// Takes care of normalizing raw text before giving it to a Bert model. 
+/// This includes cleaning the text, handling accents, Chinese chars and lowercasing
+///
+/// Args:
+///     clean_text: (`optional`) boolean:
+///         Whether to clean the text, by removing any control characters
+///         and replacing all whitespaces by the classic one.
+///
+///     handle_chinese_chars: (`optional`) boolean:
+///         Whether to handle Chinese chars by putting spaces around them.
+///
+///     strip_accents: (`optional`) boolean:
+///         Whether to strip all accents. If this option is not specified (i.e. == None),
+///         then it will be determined by the value for `lowercase` (as in the original Bert).
+///
+///     lowercase: (`optional`) boolean:
+///         Whether to lowercase.
+///
+/// Returns:
+///     Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=BertNormalizer)]
+#[text_signature = "(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)"]
pub struct PyBertNormalizer {}
#[pymethods]
impl PyBertNormalizer {
@@ -146,7 +177,9 @@ impl PyBertNormalizer {
    }
}
+/// NFD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFD)]
+#[text_signature = "(self)"]
pub struct PyNFD {}
#[pymethods]
impl PyNFD {
@@ -156,7 +189,9 @@ impl PyNFD {
    }
}
+/// NFKD Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKD)]
+#[text_signature = "(self)"]
pub struct PyNFKD {}
#[pymethods]
impl PyNFKD {
@@ -166,7 +201,9 @@ impl PyNFKD {
    }
}
+/// NFC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFC)]
+#[text_signature = "(self)"]
pub struct PyNFC {}
#[pymethods]
impl PyNFC {
@@ -176,7 +213,9 @@ impl PyNFC {
    }
}
+/// NFKC Unicode Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=NFKC)]
+#[text_signature = "(self)"]
pub struct PyNFKC {}
#[pymethods]
impl PyNFKC {
@@ -186,6 +225,12 @@ impl PyNFKC {
    }
}
+/// Allows concatenating multiple other normalizers as a Sequence.
+/// All the normalizers run in sequence in the given order
+///
+/// Args:
+///     normalizers: List[Normalizer]:
+///         A list of Normalizer to be run as a sequence
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Sequence)]
pub struct PySequence {}
#[pymethods]
@@ -211,7 +256,9 @@ impl PySequence {
    }
}
+/// Lowercase Normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Lowercase)]
+#[text_signature = "(self)"]
pub struct PyLowercase {}
#[pymethods]
impl PyLowercase {
@@ -221,7 +268,9 @@ impl PyLowercase {
    }
}
+/// Strip normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Strip)]
+#[text_signature = "(self, left=True, right=True)"]
pub struct PyStrip {}
#[pymethods]
impl PyStrip {
@@ -245,6 +294,7 @@ impl PyStrip {
}
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=StripAccents)]
+#[text_signature = "(self)"]
pub struct PyStripAccents {}
#[pymethods]
impl PyStripAccents {
@@ -389,7 +439,9 @@ impl Normalizer for PyNormalizerWrapper {
    }
}
+/// Nmt normalizer
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Nmt)]
+#[text_signature = "(self)"]
pub struct PyNmt {}
#[pymethods]
impl PyNmt {
@@ -399,7 +451,10 @@ impl PyNmt {
    }
}
+/// Precompiled normalizer
+/// Don't use it manually; it is used for compatibility with SentencePiece.
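+///
+/// For regular use cases, composing the normalizers above with `Sequence` is the usual
+/// way to build a pipeline instead, e.g. (a rough sketch, expected output in the comment):
+///
+/// ```
+/// from tokenizers.normalizers import Sequence, NFD, StripAccents, Lowercase
+///
+/// normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
+/// normalizer.normalize_str("Héllo")  # "hello"
+/// ```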
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Precompiled)] +#[text_signature = "(self, precompiled_charsmap)"] pub struct PyPrecompiled {} #[pymethods] impl PyPrecompiled { @@ -420,7 +475,9 @@ impl PyPrecompiled { } } +/// Replace normalizer #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name=Replace)] +#[text_signature = "(self, pattern, content)"] pub struct PyReplace {} #[pymethods] impl PyReplace { diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index fcfea906..1462419c 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -22,6 +22,10 @@ use tokenizers as tk; use super::error::ToPyResult; use super::utils::*; +/// Base class for all pre-tokenizers +/// +/// This class is not supposed to be instantiated directly. Instead, any implementation of a +/// PreTokenizer will return an instance of this class when instantiated. #[pyclass(dict, module = "tokenizers.pre_tokenizers", name=PreTokenizer)] #[derive(Clone, Serialize, Deserialize)] pub struct PyPreTokenizer { @@ -121,10 +125,14 @@ impl PyPreTokenizer { } } + /// Pre tokenize the given PreTokenizedString in-place + #[text_signature = "(self, pretok)"] fn pre_tokenize(&self, pretok: &mut PyPreTokenizedString) -> PyResult<()> { ToPyResult(self.pretok.pre_tokenize(&mut pretok.pretok)).into() } + /// Pre tokenize the given sequence + #[text_signature = "(self, sequence)"] fn pre_tokenize_str(&self, s: &str) -> PyResult> { let mut pretokenized = tk::tokenizer::PreTokenizedString::from(s); @@ -138,7 +146,19 @@ impl PyPreTokenizer { } } +/// ByteLevel PreTokenizer +/// +/// This pre-tokenizer takes care of replacing all bytes of the given string +/// with a corresponding representation, as well as splitting into words. +/// +/// Args: +/// add_prefix_space: (`optional`) boolean: +/// Whether to add a space to the first word if there isn't already one. This +/// lets us treat `hello` exactly like `say hello`. +/// Returns: +/// PreTokenizer #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=ByteLevel)] +#[text_signature = "(self, add_prefix_space=True)"] pub struct PyByteLevel {} #[pymethods] impl PyByteLevel { @@ -161,7 +181,13 @@ impl PyByteLevel { Ok((PyByteLevel {}, byte_level.into())) } + /// Returns the alphabet used by this PreTokenizer. + /// + /// Since the ByteLevel works as its name suggests, at the byte level, it + /// encodes any byte to one visible character. This means that there is a + /// total of 256 different characters composing this alphabet. #[staticmethod] + #[text_signature = "()"] fn alphabet() -> Vec { ByteLevel::alphabet() .into_iter() @@ -170,7 +196,9 @@ impl PyByteLevel { } } +/// This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Whitespace)] +#[text_signature = "(self)"] pub struct PyWhitespace {} #[pymethods] impl PyWhitespace { @@ -180,7 +208,9 @@ impl PyWhitespace { } } +/// This pre-tokenizer simply splits on the whitespace. Works like `.split()` #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=WhitespaceSplit)] +#[text_signature = "(self)"] pub struct PyWhitespaceSplit {} #[pymethods] impl PyWhitespaceSplit { @@ -190,6 +220,11 @@ impl PyWhitespaceSplit { } } +/// This pre-tokenizer simply splits on the provided char. 
Works like `.split(delimiter)`
+///
+/// Args:
+///     delimiter: str:
+///         The delimiter char that will be used to split input
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=CharDelimiterSplit)]
pub struct PyCharDelimiterSplit {}
#[pymethods]
@@ -210,7 +245,12 @@ impl PyCharDelimiterSplit {
    }
}
+/// BertPreTokenizer
+///
+/// This pre-tokenizer splits tokens on spaces, and also on punctuation.
+/// Each occurrence of a punctuation character will be treated separately.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=BertPreTokenizer)]
+#[text_signature = "(self)"]
pub struct PyBertPreTokenizer {}
#[pymethods]
impl PyBertPreTokenizer {
@@ -220,7 +260,9 @@ impl PyBertPreTokenizer {
    }
}
+/// This pre-tokenizer simply splits on punctuation as individual characters.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
+#[text_signature = "(self)"]
pub struct PyPunctuation {}
#[pymethods]
impl PyPunctuation {
@@ -230,7 +272,9 @@ impl PyPunctuation {
    }
}
+/// This pre-tokenizer composes other pre-tokenizers and applies them in sequence
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Sequence)]
+#[text_signature = "(self, pretokenizers)"]
pub struct PySequence {}
#[pymethods]
impl PySequence {
@@ -257,7 +301,20 @@ impl PySequence {
    }
}
+/// Metaspace pre-tokenizer
+///
+/// This pre-tokenizer replaces any whitespace by the provided replacement character.
+/// It then tries to split on these spaces.
+///
+/// Args:
+///     replacement: str:
+///         The replacement character. Must be exactly one character. By default we
+///         use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
+///
+///     add_prefix_space: boolean:
+///         Whether to add a space to the first word if there isn't already one. This
+///         lets us treat `hello` exactly like `say hello`.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Metaspace)]
+#[text_signature = "(self, replacement=\"▁\", add_prefix_space=True)"]
pub struct PyMetaspace {}
#[pymethods]
impl PyMetaspace {
@@ -290,7 +347,13 @@ impl PyMetaspace {
    }
}
+/// This pre-tokenizer simply splits using digits, putting them in separate tokens
+///
+/// Args:
+///     individual_digits: bool:
+///         If set to True, digits will each be separated: "Call 123 please" -> "Call ", "1", "2", "3", " please"
+///         If set to False, digits will be grouped: "Call 123 please" -> "Call ", "123", " please"
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Digits)]
+#[text_signature = "(self, individual_digits=False)"]
pub struct PyDigits {}
#[pymethods]
impl PyDigits {
@@ -301,7 +364,12 @@ impl PyDigits {
    }
}
+/// This pre-tokenizer splits on characters that belong to different language families
+/// It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
+/// Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
+/// This mimics the SentencePiece Unigram implementation.
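+///
+/// Like the other pre-tokenizers, it can be tried out in isolation through
+/// `pre_tokenize_str` (a rough sketch; the exact splits are not spelled out here):
+///
+/// ```
+/// from tokenizers.pre_tokenizers import UnicodeScripts
+///
+/// UnicodeScripts().pre_tokenize_str("Hello こんにちは")
+/// # -> a list of (substring, (start, end)) tuples, split where the script changes
+/// ```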
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=UnicodeScripts)] +#[text_signature = "(self)"] pub struct PyUnicodeScripts {} #[pymethods] impl PyUnicodeScripts { diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs index 2b4f203f..e5a550b3 100644 --- a/bindings/python/src/processors.rs +++ b/bindings/python/src/processors.rs @@ -16,6 +16,10 @@ use tk::processors::PostProcessorWrapper; use tk::{Encoding, PostProcessor}; use tokenizers as tk; +/// Base class for all post-processors +/// +/// This class is not supposed to be instantiated directly. Instead, any implementation of +/// a PostProcessor will return an instance of this class when instantiated. #[pyclass(dict, module = "tokenizers.processors", name=PostProcessor)] #[derive(Clone, Deserialize, Serialize)] pub struct PyPostProcessor { @@ -88,11 +92,17 @@ impl PyPostProcessor { } } + /// Return the number of special tokens that would be added for single/pair sentences. + /// :param is_pair: Boolean indicating if the input would be a single sentence or a pair + /// :return: + #[text_signature = "(self, is_pair)"] fn num_special_tokens_to_add(&self, is_pair: bool) -> usize { self.processor.added_tokens(is_pair) } + /// Post-process the given encodings, generating the final one #[args(pair = "None", add_special_tokens = "true")] + #[text_signature = "(self, encoding, pair=None, add_special_tokens=True)"] fn process( &self, encoding: &PyEncoding, @@ -109,7 +119,21 @@ impl PyPostProcessor { } } +/// This post-processor takes care of adding the special tokens needed by +/// a Bert model: +/// - a SEP token +/// - a CLS token +/// Args: +/// sep: Tuple[str, int]: +/// A tuple with the string representation of the SEP token, and its id +/// +/// cls: Tuple[str, int]: +/// A tuple with the string representation of the CLS token, and its id +/// +/// Returns: +/// PostProcessor #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=BertProcessing)] +#[text_signature = "(self, sep, cls)"] pub struct PyBertProcessing {} #[pymethods] impl PyBertProcessing { @@ -126,7 +150,33 @@ impl PyBertProcessing { } } +/// This post-processor takes care of adding the special tokens needed by +/// a Roberta model: +/// - a SEP token +/// - a CLS token +/// +/// It also takes care of trimming the offsets. +/// By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't +/// want the offsets to include these whitespaces, then this PostProcessor should be initialized +/// with `trim_offsets=True` +/// Args: +/// sep: Tuple[str, int]: +/// A tuple with the string representation of the SEP token, and its id +/// +/// cls: Tuple[str, int]: +/// A tuple with the string representation of the CLS token, and its id +/// +/// trim_offsets: bool: +/// Whether to trim the whitespaces from the produced offsets. +/// +/// add_prefix_space: bool: +/// Whether the add_prefix_space option was enabled during pre-tokenization. This +/// is relevant because it defines the way the offsets are trimmed out. +/// +/// Returns: +/// PostProcessor #[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=RobertaProcessing)] +#[text_signature = "(self, sep, cls, trim_offsets=True, add_prefix_space=True)"] pub struct PyRobertaProcessing {} #[pymethods] impl PyRobertaProcessing { @@ -152,7 +202,15 @@ impl PyRobertaProcessing { } } +/// This post-processor takes care of trimming the offsets. 
+/// By default, the ByteLevel BPE might include whitespaces in the produced tokens. If you don't
+/// want the offsets to include these whitespaces, then this PostProcessor must be used.
+///
+/// Args:
+///     trim_offsets: bool:
+///         Whether to trim the whitespaces from the produced offsets.
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=ByteLevel)]
+#[text_signature = "(self, trim_offsets=True)"]
pub struct PyByteLevel {}
#[pymethods]
impl PyByteLevel {
@@ -244,7 +302,68 @@ impl FromPyObject<'_> for PyTemplate {
    }
}
+/// Provides a way to specify templates in order to add the special tokens to each
+/// input sequence as relevant.
+///
+/// Let's take the `BERT` tokenizer as an example. It uses two special tokens, used to
+/// delimit each sequence. `[CLS]` is always used at the beginning of the first
+/// sequence, and `[SEP]` is added at the end of both the first, and the pair
+/// sequences. The final result looks like this:
+/// - Single sequence: `[CLS] Hello there [SEP]`
+/// - Pair sequences: `[CLS] My name is Anthony [SEP] What is my name? [SEP]`
+/// With the type ids as follows:
+/// ```markdown
+/// [CLS]   ...   [SEP]   ...   [SEP]
+///    0      0      0      1      1
+/// ```
+///
+/// You can achieve such behavior using a TemplateProcessing:
+/// ```
+/// TemplateProcessing(
+///     single="[CLS] $0 [SEP]",
+///     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
+///     special_tokens=[("[CLS]", 1), ("[SEP]", 0)],
+/// )
+/// ```
+///
+/// In this example, each input sequence is identified using a `$` construct. This identifier
+/// lets us specify each input sequence, and the type_id to use. When nothing is specified,
+/// it uses the default values. Here are the different ways to specify it:
+/// - Specifying the sequence, with default `type_id == 0`: `$A` or `$B`
+/// - Specifying the `type_id` with default `sequence == A`: `$0`, `$1`, `$2`, ...
+/// - Specifying both: `$A:0`, `$B:1`, ...
+///
+/// The same construct is used for special tokens: `<identifier>(:<type_id>)?`.
+///
+/// **Warning**: You must ensure that you are giving the correct tokens/ids as these
+/// will be added to the Encoding without any further check. If the given ids correspond
+/// to something totally different in a `Tokenizer` using this `PostProcessor`, it
+/// might lead to unexpected results.
+///
+/// Args:
+///     single: Template:
+///         The template used for single sequences
+///
+///     pair: Template:
+///         The template used when both sequences are specified
+///
+///     special_tokens: Tokens:
+///         The list of special tokens used in each sequence
+///
+///     Template: Union[str, List[str]]:
+///         - If a `str` is provided, the whitespace is used as delimiter between tokens
+///         - If a `List[str]` is provided, a list of tokens
+///
+///     Tokens: List[Union[Tuple[int, str], Tuple[str, int], dict]]:
+///         - A Tuple with both a token and its associated ID, in any order
+///         - A dict with the following keys:
+///             - "id": str => The special token id, as specified in the Template
+///             - "ids": List[int] => The associated IDs
+///             - "tokens": List[str] => The associated tokens
+///         The given dict expects the provided `ids` and `tokens` lists to have
+///         the same length.
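+///
+/// A rough sketch of plugging it into an existing tokenizer (the special token ids
+/// shown are made up):
+///
+/// ```
+/// tokenizer.post_processor = TemplateProcessing(
+///     single="[CLS] $A [SEP]",
+///     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
+///     special_tokens=[("[CLS]", 101), ("[SEP]", 102)],
+/// )
+/// encoding = tokenizer.encode("Hello", "there")
+/// # encoding.type_ids starts with 0s for the first sequence and ends with 1s
+/// ```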
#[pyclass(extends=PyPostProcessor, module = "tokenizers.processors", name=TemplateProcessing)] +#[text_signature = "(self, single, pair, special_tokens)"] pub struct PyTemplateProcessing {} #[pymethods] impl PyTemplateProcessing { diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index ad68088a..df2ae32c 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -53,7 +53,7 @@ use crate::processors::PyPostProcessor; /// Yesterday"``. /// #[pyclass(dict, module = "tokenizers", name=AddedToken)] -#[text_signature = "(content, single_word=False, lstrip=False, rstrip=False, normalized=True)"] +#[text_signature = "(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True)"] pub struct PyAddedToken { pub content: String, pub is_special_token: bool, @@ -408,7 +408,7 @@ type Tokenizer = TokenizerImpl PyResult { ToPyResult(self.tokenizer.to_string(pretty)).into() } @@ -537,11 +537,15 @@ impl PyTokenizer { /// pretty (:obj:`bool`, defaults to :obj:`False`): /// Whether the JSON file should be pretty formatted. #[args(pretty = false)] - #[text_signature = "($self, pretty=False)"] + #[text_signature = "(self, pretty=False)"] fn save(&self, path: &str, pretty: bool) -> PyResult<()> { ToPyResult(self.tokenizer.save(path, pretty)).into() } + /// Return the number of special tokens that would be added for single/pair sentences. + /// :param is_pair: Boolean indicating if the input would be a single sentence or a pair + /// :return: + #[text_signature = "(self, is_pair)"] fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult { Ok(self .tokenizer @@ -558,7 +562,7 @@ impl PyTokenizer { /// Returns: /// :obj:`Dict[str, int]`: The vocabulary #[args(with_added_tokens = true)] - #[text_signature = "($self, with_added_tokens=True)"] + #[text_signature = "(self, with_added_tokens=True)"] fn get_vocab(&self, with_added_tokens: bool) -> PyResult> { Ok(self.tokenizer.get_vocab(with_added_tokens)) } @@ -572,7 +576,7 @@ impl PyTokenizer { /// Returns: /// :obj:`int`: The size of the vocabulary #[args(with_added_tokens = true)] - #[text_signature = "($self, with_added_tokens=True)"] + #[text_signature = "(self, with_added_tokens=True)"] fn get_vocab_size(&self, with_added_tokens: bool) -> PyResult { Ok(self.tokenizer.get_vocab_size(with_added_tokens)) } @@ -591,7 +595,7 @@ impl PyTokenizer { /// The strategy used to truncation. Can be one of ``longest_first``, ``only_first`` or /// ``only_second``. #[args(kwargs = "**")] - #[text_signature = "($self, max_length, stride=0, strategy='longest_first')"] + #[text_signature = "(self, max_length, stride=0, strategy='longest_first')"] fn enable_truncation(&mut self, max_length: usize, kwargs: Option<&PyDict>) -> PyResult<()> { let mut params = TruncationParams::default(); params.max_length = max_length; @@ -626,7 +630,7 @@ impl PyTokenizer { } /// Disable truncation - #[text_signature = "($self)"] + #[text_signature = "(self)"] fn no_truncation(&mut self) { self.tokenizer.with_truncation(None); } @@ -675,7 +679,7 @@ impl PyTokenizer { /// If specified, the length at which to pad. If not specified we pad using the size of /// the longest sequence in a batch. 
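    ///
    /// A rough usage sketch (the values below are illustrative):
    ///
    /// ```
    /// tokenizer.enable_padding(pad_id=3, pad_token="[PAD]", length=128)
    /// batch = tokenizer.encode_batch(["Hello", "Hello world"])
    /// # both Encodings in `batch` are now padded to a length of 128
    /// ```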
#[args(kwargs = "**")] - #[text_signature = "($self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"] + #[text_signature = "(self, direction='right', pad_id=0, pad_type_id=0, pad_token='[PAD]', length=None, pad_to_multiple_of=None)"] fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> { let mut params = PaddingParams::default(); @@ -733,7 +737,7 @@ impl PyTokenizer { } /// Disable padding - #[text_signature = "($self)"] + #[text_signature = "(self)"] fn no_padding(&mut self) { self.tokenizer.with_padding(None); } @@ -802,7 +806,7 @@ impl PyTokenizer { /// :class:`~tokenizers.Encoding`: The encoded result /// #[args(pair = "None", is_pretokenized = "false", add_special_tokens = "true")] - #[text_signature = "($self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"] + #[text_signature = "(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True)"] fn encode( &self, sequence: &PyAny, @@ -867,7 +871,7 @@ impl PyTokenizer { /// A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch /// #[args(is_pretokenized = "false", add_special_tokens = "true")] - #[text_signature = "($self, input, is_pretokenized=False, add_special_tokens=True)"] + #[text_signature = "(self, input, is_pretokenized=False, add_special_tokens=True)"] fn encode_batch( &self, input: Vec<&PyAny>, @@ -910,7 +914,7 @@ impl PyTokenizer { /// Returns: /// :obj:`str`: The decoded string #[args(skip_special_tokens = true)] - #[text_signature = "($self, ids, skip_special_tokens=True)"] + #[text_signature = "(self, ids, skip_special_tokens=True)"] fn decode(&self, ids: Vec, skip_special_tokens: bool) -> PyResult { ToPyResult(self.tokenizer.decode(ids, skip_special_tokens)).into() } @@ -927,7 +931,7 @@ impl PyTokenizer { /// Returns: /// :obj:`List[str]`: A list of decoded strings #[args(skip_special_tokens = true)] - #[text_signature = "($self, sequences, skip_special_tokens=True)"] + #[text_signature = "(self, sequences, skip_special_tokens=True)"] fn decode_batch( &self, sequences: Vec>, @@ -947,7 +951,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary - #[text_signature = "($self, token)"] + #[text_signature = "(self, token)"] fn token_to_id(&self, token: &str) -> Option { self.tokenizer.token_to_id(token) } @@ -960,7 +964,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary - #[text_signature = "($self, id)"] + #[text_signature = "(self, id)"] fn id_to_token(&self, id: u32) -> Option<&str> { self.tokenizer.id_to_token(id) } @@ -977,7 +981,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`int`: The number of tokens that were created in the vocabulary - #[text_signature = "($self, tokens)"] + #[text_signature = "(self, tokens)"] fn add_tokens(&mut self, tokens: &PyList) -> PyResult { let tokens = tokens .into_iter() @@ -1014,7 +1018,7 @@ impl PyTokenizer { /// /// Returns: /// :obj:`int`: The number of tokens that were created in the vocabulary - #[text_signature = "($self, tokens)"] + #[text_signature = "(self, tokens)"] fn add_special_tokens(&mut self, tokens: &PyList) -> PyResult { let tokens = tokens .into_iter() @@ -1064,7 +1068,7 @@ impl PyTokenizer { /// Returns: /// :class:`~tokenizers.Encoding`: The final post-processed encoding #[args(pair = "None", add_special_tokens = true)] - #[text_signature = "($self, encoding, pair=None, add_special_tokens=True)"] + #[text_signature = 
"(self, encoding, pair=None, add_special_tokens=True)"] fn post_process( &self, encoding: &PyEncoding, diff --git a/bindings/python/src/trainers.rs b/bindings/python/src/trainers.rs index 6003b49c..3db21851 100644 --- a/bindings/python/src/trainers.rs +++ b/bindings/python/src/trainers.rs @@ -11,7 +11,43 @@ use tokenizers as tk; use crate::models::PyModel; use crate::tokenizer::PyAddedToken; +/// Base class for all trainers +/// +/// This class is not supposed to be instantiated directly. Instead, any implementation of a +/// Trainer will return an instance of this class when instantiated. +/// +/// Args: +/// vocab_size: unsigned int: +/// The size of the final vocabulary, including all tokens and alphabet. +/// +/// min_frequency: unsigned int: +/// The minimum frequency a pair should have in order to be merged. +/// +/// show_progress: boolean: +/// Whether to show progress bars while training. +/// +/// special_tokens: List[Union[str, AddedToken]]: +/// A list of special tokens the model should know of. +/// +/// limit_alphabet: unsigned int: +/// The maximum different characters to keep in the alphabet. +/// +/// initial_alphabet: List[str]: +/// A list of characters to include in the initial alphabet, even +/// if not seen in the training dataset. +/// If the strings contain more than one character, only the first one +/// is kept. +/// +/// continuing_subword_prefix: Optional[str]: +/// A prefix to be used for every subword that is not a beginning-of-word. +/// +/// end_of_word_suffix: Optional[str]: +/// A suffix to be used for every subword that is a end-of-word. +/// +/// Returns: +/// Trainer #[pyclass(name=Trainer)] +#[text_signature = "(self, vocab_size=30000, min_frequency=0,show_progress=True, special_tokens=[],limit_alphabet=None, initial_alphabet = [], continuing_subword_prefix=None, end_of_word_suffix=None)"] pub struct PyTrainer { pub trainer: TrainerWrapper, } @@ -41,6 +77,7 @@ impl Trainer for PyTrainer { } } +/// Capable of training a BPE model #[pyclass(extends=PyTrainer, name=BpeTrainer)] pub struct PyBpeTrainer {} #[pymethods] @@ -105,7 +142,39 @@ impl PyBpeTrainer { } } +/// Capable of training a WordPiece model +/// Args: +/// vocab_size: unsigned int: +/// The size of the final vocabulary, including all tokens and alphabet. +/// +/// min_frequency: unsigned int: +/// The minimum frequency a pair should have in order to be merged. +/// +/// show_progress: boolean: +/// Whether to show progress bars while training. +/// +/// special_tokens: List[Union[str, AddedToken]]: +/// A list of special tokens the model should know of. +/// +/// limit_alphabet: unsigned int: +/// The maximum different characters to keep in the alphabet. +/// +/// initial_alphabet: List[str]: +/// A list of characters to include in the initial alphabet, even +/// if not seen in the training dataset. +/// If the strings contain more than one character, only the first one +/// is kept. +/// +/// continuing_subword_prefix: Optional[str]: +/// A prefix to be used for every subword that is not a beginning-of-word. +/// +/// end_of_word_suffix: Optional[str]: +/// A suffix to be used for every subword that is a end-of-word. 
+/// +/// Returns: +/// Trainer #[pyclass(extends=PyTrainer, name=WordPieceTrainer)] +#[text_signature = "(self, vocab_size=30000, min_frequency=0, show_progress=True, special_tokens=[], limit_alphabet=None, initial_alphabet= [],continuing_subword_prefix=\"##\", end_of_word_suffix=None)"] pub struct PyWordPieceTrainer {} #[pymethods] impl PyWordPieceTrainer { @@ -173,7 +242,28 @@ impl PyWordPieceTrainer { } } +/// Capable of training a Unigram model +/// +/// Args: +/// vocab_size: unsigned int: +/// The size of the final vocabulary, including all tokens and alphabet. +/// +/// show_progress: boolean: +/// Whether to show progress bars while training. +/// +/// special_tokens: List[Union[str, AddedToken]]: +/// A list of special tokens the model should know of. +/// +/// initial_alphabet: List[str]: +/// A list of characters to include in the initial alphabet, even +/// if not seen in the training dataset. +/// If the strings contain more than one character, only the first one +/// is kept. +/// +/// Returns: +/// Trainer #[pyclass(extends=PyTrainer, name=UnigramTrainer)] +#[text_signature = "(self, vocab_size=8000, show_progress=True, special_tokens= [])"] pub struct PyUnigramTrainer {} #[pymethods] impl PyUnigramTrainer { diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs index 9b25dce4..59746af1 100644 --- a/bindings/python/src/utils/normalization.rs +++ b/bindings/python/src/utils/normalization.rs @@ -173,6 +173,15 @@ fn slice( .flatten()) } +/// NormalizedString +/// +/// A NormalizedString takes care of modifying an "original" string, to obtain a "normalized" one. +/// While making all the requested modifications, it keeps track of the alignment information +/// between the two versions of the string. 
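+///
+/// A rough sketch of the kind of alignment-preserving edits it supports (the sample
+/// string and the results in the comments are illustrative):
+///
+/// ```
+/// import unicodedata
+///
+/// n = NormalizedString("Héllo")
+/// n.nfd()
+/// n.filter(lambda c: not unicodedata.combining(c))
+/// n.lowercase()
+/// n.normalized  # "hello"
+/// n.original    # still "Héllo"
+/// ```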
+/// +/// Args: +/// sequence: str: +/// The string sequence used to initialize this NormalizedString #[pyclass(module = "tokenizers", name=NormalizedString)] #[derive(Clone)] pub struct PyNormalizedString { @@ -186,6 +195,7 @@ impl PyNormalizedString { NormalizedString::from(s).into() } + /// The normalized part of the string #[getter] fn get_normalized(&self) -> &str { self.normalized.get() @@ -196,70 +206,119 @@ impl PyNormalizedString { self.normalized.get_original() } + /// Runs the NFD normalization + #[text_signature = "(self)"] fn nfd(&mut self) { self.normalized.nfd(); } + /// Runs the NFKD normalization + #[text_signature = "(self)"] fn nfkd(&mut self) { self.normalized.nfkd(); } + /// Runs the NFC normalization + #[text_signature = "(self)"] fn nfc(&mut self) { self.normalized.nfc(); } + /// Runs the NFKC normalization + #[text_signature = "(self)"] fn nfkc(&mut self) { self.normalized.nfkc(); } + /// Lowercase the string + #[text_signature = "(self)"] fn lowercase(&mut self) { self.normalized.lowercase(); } + /// Uppercase the string + #[text_signature = "(self)"] fn uppercase(&mut self) { self.normalized.uppercase(); } + /// Prepend the given sequence to the string + #[text_signature = "(self, s)"] fn prepend(&mut self, s: &str) { self.normalized.prepend(s); } + /// Append the given sequence to the string + #[text_signature = "(self, s)"] fn append(&mut self, s: &str) { self.normalized.append(s); } + /// Strip the left of the string + #[text_signature = "(self)"] fn lstrip(&mut self) { self.normalized.lstrip(); } + /// Strip the right of the string + #[text_signature = "(self)"] fn rstrip(&mut self) { self.normalized.rstrip(); } + /// Strip both ends of the string + #[text_signature = "(self)"] fn strip(&mut self) { self.normalized.strip(); } + /// Clears the string + #[text_signature = "(self)"] fn clear(&mut self) { self.normalized.clear(); } + /// Slice the string using the given range + #[text_signature = "(self, range)"] fn slice(&self, range: PyRange) -> PyResult> { slice(&self.normalized, &range) } + /// Filter each character of the string using the given func + #[text_signature = "(self, func)"] fn filter(&mut self, func: &PyAny) -> PyResult<()> { filter(&mut self.normalized, func) } + /// Calls the given function for each character of the string + #[text_signature = "(self, func)"] fn for_each(&self, func: &PyAny) -> PyResult<()> { for_each(&self.normalized, func) } + /// Calls the given function for each character of the string + /// + /// Replaces each character of the string using the returned value. Each + /// returned value **must** be a str of length 1 (ie a character). + #[text_signature = "(self, func)"] fn map(&mut self, func: &PyAny) -> PyResult<()> { map(&mut self.normalized, func) } + /// Split the NormalizedString using the given pattern and the specified behavior + /// + /// Args: + /// pattern: Pattern: + /// A pattern used to split the string. Usually a string or a Regex + /// + /// behavior: SplitDelimiterBehavior: + /// The behavior to use when splitting. + /// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", + /// "contiguous" + /// + /// Returns: + /// A list of NormalizedString, representing each split + #[text_signature = "(self, pattern, behavior)"] fn split( &mut self, pattern: PyPattern, @@ -272,6 +331,15 @@ impl PyNormalizedString { .collect()) } + /// Replace the content of the given pattern with the provided content + /// + /// Args: + /// pattern: Pattern: + /// A pattern used to match the string. 
Usually a string or a Regex + /// + /// content: str: + /// The content to be used as replacement + #[text_signature = "(self, pattern, content)"] fn replace(&mut self, pattern: PyPattern, content: &str) -> PyResult<()> { ToPyResult(self.normalized.replace(pattern, content)).into() } diff --git a/bindings/python/src/utils/pretokenization.rs b/bindings/python/src/utils/pretokenization.rs index c164fe5b..b4d5a66a 100644 --- a/bindings/python/src/utils/pretokenization.rs +++ b/bindings/python/src/utils/pretokenization.rs @@ -65,6 +65,7 @@ fn tokenize(pretok: &mut PreTokenizedString, func: &PyAny) -> PyResult<()> { } } +/// This is an enum #[derive(Clone)] pub struct PyOffsetReferential(OffsetReferential); impl FromPyObject<'_> for PyOffsetReferential { @@ -131,7 +132,23 @@ fn to_encoding( .into()) } +/// PreTokenizedString +/// +/// Wrapper over a string, that provides a way to normalize, pre-tokenize, tokenize the +/// underlying string, while keeping track of the alignment information (offsets). +/// +/// The PreTokenizedString manages what we call `splits`. Each split represents a substring +/// which is a subpart of the original string, with the relevant offsets and tokens. +/// +/// When calling one of the methods used to modify the PreTokenizedString (namely one of +/// `split`, `normalize` or `tokenize), only the `splits` that don't have any associated +/// tokens will get modified. +/// +/// Args: +/// sequence: str: +/// The string sequence used to initialize this PreTokenizedString #[pyclass(module = "tokenizers", name=PreTokenizedString)] +#[text_signature = "(self, sequence)"] pub struct PyPreTokenizedString { pub(crate) pretok: tk::PreTokenizedString, } @@ -155,27 +172,84 @@ impl PyPreTokenizedString { PreTokenizedString::from(s).into() } + /// Split the PreTokenizedString using the given `func` + /// + /// Args: + /// func: Callable[[index, NormalizedString], List[NormalizedString]]: + /// The function used to split each underlying split. + /// It is expected to return a list of `NormalizedString`, that represent the new + /// splits. If the given `NormalizedString` does not need any splitting, we can + /// just return it directly. + /// In order for the offsets to be tracked accurately, any returned `NormalizedString` + /// should come from calling either `.split` or `.slice` on the received one. + #[text_signature = "(self, func)"] fn split(&mut self, func: &PyAny) -> PyResult<()> { split(&mut self.pretok, func) } + /// Normalize each split of the `PreTokenizedString` using the given `func` + /// + /// Args: + /// func: Callable[[NormalizedString], None]: + /// The function used to normalize each underlying split. This function + /// does not need to return anything, just calling the methods on the provided + /// NormalizedString allow its modification. + #[text_signature = "(self, func)"] fn normalize(&mut self, func: &PyAny) -> PyResult<()> { normalize(&mut self.pretok, func) } + /// Tokenize each split of the `PreTokenizedString` using the given `func` + /// + /// Args: + /// func: Callable[[str], List[Token]]: + /// The function used to tokenize each underlying split. This function must return + /// a list of Token generated from the input str. + #[text_signature = "(self, func)"] fn tokenize(&mut self, func: &PyAny) -> PyResult<()> { tokenize(&mut self.pretok, func) } + /// Return an Encoding generated from this PreTokenizedString + /// + /// Args: + /// type_id: int = 0: + /// The type_id to be used on the generated Encoding. 
+ /// + /// word_idx: Optional[int] = None: + /// An optional word index to be used for each token of this Encoding. If provided, + /// all the word indices in the generated Encoding will use this value, instead + /// of the one automatically tracked during pre-tokenization. + /// + /// Returns: + /// An Encoding #[args(type_id = "0", word_idx = "None")] + #[text_signature = "(self, type_id=0, word_idx=None)"] fn to_encoding(&self, type_id: u32, word_idx: Option) -> PyResult { to_encoding(&self.pretok, type_id, word_idx) } + /// Get the splits currently managed by the PreTokenizedString + /// + /// Args: + /// offset_referential: :obj:`str` + /// Whether the returned splits should have offsets expressed relative + /// to the original string, or the normalized one. choices: "original", "normalized". + /// + /// offset_type: :obj:`str` + /// Whether the returned splits should have offsets expressed in bytes or chars. + /// When slicing an str, we usually want to use chars, which is the default value. + /// Now in some cases it might be interesting to get these offsets expressed in bytes, + /// so it is possible to change this here. + /// choices: "char", "bytes" + /// + /// Returns + /// A list of splits #[args( offset_referential = "PyOffsetReferential(OffsetReferential::Original)", offset_type = "PyOffsetType(OffsetType::Char)" )] + #[text_signature = "(self, offset_referential=\"original\", offset_type=\"char\")"] fn get_splits( &self, offset_referential: PyOffsetReferential, diff --git a/bindings/python/src/utils/regex.rs b/bindings/python/src/utils/regex.rs index 998a9fd9..8170ffc3 100644 --- a/bindings/python/src/utils/regex.rs +++ b/bindings/python/src/utils/regex.rs @@ -2,7 +2,9 @@ use onig::Regex; use pyo3::exceptions; use pyo3::prelude::*; +/// Instantiate a new Regex with the given pattern #[pyclass(module = "tokenizers", name=Regex)] +#[text_signature = "(self, pattern)"] pub struct PyRegex { pub inner: Regex, pub pattern: String, diff --git a/bindings/python/stub.py b/bindings/python/stub.py new file mode 100644 index 00000000..3a5436d9 --- /dev/null +++ b/bindings/python/stub.py @@ -0,0 +1,192 @@ +import inspect +import os +import argparse +import black +from pathlib import Path + +INDENT = " " * 4 +GENERATED_COMMENT = "# Generated content DO NOT EDIT\n" + + +def do_indent(text: str, indent: str): + return text.replace("\n", f"\n{indent}") + + +def function(obj, indent, text_signature=None): + if text_signature is None: + text_signature = obj.__text_signature__ + string = "" + string += f"{indent}def {obj.__name__}{text_signature}:\n" + indent += INDENT + string += f'{indent}"""\n' + string += f"{indent}{do_indent(obj.__doc__, indent)}\n" + string += f'{indent}"""\n' + string += f"{indent}pass\n" + string += "\n" + string += "\n" + return string + + +def member_sort(member): + if inspect.isclass(member): + value = 10 + len(inspect.getmro(member)) + else: + value = 1 + return value + + +def fn_predicate(obj): + value = inspect.ismethoddescriptor(obj) or inspect.isbuiltin(obj) + if value: + return obj.__doc__ and obj.__text_signature__ and not obj.__name__.startswith("_") + if inspect.isgetsetdescriptor(obj): + return obj.__doc__ and not obj.__name__.startswith("_") + return False + + +def get_module_members(module): + members = [ + member + for name, member in inspect.getmembers(module) + if not name.startswith("_") and not inspect.ismodule(member) + ] + members.sort(key=member_sort) + return members + + +def pyi_file(obj, indent=""): + string = "" + if inspect.ismodule(obj): + 
string += GENERATED_COMMENT + members = get_module_members(obj) + for member in members: + string += pyi_file(member, indent) + + elif inspect.isclass(obj): + indent += INDENT + mro = inspect.getmro(obj) + if len(mro) > 2: + inherit = f"({mro[1].__name__})" + else: + inherit = "" + string += f"class {obj.__name__}{inherit}:\n" + + body = "" + if obj.__doc__: + body += f'{indent}"""\n{indent}{do_indent(obj.__doc__, indent)}\n{indent}"""\n' + + fns = inspect.getmembers(obj, fn_predicate) + + # Init + if obj.__text_signature__: + body += f"{indent}def __init__{obj.__text_signature__}:\n" + body += f"{indent+INDENT}pass\n" + body += "\n" + + for (name, fn) in fns: + body += pyi_file(fn, indent=indent) + + if not body: + body += f"{indent}pass\n" + + string += body + string += "\n\n" + + elif inspect.isbuiltin(obj): + string += f"{indent}@staticmethod\n" + string += function(obj, indent) + + elif inspect.ismethoddescriptor(obj): + string += function(obj, indent) + + elif inspect.isgetsetdescriptor(obj): + # TODO it would be interesing to add the setter maybe ? + string += f"{indent}@property\n" + string += function(obj, indent, text_signature="(self)") + else: + raise Exception(f"Object {obj} is not supported") + return string + + +def py_file(module, origin): + members = get_module_members(module) + + string = GENERATED_COMMENT + string += f"from .. import {origin}\n" + string += "\n" + for member in members: + name = member.__name__ + string += f"{name} = {origin}.{name}\n" + return string + + +def do_black(content, is_pyi): + mode = black.Mode( + target_versions={black.TargetVersion.PY35}, + line_length=100, + is_pyi=is_pyi, + string_normalization=True, + experimental_string_processing=False, + ) + try: + return black.format_file_contents(content, fast=True, mode=mode) + except black.NothingChanged: + return content + + +def write(module, directory, origin, check=False): + submodules = [ + (name, member) for name, member in inspect.getmembers(module) if inspect.ismodule(member) + ] + + filename = os.path.join(directory, "__init__.pyi") + pyi_content = pyi_file(module) + pyi_content = do_black(pyi_content, is_pyi=True) + os.makedirs(directory, exist_ok=True) + if check: + with open(filename, "r") as f: + data = f.read() + assert ( + data == pyi_content + ), f"The content of {filename} seems outdated, please run `python stub.py`" + else: + with open(filename, "w") as f: + f.write(pyi_content) + + filename = os.path.join(directory, "__init__.py") + py_content = py_file(module, origin) + py_content = do_black(py_content, is_pyi=False) + os.makedirs(directory, exist_ok=True) + + is_auto = False + if not os.path.exists(filename): + is_auto = True + else: + with open(filename, "r") as f: + line = f.readline() + if line == GENERATED_COMMENT: + is_auto = True + + if is_auto: + if check: + with open(filename, "r") as f: + data = f.read() + assert ( + data == py_content + ), f"The content of {filename} seems outdated, please run `python stub.py`" + else: + with open(filename, "w") as f: + f.write(py_content) + + for name, submodule in submodules: + write(submodule, os.path.join(directory, name), f"{name}", check=check) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--check", action="store_true") + + args = parser.parse_args() + import tokenizers + + write(tokenizers.tokenizers, "py_src/tokenizers/", "tokenizers", check=args.check)
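
# Example invocations, as wired up in the Makefile's `style` / `check-style` targets
# (run from bindings/python/ with the extension built, e.g. via `python setup.py develop`):
#
#   python stub.py          # regenerate the .pyi stubs and the automated __init__.py files
#   python stub.py --check  # only verify that the committed stubs are up to date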