Python - Adding tokenizers classes - WIP

Anthony MOI
2020-01-07 16:20:20 -05:00
parent 6294d342d5
commit 63063118df
6 changed files with 220 additions and 1 deletion

setup.py

@@ -20,7 +20,8 @@ setup(
"tokenizers.normalizers",
"tokenizers.pre_tokenizers",
"tokenizers.processors",
"tokenizers.trainers"
"tokenizers.trainers",
"tokenizers.implementations",
],
package_data = {
'tokenizers': [ 'py.typed', '__init__.pyi' ],
@@ -30,6 +31,7 @@ setup(
'tokenizers.pre_tokenizers': [ 'py.typed', '__init__.pyi' ],
'tokenizers.processors': [ 'py.typed', '__init__.pyi' ],
'tokenizers.trainers': [ 'py.typed', '__init__.pyi' ],
'tokenizers.implementations': [ 'py.typed' ],
},
zip_safe=False,
)

tokenizers/__init__.py

@@ -7,3 +7,4 @@ from .tokenizers import normalizers
from .tokenizers import pre_tokenizers
from .tokenizers import processors
from .tokenizers import trainers
from .implementations import ByteLevelBpe
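With this re-export in place, the new wrapper is reachable from the top-level package as well as from the subpackage. A minimal sketch, using only the import paths added in this commit:

    # Both paths refer to the same class after this change:
    from tokenizers import ByteLevelBpe
    # from tokenizers.implementations import ByteLevelBpe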

tokenizers/__init__.pyi

@@ -5,6 +5,8 @@ from tokenizers import pre_tokenizers
from tokenizers import processors
from tokenizers import trainers
from tokenizers.implementations import ByteLevelBpe
from typing import Optional, Union, List, Tuple
Offsets = Tuple[int, int]

tokenizers/implementations/__init__.py

@@ -0,0 +1,2 @@
from .base_tokenizer import BaseTokenizer
from .byte_level_bpe import ByteLevelBpe

tokenizers/implementations/base_tokenizer.py

@@ -0,0 +1,185 @@
from tokenizers import Tokenizer, Encoding
from typing import Optional, List, Union, Tuple


class BaseTokenizer:

    _tokenizer: Tokenizer

    def __init__(self, tokenizer: Tokenizer):
        self._tokenizer = tokenizer

    def with_padding(self,
                     direction: Optional[str] = "right",
                     pad_id: Optional[int] = 0,
                     pad_type_id: Optional[int] = 0,
                     pad_token: Optional[str] = "[PAD]",
                     max_length: Optional[int] = None):
        """ Change the padding strategy

        Args:
            direction: (`optional`) str:
                Can be one of: `right` or `left`

            pad_id: (`optional`) unsigned int:
                The id to be used when padding

            pad_type_id: (`optional`) unsigned int:
                The type id to be used when padding

            pad_token: (`optional`) str:
                The pad token to be used when padding

            max_length: (`optional`) unsigned int:
                If specified, the length at which to pad. If not specified,
                we pad using the size of the longest sequence in a batch.
        """
        return self._tokenizer.with_padding(direction=direction,
                                            pad_id=pad_id,
                                            pad_type_id=pad_type_id,
                                            pad_token=pad_token,
                                            max_length=max_length)

    def without_padding(self):
        """ Disable padding """
        return self._tokenizer.without_padding()

    def with_truncation(self,
                        max_length: int,
                        stride: Optional[int] = None,
                        strategy: Optional[str] = None):
        """ Change the truncation options

        Args:
            max_length: unsigned int:
                The maximum length at which to truncate

            stride: (`optional`) unsigned int:
                The length of the previous first sequence to be included
                in the overflowing sequence

            strategy: (`optional`) str:
                Can be one of `longest_first`, `only_first` or `only_second`
        """
        return self._tokenizer.with_truncation(max_length,
                                               stride=stride,
                                               strategy=strategy)

    def without_truncation(self):
        """ Disable truncation """
        return self._tokenizer.without_truncation()

    def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
        """ Add the given tokens to the vocabulary

        Args:
            tokens: List[Union[str, Tuple[str, bool]]]:
                A list of tokens to add to the vocabulary. Each token can either be
                a string, or a tuple with a string representing the token, and a boolean
                option representing whether to match on single words only.
                If the boolean is not included, it defaults to False.

        Returns:
            The number of tokens that were added to the vocabulary
        """
        return self._tokenizer.add_tokens(tokens)

    def add_special_tokens(self, special_tokens: List[str]) -> int:
        """ Add the given special tokens to the vocabulary, and treat them as special tokens.

        The special tokens will never be processed by the model, and will be
        removed while decoding.

        Args:
            special_tokens: List[str]:
                The list of special tokens to add

        Returns:
            The number of tokens that were added to the vocabulary
        """
        return self._tokenizer.add_special_tokens(special_tokens)

    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
        """ Encode the given sequence

        Args:
            sequence: str:
                The sequence to encode

            pair: (`optional`) Optional[str]:
                The optional pair sequence

        Returns:
            An Encoding
        """
        return self._tokenizer.encode(sequence, pair)

    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
        """ Encode the given sequences or pair of sequences

        Args:
            sequences: List[Union[str, Tuple[str, str]]]:
                A list of sequences or pair of sequences. The list can contain both
                at the same time.

        Returns:
            A list of Encoding
        """
        return self._tokenizer.encode_batch(sequences)

    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
        """ Decode the given list of ids to a string sequence

        Args:
            ids: List[unsigned int]:
                A list of ids to be decoded

            skip_special_tokens: (`optional`) boolean:
                Whether to remove all the special tokens from the output string

        Returns:
            The decoded string
        """
        return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)

    def decode_batch(self,
                     sequences: List[List[int]],
                     skip_special_tokens: Optional[bool] = True) -> List[str]:
        """ Decode the list of sequences to a list of string sequences

        Args:
            sequences: List[List[unsigned int]]:
                A list of sequences of ids to be decoded

            skip_special_tokens: (`optional`) boolean:
                Whether to remove all the special tokens from the output strings

        Returns:
            A list of decoded strings
        """
        return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)

    def token_to_id(self, token: str) -> Optional[int]:
        """ Convert the given token to its corresponding id

        Args:
            token: str:
                The token to convert

        Returns:
            The corresponding id if it exists, None otherwise
        """
        return self._tokenizer.token_to_id(token)

    def id_to_token(self, id: int) -> Optional[str]:
        """ Convert the given token id to its corresponding string

        Args:
            id: unsigned int:
                The token id to convert

        Returns:
            The corresponding string if it exists, None otherwise
        """
        return self._tokenizer.id_to_token(id)
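For orientation, a hedged usage sketch of this wrapper, built only from calls that appear in this commit (BPE.empty() is borrowed from byte_level_bpe.py below); it has not been verified against this WIP state, and the token strings are placeholders:

    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.implementations import BaseTokenizer

    # Wrap a bare Tokenizer built on an empty BPE model (placeholder setup)
    base = BaseTokenizer(Tokenizer(BPE.empty()))

    # Tokens may be plain strings or (string, single_word) tuples
    base.add_tokens(["hello", "world", ("single", True)])
    base.add_special_tokens(["[PAD]"])

    # Pad batches to the longest sequence, reusing the id of the token added above
    base.with_padding(pad_token="[PAD]", pad_id=base.token_to_id("[PAD]"))

    # Single sequences and (sequence, pair) tuples can be mixed in one batch
    encodings = base.encode_batch(["hello world", ("hello", "world")])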

tokenizers/implementations/byte_level_bpe.py

@@ -0,0 +1,27 @@
from tokenizers import Tokenizer, pre_tokenizers, decoders
from .base_tokenizer import BaseTokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from typing import Optional


class ByteLevelBpe(BaseTokenizer):
    """ ByteLevelBpe

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    """

    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 add_prefix_space: bool = False):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel.new()

        super().__init__(tokenizer)
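And a hedged instantiation sketch for the class above; "vocab.json" and "merges.txt" are placeholder paths standing in for GPT-2-style vocabulary and merges files:

    from tokenizers.implementations import ByteLevelBpe

    # Load an existing byte-level BPE vocabulary (placeholder file names)
    bpe = ByteLevelBpe("vocab.json", "merges.txt", add_prefix_space=True)

    # encode_batch accepts both single sequences and (sequence, pair) tuples
    encodings = bpe.encode_batch(["Hello world", ("A first sequence", "And its pair")])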