From 63063118dfd1d4156c4eca70b4d35430e51ce8d6 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Tue, 7 Jan 2020 16:20:20 -0500
Subject: [PATCH] Python - Adding tokenizers classes - WIP

---
 bindings/python/setup.py                        |   4 +-
 bindings/python/tokenizers/__init__.py          |   1 +
 bindings/python/tokenizers/__init__.pyi         |   2 +
 .../tokenizers/implementations/__init__.py      |   2 +
 .../implementations/base_tokenizer.py           | 185 ++++++++++++++++++
 .../implementations/byte_level_bpe.py           |  27 +++
 6 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 bindings/python/tokenizers/implementations/__init__.py
 create mode 100644 bindings/python/tokenizers/implementations/base_tokenizer.py
 create mode 100644 bindings/python/tokenizers/implementations/byte_level_bpe.py

diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index b86f2f6e..42bf0909 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -20,7 +20,8 @@ setup(
         "tokenizers.normalizers",
         "tokenizers.pre_tokenizers",
         "tokenizers.processors",
-        "tokenizers.trainers"
+        "tokenizers.trainers",
+        "tokenizers.implementations",
     ],
     package_data = {
         'tokenizers': [ 'py.typed', '__init__.pyi' ],
@@ -30,6 +31,7 @@ setup(
         'tokenizers.pre_tokenizers': [ 'py.typed', '__init__.pyi' ],
         'tokenizers.processors': [ 'py.typed', '__init__.pyi' ],
         'tokenizers.trainers': [ 'py.typed', '__init__.pyi' ],
+        'tokenizers.implementations': [ 'py.typed' ],
     },
     zip_safe=False,
 )
diff --git a/bindings/python/tokenizers/__init__.py b/bindings/python/tokenizers/__init__.py
index 044bb1c1..8c94c6f4 100644
--- a/bindings/python/tokenizers/__init__.py
+++ b/bindings/python/tokenizers/__init__.py
@@ -7,3 +7,4 @@ from .tokenizers import normalizers
 from .tokenizers import pre_tokenizers
 from .tokenizers import processors
 from .tokenizers import trainers
+from .implementations import ByteLevelBpe
diff --git a/bindings/python/tokenizers/__init__.pyi b/bindings/python/tokenizers/__init__.pyi
index 1b912632..0dc97d57 100644
--- a/bindings/python/tokenizers/__init__.pyi
+++ b/bindings/python/tokenizers/__init__.pyi
@@ -5,6 +5,8 @@ from tokenizers import pre_tokenizers
 from tokenizers import processors
 from tokenizers import trainers
 
+from tokenizers.implementations import ByteLevelBpe
+
 from typing import Optional, Union, List, Tuple
 
 Offsets = Tuple[int, int]
diff --git a/bindings/python/tokenizers/implementations/__init__.py b/bindings/python/tokenizers/implementations/__init__.py
new file mode 100644
index 00000000..0ae2721d
--- /dev/null
+++ b/bindings/python/tokenizers/implementations/__init__.py
@@ -0,0 +1,2 @@
+from .base_tokenizer import BaseTokenizer
+from .byte_level_bpe import ByteLevelBpe
diff --git a/bindings/python/tokenizers/implementations/base_tokenizer.py b/bindings/python/tokenizers/implementations/base_tokenizer.py
new file mode 100644
index 00000000..8a7d8854
--- /dev/null
+++ b/bindings/python/tokenizers/implementations/base_tokenizer.py
@@ -0,0 +1,185 @@
+from tokenizers import Tokenizer, Encoding
+
+from typing import List, Union, Tuple, Optional
+
+class BaseTokenizer:
+    _tokenizer: Tokenizer
+
+    def __init__(self, tokenizer: Tokenizer):
+        self._tokenizer = tokenizer
+
+    def with_padding(self,
+                     direction: Optional[str] = "right",
+                     pad_id: Optional[int] = 0,
+                     pad_type_id: Optional[int] = 0,
+                     pad_token: Optional[str] = "[PAD]",
+                     max_length: Optional[int] = None):
+        """ Change the padding strategy
+
+        Args:
+            direction: (`optional`) str:
+                Can be one of: `right` or `left`
+
+            pad_id: (`optional`) unsigned int:
+                The id to be used when padding
+
+            pad_type_id: (`optional`) unsigned int:
+                The type id to be used when padding
+
+            pad_token: (`optional`) str:
+                The pad token to be used when padding
+
+            max_length: (`optional`) unsigned int:
+                If specified, the length at which to pad. If not specified,
+                we pad using the size of the longest sequence in a batch
+        """
+        return self._tokenizer.with_padding(direction=direction,
+                                            pad_id=pad_id,
+                                            pad_type_id=pad_type_id,
+                                            pad_token=pad_token,
+                                            max_length=max_length)
+
+    def without_padding(self):
+        """ Disable padding """
+        return self._tokenizer.without_padding()
+
+    def with_truncation(self,
+                        max_length: int,
+                        stride: Optional[int] = None,
+                        strategy: Optional[str] = None):
+        """ Change the truncation options
+
+        Args:
+            max_length: unsigned int:
+                The maximum length at which to truncate
+
+            stride: (`optional`) unsigned int:
+                The length of the previous first sequence to be included
+                in the overflowing sequence
+
+            strategy: (`optional`) str:
+                Can be one of `longest_first`, `only_first` or `only_second`
+        """
+        return self._tokenizer.with_truncation(max_length,
+                                               stride=stride,
+                                               strategy=strategy)
+
+    def without_truncation(self):
+        """ Disable truncation """
+        return self._tokenizer.without_truncation()
+
+    def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
+        """ Add the given tokens to the vocabulary
+
+        Args:
+            tokens: List[Union[str, Tuple[str, bool]]]:
+                A list of tokens to add to the vocabulary. Each token can either be
+                a string, or a tuple with a string representing the token, and a boolean
+                option representing whether to match on single words only.
+                If the boolean is not included, it defaults to False
+
+        Returns:
+            The number of tokens that were added to the vocabulary
+        """
+        return self._tokenizer.add_tokens(tokens)
+
+    def add_special_tokens(self, special_tokens: List[str]) -> int:
+        """ Add the given tokens to the vocabulary as special tokens.
+
+        The special tokens will never be processed by the model, and will be
+        removed while decoding.
+
+        Args:
+            special_tokens: List[str]:
+                The list of special tokens to add
+
+        Returns:
+            The number of tokens that were added to the vocabulary
+        """
+        return self._tokenizer.add_special_tokens(special_tokens)
+
+    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
+        """ Encode the given sequence
+
+        Args:
+            sequence: str:
+                The sequence to encode
+
+            pair: (`optional`) str:
+                The optional pair sequence
+
+        Returns:
+            An Encoding
+        """
+        return self._tokenizer.encode(sequence, pair)
+
+    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
+        """ Encode the given sequences or pairs of sequences
+
+        Args:
+            sequences: List[Union[str, Tuple[str, str]]]:
+                A list of sequences or pairs of sequences. The list can contain both
+                at the same time.
+
+        Returns:
+            A list of Encoding
+        """
+        return self._tokenizer.encode_batch(sequences)
+
+    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
+        """ Decode the given list of ids to a string sequence
+
+        Args:
+            ids: List[unsigned int]:
+                A list of ids to be decoded
+
+            skip_special_tokens: (`optional`) boolean:
+                Whether to remove all the special tokens from the output string
+
+        Returns:
+            The decoded string
+        """
+        return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
+
+    def decode_batch(self,
+                     sequences: List[List[int]],
+                     skip_special_tokens: Optional[bool] = True) -> List[str]:
+        """ Decode the given list of sequences to a list of string sequences
+
+        Args:
+            sequences: List[List[unsigned int]]:
+                A list of sequences of ids to be decoded
+
+            skip_special_tokens: (`optional`) boolean:
+                Whether to remove all the special tokens from the output strings
+
+        Returns:
+            A list of decoded strings
+        """
+        return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
+
+    def token_to_id(self, token: str) -> Optional[int]:
+        """ Convert the given token to its corresponding id
+
+        Args:
+            token: str:
+                The token to convert
+
+        Returns:
+            The corresponding id if it exists, None otherwise
+        """
+        return self._tokenizer.token_to_id(token)
+
+    def id_to_token(self, id: int) -> Optional[str]:
+        """ Convert the given token id to its corresponding string
+
+        Args:
+            id: unsigned int:
+                The token id to convert
+
+        Returns:
+            The corresponding string if it exists, None otherwise
+        """
+        return self._tokenizer.id_to_token(id)
diff --git a/bindings/python/tokenizers/implementations/byte_level_bpe.py b/bindings/python/tokenizers/implementations/byte_level_bpe.py
new file mode 100644
index 00000000..aceee62e
--- /dev/null
+++ b/bindings/python/tokenizers/implementations/byte_level_bpe.py
@@ -0,0 +1,27 @@
+from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC
+from .base_tokenizer import BaseTokenizer
+
+from typing import Optional
+
+class ByteLevelBpe(BaseTokenizer):
+    """ ByteLevelBpe
+
+    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
+    """
+
+    def __init__(self,
+                 vocab_file: Optional[str] = None,
+                 merges_file: Optional[str] = None,
+                 add_prefix_space: bool = False):
+        if vocab_file is not None and merges_file is not None:
+            tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
+        else:
+            tokenizer = Tokenizer(BPE.empty())
+
+        tokenizer.normalizer = NFKC.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
+        tokenizer.decoder = decoders.ByteLevel.new()
+
+        super().__init__(tokenizer)
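
Usage sketch (not part of the patch): a minimal example of how the classes added above are meant to be used once the bindings are built and installed. It only calls methods defined in this patch; the `vocab.json` / `merges.txt` paths and the padding/truncation values are placeholders, not files or settings from this PR.

    from tokenizers import ByteLevelBpe

    # Load a byte-level BPE from existing vocabulary files (placeholder paths),
    # or call ByteLevelBpe() with no arguments to start from an empty model.
    tokenizer = ByteLevelBpe("vocab.json", "merges.txt", add_prefix_space=True)

    # Padding and truncation are configured through the BaseTokenizer wrappers,
    # which forward to the underlying Rust Tokenizer.
    tokenizer.with_padding(pad_token="[PAD]")
    tokenizer.with_truncation(max_length=128)

    # Encode a single sequence, or a batch mixing single sequences and pairs.
    encoding = tokenizer.encode("Hello world")
    batch = tokenizer.encode_batch(["Hello world", ("First sequence", "Second sequence")])

    # Vocabulary lookups also go through the wrapped Tokenizer;
    # token_to_id returns None if the token is not in the vocabulary.
    token_id = tokenizer.token_to_id("Hello")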