From 63063118dfd1d4156c4eca70b4d35430e51ce8d6 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Tue, 7 Jan 2020 16:20:20 -0500
Subject: [PATCH] Python - Adding tokenizers classes - WIP

---
 bindings/python/setup.py                        |   4 +-
 bindings/python/tokenizers/__init__.py          |   1 +
 bindings/python/tokenizers/__init__.pyi         |   2 +
 .../tokenizers/implementations/__init__.py      |   2 +
 .../implementations/base_tokenizer.py           | 185 ++++++++++++++++++
 .../implementations/byte_level_bpe.py           |  27 +++
 6 files changed, 220 insertions(+), 1 deletion(-)
 create mode 100644 bindings/python/tokenizers/implementations/__init__.py
 create mode 100644 bindings/python/tokenizers/implementations/base_tokenizer.py
 create mode 100644 bindings/python/tokenizers/implementations/byte_level_bpe.py

diff --git a/bindings/python/setup.py b/bindings/python/setup.py
index b86f2f6e..42bf0909 100644
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@@ -20,7 +20,8 @@ setup(
         "tokenizers.normalizers",
         "tokenizers.pre_tokenizers",
         "tokenizers.processors",
-        "tokenizers.trainers"
+        "tokenizers.trainers",
+        "tokenizers.implementations",
     ],
     package_data = {
         'tokenizers': [ 'py.typed', '__init__.pyi' ],
@@ -30,6 +31,7 @@ setup(
         'tokenizers.pre_tokenizers': [ 'py.typed', '__init__.pyi' ],
         'tokenizers.processors': [ 'py.typed', '__init__.pyi' ],
         'tokenizers.trainers': [ 'py.typed', '__init__.pyi' ],
+        'tokenizers.implementations': [ 'py.typed' ],
     },
     zip_safe=False,
 )
diff --git a/bindings/python/tokenizers/__init__.py b/bindings/python/tokenizers/__init__.py
index 044bb1c1..8c94c6f4 100644
--- a/bindings/python/tokenizers/__init__.py
+++ b/bindings/python/tokenizers/__init__.py
@@ -7,3 +7,4 @@ from .tokenizers import normalizers
 from .tokenizers import pre_tokenizers
 from .tokenizers import processors
 from .tokenizers import trainers
+from .implementations import ByteLevelBpe
diff --git a/bindings/python/tokenizers/__init__.pyi b/bindings/python/tokenizers/__init__.pyi
index 1b912632..0dc97d57 100644
--- a/bindings/python/tokenizers/__init__.pyi
+++ b/bindings/python/tokenizers/__init__.pyi
@@ -5,6 +5,8 @@ from tokenizers import pre_tokenizers
 from tokenizers import processors
 from tokenizers import trainers
 
+from tokenizers.implementations import ByteLevelBpe
+
 from typing import Optional, Union, List, Tuple
 
 Offsets = Tuple[int, int]
diff --git a/bindings/python/tokenizers/implementations/__init__.py b/bindings/python/tokenizers/implementations/__init__.py
new file mode 100644
index 00000000..0ae2721d
--- /dev/null
+++ b/bindings/python/tokenizers/implementations/__init__.py
@@ -0,0 +1,2 @@
+from .base_tokenizer import BaseTokenizer
+from .byte_level_bpe import ByteLevelBpe
diff --git a/bindings/python/tokenizers/implementations/base_tokenizer.py b/bindings/python/tokenizers/implementations/base_tokenizer.py
new file mode 100644
index 00000000..8a7d8854
--- /dev/null
+++ b/bindings/python/tokenizers/implementations/base_tokenizer.py
@@ -0,0 +1,185 @@
+from tokenizers import Tokenizer, Encoding
+
+from typing import List, Union, Tuple, Optional
+
+class BaseTokenizer:
+    _tokenizer: Tokenizer
+
+    def __init__(self, tokenizer: Tokenizer):
+        self._tokenizer = tokenizer
+
+    def with_padding(self,
+                     direction: Optional[str] = "right",
+                     pad_id: Optional[int] = 0,
+                     pad_type_id: Optional[int] = 0,
+                     pad_token: Optional[str] = "[PAD]",
+                     max_length: Optional[int] = None):
+        """ Change the padding strategy
+
+        Args:
+            direction: (`optional`) str:
+                Can be one of: `right` or `left`
+
+            pad_id: (`optional`) unsigned int:
+                The id to be used when padding
+
+            pad_type_id: (`optional`) unsigned int:
+                The type id to be used when padding
+
+            pad_token: (`optional`) str:
+                The pad token to be used when padding
+
+            max_length: (`optional`) unsigned int:
+                If specified, the length at which to pad. If not specified,
+                we pad using the size of the longest sequence in a batch
+        """
+        return self._tokenizer.with_padding(direction=direction,
+                                            pad_id=pad_id,
+                                            pad_type_id=pad_type_id,
+                                            pad_token=pad_token,
+                                            max_length=max_length)
+
+    def without_padding(self):
+        """ Disable padding """
+        return self._tokenizer.without_padding()
+
+    def with_truncation(self,
+                        max_length: int,
+                        stride: Optional[int] = None,
+                        strategy: Optional[str] = None):
+        """ Change the truncation options
+
+        Args:
+            max_length: unsigned int:
+                The maximum length at which to truncate
+
+            stride: (`optional`) unsigned int:
+                The length of the previous first sequence to be included
+                in the overflowing sequence
+
+            strategy: (`optional`) str:
+                Can be one of `longest_first`, `only_first` or `only_second`
+        """
+        return self._tokenizer.with_truncation(max_length,
+                                               stride=stride,
+                                               strategy=strategy)
+
+    def without_truncation(self):
+        """ Disable truncation """
+        return self._tokenizer.without_truncation()
+
+    def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
+        """ Add the given tokens to the vocabulary
+
+        Args:
+            tokens: List[Union[str, Tuple[str, bool]]]:
+                A list of tokens to add to the vocabulary. Each token can either be
+                a string, or a tuple with a string representing the token, and a boolean
+                option representing whether to match on single words only.
+                If the boolean is not included, it defaults to False
+
+        Returns:
+            The number of tokens that were added to the vocabulary
+        """
+        return self._tokenizer.add_tokens(tokens)
+
+    def add_special_tokens(self, special_tokens: List[str]) -> int:
+        """ Add the given tokens to the vocabulary as special tokens.
+
+        The special tokens will never be processed by the model, and will be
+        removed while decoding.
+
+        Args:
+            special_tokens: List[str]:
+                The list of special tokens to add
+
+        Returns:
+            The number of tokens that were added to the vocabulary
+        """
+        return self._tokenizer.add_special_tokens(special_tokens)
+
+    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
+        """ Encode the given sequence
+
+        Args:
+            sequence: str:
+                The sequence to encode
+
+            pair: (`optional`) str:
+                The optional pair sequence
+
+        Returns:
+            An Encoding
+        """
+        return self._tokenizer.encode(sequence, pair)
+
+    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
+        """ Encode the given sequences or pairs of sequences
+
+        Args:
+            sequences: List[Union[str, Tuple[str, str]]]:
+                A list of sequences or pairs of sequences. The list can contain both
+                at the same time.
+
+        Returns:
+            A list of Encoding
+        """
+        return self._tokenizer.encode_batch(sequences)
+
+    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
+        """ Decode the given list of ids to a string sequence
+
+        Args:
+            ids: List[unsigned int]:
+                A list of ids to be decoded
+
+            skip_special_tokens: (`optional`) boolean:
+                Whether to remove all the special tokens from the output string
+
+        Returns:
+            The decoded string
+        """
+        return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)
+
+    def decode_batch(self,
+                     sequences: List[List[int]],
+                     skip_special_tokens: Optional[bool] = True) -> List[str]:
+        """ Decode the given list of sequences to a list of string sequences
+
+        Args:
+            sequences: List[List[unsigned int]]:
+                A list of sequences of ids to be decoded
+
+            skip_special_tokens: (`optional`) boolean:
+                Whether to remove all the special tokens from the output strings
+
+        Returns:
+            A list of decoded strings
+        """
+        return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)
+
+    def token_to_id(self, token: str) -> Optional[int]:
+        """ Convert the given token to its corresponding id
+
+        Args:
+            token: str:
+                The token to convert
+
+        Returns:
+            The corresponding id if it exists, None otherwise
+        """
+        return self._tokenizer.token_to_id(token)
+
+    def id_to_token(self, id: int) -> Optional[str]:
+        """ Convert the given token id to its corresponding string
+
+        Args:
+            id: unsigned int:
+                The token id to convert
+
+        Returns:
+            The corresponding string if it exists, None otherwise
+        """
+        return self._tokenizer.id_to_token(id)
diff --git a/bindings/python/tokenizers/implementations/byte_level_bpe.py b/bindings/python/tokenizers/implementations/byte_level_bpe.py
new file mode 100644
index 00000000..aceee62e
--- /dev/null
+++ b/bindings/python/tokenizers/implementations/byte_level_bpe.py
@@ -0,0 +1,27 @@
+from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC
+from .base_tokenizer import BaseTokenizer
+
+from typing import Optional
+
+class ByteLevelBpe(BaseTokenizer):
+    """ ByteLevelBpe
+
+    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
+    """
+
+    def __init__(self,
+                 vocab_file: Optional[str] = None,
+                 merges_file: Optional[str] = None,
+                 add_prefix_space: bool = False):
+        if vocab_file is not None and merges_file is not None:
+            tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
+        else:
+            tokenizer = Tokenizer(BPE.empty())
+
+        tokenizer.normalizer = NFKC.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
+        tokenizer.decoder = decoders.ByteLevel.new()
+
+        super().__init__(tokenizer)
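
Usage sketch (not part of the patch): a minimal example of how the classes added above are meant to be used once the bindings are built and installed. It only calls methods defined in this patch; the `vocab.json` / `merges.txt` paths and the padding/truncation values are placeholders, not files or settings from this PR.

    from tokenizers import ByteLevelBpe

    # Load a byte-level BPE from existing vocabulary files (placeholder paths),
    # or call ByteLevelBpe() with no arguments to start from an empty model.
    tokenizer = ByteLevelBpe("vocab.json", "merges.txt", add_prefix_space=True)

    # Padding and truncation are configured through the BaseTokenizer wrappers,
    # which forward to the underlying Rust Tokenizer.
    tokenizer.with_padding(pad_token="[PAD]")
    tokenizer.with_truncation(max_length=128)

    # Encode a single sequence, or a batch mixing single sequences and pairs.
    encoding = tokenizer.encode("Hello world")
    batch = tokenizer.encode_batch(["Hello world", ("First sequence", "Second sequence")])

    # Vocabulary lookups also go through the wrapped Tokenizer;
    # token_to_id returns None if the token is not in the vocabulary.
    token_id = tokenizer.token_to_id("Hello")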