Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Python - Adding tokenizers classes - WIP
@@ -20,7 +20,8 @@ setup(
         "tokenizers.normalizers",
         "tokenizers.pre_tokenizers",
         "tokenizers.processors",
-        "tokenizers.trainers"
+        "tokenizers.trainers",
+        "tokenizers.implementations",
     ],
     package_data = {
         'tokenizers': [ 'py.typed', '__init__.pyi' ],
@@ -30,6 +31,7 @@ setup(
         'tokenizers.pre_tokenizers': [ 'py.typed', '__init__.pyi' ],
         'tokenizers.processors': [ 'py.typed', '__init__.pyi' ],
         'tokenizers.trainers': [ 'py.typed', '__init__.pyi' ],
+        'tokenizers.implementations': [ 'py.typed' ],
     },
     zip_safe=False,
 )
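For context only (not part of the commit): a minimal post-install sanity check, assuming Python 3.9+ and that the wheel was built and installed from bindings/python. It verifies the new pure-Python subpackage is importable and that the package_data entry above ships its PEP 561 py.typed marker.

# Sketch under the assumptions stated above; nothing here is defined by this commit.
import importlib.resources

import tokenizers.implementations

marker = importlib.resources.files("tokenizers.implementations").joinpath("py.typed")
print(marker.is_file())  # expected: True once the packages/package_data entries are in place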
@@ -7,3 +7,4 @@ from .tokenizers import normalizers
 from .tokenizers import pre_tokenizers
 from .tokenizers import processors
 from .tokenizers import trainers
+from .implementations import ByteLevelBpe
@@ -5,6 +5,8 @@ from tokenizers import pre_tokenizers
 from tokenizers import processors
 from tokenizers import trainers
 
+from tokenizers.implementations import ByteLevelBpe
+
 from typing import Optional, Union, List, Tuple
 
 Offsets = Tuple[int, int]
bindings/python/tokenizers/implementations/__init__.py (new file, 2 lines)

from .base_tokenizer import BaseTokenizer
from .byte_level_bpe import ByteLevelBpe
bindings/python/tokenizers/implementations/base_tokenizer.py (new file, 185 lines)

from tokenizers import Tokenizer, Encoding

from typing import Optional, List, Union, Tuple


class BaseTokenizer:
    _tokenizer: Tokenizer

    def __init__(self, tokenizer: Tokenizer):
        self._tokenizer = tokenizer

    def with_padding(self,
                     direction: Optional[str] = "right",
                     pad_id: Optional[int] = 0,
                     pad_type_id: Optional[int] = 0,
                     pad_token: Optional[str] = "[PAD]",
                     max_length: Optional[int] = None):
        """ Change the padding strategy

        Args:
            direction: (`optional`) str:
                Can be one of: `right` or `left`

            pad_id: (`optional`) unsigned int:
                The index to be used when padding

            pad_type_id: (`optional`) unsigned int:
                The type index to be used when padding

            pad_token: (`optional`) str:
                The pad token to be used when padding

            max_length: (`optional`) unsigned int:
                If specified, the length at which to pad. If not specified,
                we pad using the size of the longest sequence in a batch
        """
        return self._tokenizer.with_padding(direction=direction,
                                            pad_id=pad_id,
                                            pad_type_id=pad_type_id,
                                            pad_token=pad_token,
                                            max_length=max_length)

    def without_padding(self):
        """ Disable padding """
        return self._tokenizer.without_padding()

    def with_truncation(self,
                        max_length: int,
                        stride: Optional[int] = None,
                        strategy: Optional[str] = None):
        """ Change the truncation options

        Args:
            max_length: unsigned int:
                The maximum length at which to truncate

            stride: (`optional`) unsigned int:
                The length of the previous first sequence to be included
                in the overflowing sequence

            strategy: (`optional`) str:
                Can be one of `longest_first`, `only_first` or `only_second`
        """
        return self._tokenizer.with_truncation(max_length,
                                               stride=stride,
                                               strategy=strategy)

    def without_truncation(self):
        """ Disable truncation """
        return self._tokenizer.without_truncation()

    def add_tokens(self, tokens: List[Union[str, Tuple[str, bool]]]) -> int:
        """ Add the given tokens to the vocabulary

        Args:
            tokens: List[Union[str, Tuple[str, bool]]]:
                A list of tokens to add to the vocabulary. Each token can either be
                a string, or a tuple with a string representing the token and a boolean
                option indicating whether to match on single words only.
                If the boolean is not included, it defaults to False

        Returns:
            The number of tokens that were added to the vocabulary
        """
        return self._tokenizer.add_tokens(tokens)

    def add_special_tokens(self, special_tokens: List[str]) -> int:
        """ Add the given special tokens to the vocabulary, and treat them as special tokens.

        The special tokens will never be processed by the model, and will be
        removed while decoding.

        Args:
            special_tokens: List[str]:
                The list of special tokens to add

        Returns:
            The number of tokens that were added to the vocabulary
        """
        return self._tokenizer.add_special_tokens(special_tokens)

    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
        """ Encode the given sequence

        Args:
            sequence: str:
                The sequence to encode

            pair: (`optional`) Optional[str]:
                The optional pair sequence

        Returns:
            An Encoding
        """
        return self._tokenizer.encode(sequence, pair)

    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
        """ Encode the given sequences or pairs of sequences

        Args:
            sequences: List[Union[str, Tuple[str, str]]]:
                A list of sequences or pairs of sequences. The list can contain both
                at the same time.

        Returns:
            A list of Encoding
        """
        return self._tokenizer.encode_batch(sequences)

    def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
        """ Decode the given list of ids to a string sequence

        Args:
            ids: List[unsigned int]:
                A list of ids to be decoded

            skip_special_tokens: (`optional`) boolean:
                Whether to remove all the special tokens from the output string

        Returns:
            The decoded string
        """
        return self._tokenizer.decode(ids, skip_special_tokens=skip_special_tokens)

    def decode_batch(self,
                     sequences: List[List[int]],
                     skip_special_tokens: Optional[bool] = True) -> List[str]:
        """ Decode the list of sequences to a list of string sequences

        Args:
            sequences: List[List[unsigned int]]:
                A list of sequences of ids to be decoded

            skip_special_tokens: (`optional`) boolean:
                Whether to remove all the special tokens from the output strings

        Returns:
            A list of decoded strings
        """
        return self._tokenizer.decode_batch(sequences, skip_special_tokens=skip_special_tokens)

    def token_to_id(self, token: str) -> Optional[int]:
        """ Convert the given token to its corresponding id

        Args:
            token: str:
                The token to convert

        Returns:
            The corresponding id if it exists, None otherwise
        """
        return self._tokenizer.token_to_id(token)

    def id_to_token(self, id: int) -> Optional[str]:
        """ Convert the given token id to its corresponding string

        Args:
            id: unsigned int:
                The token id to convert

        Returns:
            The corresponding string if it exists, None otherwise
        """
        return self._tokenizer.id_to_token(id)
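For orientation only (not part of the diff): a minimal sketch of how the wrapper above could be driven, assuming `tok` is an instance of some BaseTokenizer subclass such as the ByteLevelBpe added below. Only methods defined in this file are used.

# Sketch under the assumptions stated above.
tok.with_padding(direction="right", pad_id=0, pad_token="[PAD]", max_length=128)
tok.with_truncation(max_length=128, stride=16, strategy="longest_first")

# Extend the vocabulary; the tuple form marks a token as single-word only.
tok.add_tokens(["new_token", ("only_single_word", True)])
tok.add_special_tokens(["[PAD]"])

# Revert to the default behaviour.
tok.without_truncation()
tok.without_padding()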
bindings/python/tokenizers/implementations/byte_level_bpe.py (new file, 27 lines)

from tokenizers import Tokenizer, pre_tokenizers, decoders
from .base_tokenizer import BaseTokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC

from typing import Optional


class ByteLevelBpe(BaseTokenizer):
    """ ByteLevelBpe

    Represents a Byte-level BPE as introduced by OpenAI with their GPT-2 model
    """

    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 add_prefix_space: bool = False):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel.new()

        super().__init__(tokenizer)
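And a possible end-to-end use of ByteLevelBpe, again as a sketch rather than part of the commit: the vocab/merges file names are placeholders, and the `ids` attribute on the returned Encoding comes from the Rust binding, not from this diff.

from tokenizers.implementations import ByteLevelBpe

# Load a GPT-2 style vocabulary and merge rules (placeholder paths).
tok = ByteLevelBpe("vocab.json", "merges.txt", add_prefix_space=True)

encoding = tok.encode("Hello world")             # single sequence
pair = tok.encode("A question?", "An answer.")   # sequence pair

# Round-trip back to text; assumes Encoding exposes an `ids` attribute.
print(tok.decode(encoding.ids, skip_special_tokens=True))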