Add the original BPETokenizer

Anthony MOI
2020-01-07 19:58:48 -05:00
parent 243a45af40
commit ee115df65e
4 changed files with 39 additions and 2 deletions


@@ -7,4 +7,4 @@ from .tokenizers import normalizers
 from .tokenizers import pre_tokenizers
 from .tokenizers import processors
 from .tokenizers import trainers
-from .implementations import ByteLevelBPETokenizer
+from .implementations import ByteLevelBPETokenizer, BPETokenizer


@@ -5,7 +5,7 @@ from tokenizers import pre_tokenizers
 from tokenizers import processors
 from tokenizers import trainers
-from tokenizers.implementations import ByteLevelBPETokenizer
+from tokenizers.implementations import ByteLevelBPETokenizer, BPETokenizer
 from typing import Optional, Union, List, Tuple


@@ -1,2 +1,3 @@
 from .base_tokenizer import BaseTokenizer
 from .byte_level_bpe import ByteLevelBPETokenizer
+from .bpe import BPETokenizer


@@ -0,0 +1,36 @@
+from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC, Sequence, Lowercase
+from .base_tokenizer import BaseTokenizer
+
+from typing import Optional
+
+class BPETokenizer(BaseTokenizer):
+    """ Original BPE Tokenizer
+
+    Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
+    """
+
+    def __init__(self,
+                 vocab_file: Optional[str]=None,
+                 merges_file: Optional[str]=None,
+                 unk_token: Optional[str]="<unk>",
+                 suffix: Optional[str]="</w>",
+                 dropout: Optional[float]=None):
+        if vocab_file is not None and merges_file is not None:
+            tokenizer = Tokenizer(BPE.from_files(vocab_file,
+                                                 merges_file,
+                                                 dropout=dropout,
+                                                 unk_token=unk_token,
+                                                 end_of_word_suffix=suffix))
+        else:
+            tokenizer = Tokenizer(BPE.empty())
+
+        tokenizer.normalizer = Sequence.new([
+            NFKC.new(),
+            Lowercase.new()
+        ])
+        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
+        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
+
+        super().__init__(tokenizer)
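
For reference, a minimal usage sketch of the class this commit adds. The vocab/merges file paths are hypothetical, and it assumes the encode/decode helpers that BaseTokenizer forwarded to the underlying Tokenizer in the bindings of this era, with encode returning an Encoding that carries tokens and ids; exact signatures may differ across versions.

    from tokenizers.implementations import BPETokenizer

    # Load a trained BPE model from hypothetical vocab/merges files.
    tokenizer = BPETokenizer("vocab.json", "merges.txt")

    # Input is NFKC-normalized and lowercased, split on whitespace,
    # then BPE-merged; subwords carry the "</w>" end-of-word suffix.
    encoding = tokenizer.encode("Hello, world!")
    print(encoding.tokens)

    # The BPEDecoder strips the suffix when mapping ids back to text.
    print(tokenizer.decode(encoding.ids))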