Add the original BPETokenizer

Anthony MOI
2020-01-07 19:58:48 -05:00
parent 243a45af40
commit ee115df65e
4 changed files with 39 additions and 2 deletions

View File

@@ -7,4 +7,4 @@ from .tokenizers import normalizers
 from .tokenizers import pre_tokenizers
 from .tokenizers import processors
 from .tokenizers import trainers
-from .implementations import ByteLevelBPETokenizer
+from .implementations import ByteLevelBPETokenizer, BPETokenizer
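With this hunk the new wrapper is re-exported from the package root next to ByteLevelBPETokenizer. A quick smoke test (hypothetical, not part of the commit; it assumes the Python bindings are built and installed):

    # Both high-level implementations should now import from the package root
    from tokenizers import ByteLevelBPETokenizer, BPETokenizer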

View File

@@ -5,7 +5,7 @@ from tokenizers import pre_tokenizers
 from tokenizers import processors
 from tokenizers import trainers
-from tokenizers.implementations import ByteLevelBPETokenizer
+from tokenizers.implementations import ByteLevelBPETokenizer, BPETokenizer
 from typing import Optional, Union, List, Tuple

View File

@@ -1,2 +1,3 @@
 from .base_tokenizer import BaseTokenizer
 from .byte_level_bpe import ByteLevelBPETokenizer
+from .bpe import BPETokenizer

View File

@@ -0,0 +1,36 @@
+from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC, Sequence, Lowercase
+from .base_tokenizer import BaseTokenizer
+
+from typing import Optional
+
+
+class BPETokenizer(BaseTokenizer):
+    """ Original BPE Tokenizer
+    Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
+    """
+
+    def __init__(self,
+                 vocab_file: Optional[str]=None,
+                 merges_file: Optional[str]=None,
+                 unk_token: Optional[str]="<unk>",
+                 suffix: Optional[str]="</w>",
+                 dropout: Optional[float]=None):
+        if vocab_file is not None and merges_file is not None:
+            tokenizer = Tokenizer(BPE.from_files(vocab_file,
+                                                 merges_file,
+                                                 dropout=dropout,
+                                                 unk_token=unk_token,
+                                                 end_of_word_suffix=suffix))
+        else:
+            tokenizer = Tokenizer(BPE.empty())
+
+        tokenizer.normalizer = Sequence.new([
+            NFKC.new(),
+            Lowercase.new()
+        ])
+        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
+        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
+
+        super().__init__(tokenizer)
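For context, here is a minimal usage sketch of the class added above. It is an assumption-laden illustration, not part of the commit: it presumes the era's BaseTokenizer forwards encode/decode to the underlying Tokenizer and that encode returns an Encoding exposing .tokens and .ids; the vocab.json and merges.txt paths and the printed tokens are placeholders.

    # Hypothetical usage; file paths and printed tokens are illustrative only
    from tokenizers import BPETokenizer

    # Load a trained BPE vocabulary and its merge rules
    tokenizer = BPETokenizer("./vocab.json", "./merges.txt")

    # Input is NFKC-normalized and lowercased, then split on whitespace,
    # so cased input still maps onto the lowercase merge vocabulary
    encoding = tokenizer.encode("Hello, how are you?")
    print(encoding.tokens)  # e.g. ['hello</w>', ',</w>', 'how</w>', ...]

    # BPEDecoder strips the </w> end-of-word suffix when rebuilding text
    print(tokenizer.decode(encoding.ids))

When constructed without files, the wrapper falls back to BPE.empty(), leaving an untrained model that must be trained before use. Design-wise, the class bakes the Sennrich-style conventions (NFKC + lowercasing, whitespace pre-tokenization, the </w> end-of-word suffix) into a single ready-made pipeline, mirroring what ByteLevelBPETokenizer already does for byte-level BPE.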