Add SentencePieceBPETokenizer

Anthony MOI
2020-01-07 20:30:15 -05:00
parent ee115df65e
commit 05f683ce23
4 changed files with 38 additions and 2 deletions


@@ -7,4 +7,4 @@ from .tokenizers import normalizers
from .tokenizers import pre_tokenizers
from .tokenizers import processors
from .tokenizers import trainers
-from .implementations import ByteLevelBPETokenizer, BPETokenizer
+from .implementations import ByteLevelBPETokenizer, BPETokenizer, SentencePieceBPETokenizer


@@ -5,7 +5,7 @@ from tokenizers import pre_tokenizers
from tokenizers import processors
from tokenizers import trainers
-from tokenizers.implementations import ByteLevelBPETokenizer, BPETokenizer
+from tokenizers.implementations import ByteLevelBPETokenizer, BPETokenizer, SentencePieceBPETokenizer
from typing import Optional, Union, List, Tuple


@@ -1,3 +1,4 @@
from .base_tokenizer import BaseTokenizer
from .byte_level_bpe import ByteLevelBPETokenizer
from .bpe import BPETokenizer
+from .sentencepiece_bpe import SentencePieceBPETokenizer


@@ -0,0 +1,35 @@
from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC
from .base_tokenizer import BaseTokenizer

from typing import Optional


class SentencePieceBPETokenizer(BaseTokenizer):
    """ SentencePiece BPE Tokenizer

    Represents the BPE algorithm, with the pretokenization used by SentencePiece
    """

    def __init__(self,
                 vocab_file: Optional[str]=None,
                 merges_file: Optional[str]=None,
                 unk_token: str="<unk>",
                 replacement: str="▁",
                 add_prefix_space: bool=True,
                 dropout: Optional[float]=None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(BPE.from_files(vocab_file,
                                                 merges_file,
                                                 dropout=dropout,
                                                 unk_token=unk_token))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(replacement=replacement,
                                                               add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.Metaspace.new(replacement=replacement,
                                                   add_prefix_space=add_prefix_space)

        super().__init__(tokenizer)
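
For context, the new class wires a BPE model to NFKC normalization and to the Metaspace pre-tokenizer and decoder, which replace whitespace with the ▁ marker the way SentencePiece does. Below is a minimal usage sketch, not part of this commit: it assumes the top-level import added above, that BaseTokenizer forwards encode() to the wrapped Tokenizer, and placeholder vocab/merges paths.

    # Hypothetical usage sketch: the file paths and the encode()/tokens
    # accessors are assumptions about the surrounding API, not part of this diff.
    from tokenizers import SentencePieceBPETokenizer

    # Load a previously trained BPE vocabulary and merge list.
    tokenizer = SentencePieceBPETokenizer(
        vocab_file="vocab.json",    # placeholder path
        merges_file="merges.txt",   # placeholder path
        unk_token="<unk>",
    )

    # Whitespace becomes the ▁ marker before BPE runs, so tokens keep
    # word-boundary information just like SentencePiece pieces.
    encoding = tokenizer.encode("Hello world")
    print(encoding.tokens)

Because the same replacement character drives both the Metaspace pre-tokenizer and the Metaspace decoder, spacing can be reconstructed when decoding, which is the point of mirroring SentencePiece's pretokenization here.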