Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Add SentencePieceBPETokenizer
@@ -7,4 +7,4 @@ from .tokenizers import normalizers
 from .tokenizers import pre_tokenizers
 from .tokenizers import processors
 from .tokenizers import trainers
-from .implementations import ByteLevelBPETokenizer, BPETokenizer
+from .implementations import ByteLevelBPETokenizer, BPETokenizer, SentencePieceBPETokenizer
@@ -5,7 +5,7 @@ from tokenizers import pre_tokenizers
 from tokenizers import processors
 from tokenizers import trainers
 
-from tokenizers.implementations import ByteLevelBPETokenizer, BPETokenizer
+from tokenizers.implementations import ByteLevelBPETokenizer, BPETokenizer, SentencePieceBPETokenizer
 
 from typing import Optional, Union, List, Tuple
 
@@ -1,3 +1,4 @@
 from .base_tokenizer import BaseTokenizer
 from .byte_level_bpe import ByteLevelBPETokenizer
 from .bpe import BPETokenizer
+from .sentencepiece_bpe import SentencePieceBPETokenizer
@@ -0,0 +1,35 @@
+from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC
+from .base_tokenizer import BaseTokenizer
+
+from typing import Optional
+
+class SentencePieceBPETokenizer(BaseTokenizer):
+    """ SentencePiece BPE Tokenizer
+
+    Represents the BPE algorithm, with the pretokenization used by SentencePiece
+    """
+
+    def __init__(self,
+                 vocab_file: Optional[str]=None,
+                 merges_file: Optional[str]=None,
+                 unk_token: str="<unk>",
+                 replacement: str="▁",
+                 add_prefix_space: bool=True,
+                 dropout: Optional[float]=None):
+        if vocab_file is not None and merges_file is not None:
+            tokenizer = Tokenizer(BPE.from_files(vocab_file,
+                                                 merges_file,
+                                                 dropout=dropout,
+                                                 unk_token=unk_token))
+        else:
+            tokenizer = Tokenizer(BPE.empty())
+
+        tokenizer.normalizer = NFKC.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(replacement=replacement,
+                                                               add_prefix_space=add_prefix_space)
+        tokenizer.decoder = decoders.Metaspace.new(replacement=replacement,
+                                                   add_prefix_space=add_prefix_space)
+
+        super().__init__(tokenizer)
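
For context, a minimal usage sketch of the new class (not part of the commit): it assumes a pre-trained BPE vocab/merges pair already on disk (the file names below are placeholders) and the encode/decode helpers that BaseTokenizer exposes in the Python bindings of this era.

from tokenizers import SentencePieceBPETokenizer

# Placeholder paths: any vocab.json / merges.txt pair produced by a BPE trainer.
tokenizer = SentencePieceBPETokenizer("vocab.json", "merges.txt")

# The Metaspace pre-tokenizer replaces spaces with "▁" (prepending one by default),
# so whitespace information survives the round trip through the Metaspace decoder.
output = tokenizer.encode("Hello, world!")
print(output.tokens)  # e.g. ['▁Hello', ',', '▁world', '!'] depending on the learned merges
print(output.ids)
print(tokenizer.decode(output.ids))  # "▁" markers are mapped back to spaces

Constructing the class without files falls back to BPE.empty(), which is the branch a training script would take before fitting the model on a corpus.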