Mirror of https://github.com/mii443/tokenizers.git
Add the original BPETokenizer
@@ -7,4 +7,4 @@ from .tokenizers import normalizers
 from .tokenizers import pre_tokenizers
 from .tokenizers import processors
 from .tokenizers import trainers
-from .implementations import ByteLevelBPETokenizer
+from .implementations import ByteLevelBPETokenizer, BPETokenizer
@@ -5,7 +5,7 @@ from tokenizers import pre_tokenizers
 from tokenizers import processors
 from tokenizers import trainers

-from tokenizers.implementations import ByteLevelBPETokenizer
+from tokenizers.implementations import ByteLevelBPETokenizer, BPETokenizer

 from typing import Optional, Union, List, Tuple

@@ -1,2 +1,3 @@
 from .base_tokenizer import BaseTokenizer
 from .byte_level_bpe import ByteLevelBPETokenizer
+from .bpe import BPETokenizer
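With the exports above in place, the new wrapper should be reachable from both the package root and the implementations module. A minimal import sketch (not part of the diff), assuming a build of this branch is installed:

# Import sketch (not part of the commit); assumes a build of this branch is installed.
from tokenizers import BPETokenizer                    # via the package-root re-export
# from tokenizers.implementations import BPETokenizer  # equivalent direct import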
bindings/python/tokenizers/implementations/bpe.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+from tokenizers import Tokenizer, pre_tokenizers, decoders
+from tokenizers.models import BPE
+from tokenizers.normalizers import NFKC, Sequence, Lowercase
+from .base_tokenizer import BaseTokenizer
+
+from typing import Optional
+
+class BPETokenizer(BaseTokenizer):
+    """ Original BPE Tokenizer
+
+    Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
+    """
+
+    def __init__(self,
+                 vocab_file: Optional[str]=None,
+                 merges_file: Optional[str]=None,
+                 unk_token: Optional[str]="<unk>",
+                 suffix: Optional[str]="</w>",
+                 dropout: Optional[float]=None):
+        if vocab_file is not None and merges_file is not None:
+            tokenizer = Tokenizer(BPE.from_files(vocab_file,
+                                                 merges_file,
+                                                 dropout=dropout,
+                                                 unk_token=unk_token,
+                                                 end_of_word_suffix=suffix))
+        else:
+            tokenizer = Tokenizer(BPE.empty())
+
+        tokenizer.normalizer = Sequence.new([
+            NFKC.new(),
+            Lowercase.new()
+        ])
+        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
+        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
+
+        super().__init__(tokenizer)
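A usage sketch for the new wrapper (not part of the commit). The constructor arguments come from the diff above; the file paths are placeholders, and the `encode` call and the `tokens` attribute are assumed to come from `BaseTokenizer` and the returned encoding, neither of which is shown in this diff:

# Usage sketch under the assumptions stated above.
from tokenizers import BPETokenizer

# Hypothetical paths to an existing BPE vocabulary and merges file.
tokenizer = BPETokenizer(
    vocab_file="path/to/vocab.json",
    merges_file="path/to/merges.txt",
    unk_token="<unk>",
    suffix="</w>",
    dropout=None,
)

# The wrapper NFKC-normalizes and lowercases its input, splits on whitespace,
# then applies BPE; the decoder strips the "</w>" end-of-word suffix again.
encoding = tokenizer.encode("Hello, how are you?")  # assumed BaseTokenizer method
print(encoding.tokens)                              # assumed Encoding attribute

Note that because the normalizer chain includes Lowercase, this tokenizer is case-insensitive by construction.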