Mirror of https://github.com/mii443/tokenizers.git
Add the original BPETokenizer
@@ -7,4 +7,4 @@ from .tokenizers import normalizers
 from .tokenizers import pre_tokenizers
 from .tokenizers import processors
 from .tokenizers import trainers
-from .implementations import ByteLevelBPETokenizer
+from .implementations import ByteLevelBPETokenizer, BPETokenizer
@@ -5,7 +5,7 @@ from tokenizers import pre_tokenizers
 from tokenizers import processors
 from tokenizers import trainers

-from tokenizers.implementations import ByteLevelBPETokenizer
+from tokenizers.implementations import ByteLevelBPETokenizer, BPETokenizer

 from typing import Optional, Union, List, Tuple

@@ -1,2 +1,3 @@
 from .base_tokenizer import BaseTokenizer
 from .byte_level_bpe import ByteLevelBPETokenizer
+from .bpe import BPETokenizer
bindings/python/tokenizers/implementations/bpe.py (new file, 36 lines)

from tokenizers import Tokenizer, pre_tokenizers, decoders
from tokenizers.models import BPE
from tokenizers.normalizers import NFKC, Sequence, Lowercase
from .base_tokenizer import BaseTokenizer

from typing import Optional


class BPETokenizer(BaseTokenizer):
    """ Original BPE Tokenizer

    Represents the BPE algorithm, as introduced by Rico Sennrich (https://arxiv.org/abs/1508.07909)
    """

    def __init__(self,
                 vocab_file: Optional[str]=None,
                 merges_file: Optional[str]=None,
                 unk_token: Optional[str]="<unk>",
                 suffix: Optional[str]="</w>",
                 dropout: Optional[float]=None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(BPE.from_files(vocab_file,
                                                 merges_file,
                                                 dropout=dropout,
                                                 unk_token=unk_token,
                                                 end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = Sequence.new([
            NFKC.new(),
            Lowercase.new()
        ])
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

        super().__init__(tokenizer)
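
For context, a minimal usage sketch of the class this commit adds. This is not part of the commit: the file paths are placeholders, and the encode()/decode() calls and the .tokens/.ids attributes are assumptions based on the BaseTokenizer wrapper of the same era rather than anything shown in the diff.

# Usage sketch (not part of the commit). "vocab.json" and "merges.txt"
# are placeholder paths; encode()/decode() and .tokens/.ids are assumed
# from the surrounding bindings API.
from tokenizers.implementations import BPETokenizer

# Load previously trained BPE vocabulary and merge rules.
tokenizer = BPETokenizer("vocab.json", "merges.txt")

# Input is NFKC-normalized and lowercased, split on whitespace, then
# BPE-encoded; subwords carry the "</w>" end-of-word suffix.
encoded = tokenizer.encode("Hello, world!")
print(encoded.tokens)

# The BPEDecoder strips the suffix when reconstructing text.
print(tokenizer.decode(encoded.ids))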