mirror of https://github.com/mii443/tokenizers.git
Python - Update docs and implementations
@@ -121,7 +121,7 @@ you need together:
 #### Use a pre-trained tokenizer

 ```python
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers

 # Load a BPE Model
 vocab = "./path/to/vocab.json"
@@ -132,7 +132,8 @@ bpe = models.BPE.from_files(vocab, merges)
 tokenizer = Tokenizer(bpe)

 # Customize pre-tokenization and decoding
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
 tokenizer.decoder = decoders.ByteLevel()

 # And then encode:
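For context, a minimal sketch of the encode step this README section continues with; the sample sentence and printed attributes are illustrative assumptions, not part of this commit:

```python
# Sketch (assumed usage, not taken from the patch): encode a sentence with the
# tokenizer configured above and inspect the resulting tokens and ids.
encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded.tokens)  # byte-level tokens produced by the BPE model
print(encoded.ids)     # corresponding vocabulary ids
```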
@@ -157,7 +158,8 @@ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
 tokenizer = Tokenizer(models.BPE.empty())

 # Customize pre-tokenization and decoding
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
 tokenizer.decoder = decoders.ByteLevel()

 # And then train
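And a hedged sketch of how the training step is typically completed in these bindings, assuming the trainer-first `Tokenizer.train` signature; the trainer settings and dataset paths are placeholders, not part of this commit:

```python
# Sketch (assumed continuation, not taken from the patch): configure a BPE
# trainer and train the tokenizer on a few plain-text files.
trainer = trainers.BpeTrainer(vocab_size=25000, min_frequency=2)
tokenizer.train(trainer, [
    "./path/to/dataset/1.txt",
    "./path/to/dataset/2.txt",
])
```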
@@ -60,8 +60,10 @@ if args.type == "gpt2":

     # Create a Tokenizer using BPE
     tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
+    # Use ByteLevel Normalizer
+    tok_r.normalizer = normalizers.ByteLevel(add_prefix_space=False)
     # Use ByteLevel PreTokenizer
-    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel()
     # Use ByteLevel Decoder
     tok_r.decoder = decoders.ByteLevel()
 elif args.type == "bert":
@@ -1,4 +1,4 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers, normalizers
 from tokenizers.models import BPE
 from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
 from .base_tokenizer import BaseTokenizer
@@ -36,9 +36,9 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(BPE.empty())

-        # Check for Unicode normalization first (before everything else)
-        normalizers = []
+        normalizers = [normalizers.ByteLevel(add_prefix_space=add_prefix_space)]

+        # Check for Unicode normalization first (before everything else)
         if unicode_normalizer:
             normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

@@ -52,7 +52,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
             else:
                 tokenizer.normalizer = normalizers[0]

-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
         tokenizer.decoder = decoders.ByteLevel()

         parameters = {
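Read together, the last three hunks suggest the normalizer wiring in `ByteLevelBPETokenizer.__init__` now behaves roughly as sketched below. This is a reconstruction under assumptions: the `lowercase` branch and the `Sequence` combination are inferred from the imports rather than shown in the hunks, and the local list is renamed `norm_list` here to keep the sketch self-contained (the patch itself reuses the name `normalizers`):

```python
from tokenizers import Tokenizer, pre_tokenizers, decoders, normalizers
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence

def build_byte_level_tokenizer(add_prefix_space=False, lowercase=False, unicode_normalizer=None):
    """Sketch of the patched wiring (assumed), following the API used in the hunks."""
    tokenizer = Tokenizer(BPE.empty())

    # The ByteLevel normalizer now carries add_prefix_space
    norm_list = [normalizers.ByteLevel(add_prefix_space=add_prefix_space)]

    # Check for Unicode normalization first (before everything else)
    if unicode_normalizer:
        norm_list += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:  # assumed branch, implied by the Lowercase import
        norm_list += [Lowercase()]

    # Combine several normalizers into a Sequence, or attach the single one directly
    tokenizer.normalizer = Sequence(norm_list) if len(norm_list) > 1 else norm_list[0]

    # The pre-tokenizer no longer takes add_prefix_space; the decoder stays ByteLevel
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    return tokenizer
```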