Python - Update docs and implementations

Anthony MOI
2020-03-04 16:47:05 -05:00
parent 2393506dc7
commit f1460fadb9
3 changed files with 12 additions and 8 deletions

View File

@@ -121,7 +121,7 @@ you need together:
 #### Use a pre-trained tokenizer
 ```python
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers
 # Load a BPE Model
 vocab = "./path/to/vocab.json"
@@ -132,7 +132,8 @@ bpe = models.BPE.from_files(vocab, merges)
 tokenizer = Tokenizer(bpe)
 # Customize pre-tokenization and decoding
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
 tokenizer.decoder = decoders.ByteLevel()
 # And then encode:
@@ -157,7 +158,8 @@ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
 tokenizer = Tokenizer(models.BPE.empty())
 # Customize pre-tokenization and decoding
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
 tokenizer.decoder = decoders.ByteLevel()
 # And then train

View File

@@ -60,8 +60,10 @@ if args.type == "gpt2":
     # Create a Tokenizer using BPE
     tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
+    # Use ByteLevel Normalizer
+    tok_r.normalizer = normalizers.ByteLevel(add_prefix_space=False)
     # Use ByteLevel PreTokenizer
-    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
+    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel()
     # Use ByteLevel Decoder
     tok_r.decoder = decoders.ByteLevel()
 elif args.type == "bert":
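A hedged sketch of the updated GPT-2 branch of this example script, with hypothetical file names standing in for the `args.vocab` / `args.merges` values:

```python
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import BPE

# Stand-ins for args.vocab / args.merges from the example script
vocab, merges = "gpt2-vocab.json", "gpt2-merges.txt"

# Create a Tokenizer using BPE
tok_r = Tokenizer(BPE.from_files(vocab, merges))
# GPT-2 does not add a prefix space, so it is disabled on the normalizer
tok_r.normalizer = normalizers.ByteLevel(add_prefix_space=False)
tok_r.pre_tokenizer = pre_tokenizers.ByteLevel()
tok_r.decoder = decoders.ByteLevel()

print(tok_r.encode("Hello world").tokens)
```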

View File

@@ -1,4 +1,4 @@
-from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers, normalizers
 from tokenizers.models import BPE
 from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
 from .base_tokenizer import BaseTokenizer
@@ -36,9 +36,9 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer = Tokenizer(BPE.empty())
-        # Check for Unicode normalization first (before everything else)
-        normalizers = []
+        normalizers = [normalizers.ByteLevel(add_prefix_space=add_prefix_space)]
+        # Check for Unicode normalization first (before everything else)
         if unicode_normalizer:
             normalizers += [unicode_normalizer_from_str(unicode_normalizer)]
@@ -52,7 +52,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]
-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=add_prefix_space)
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
         tokenizer.decoder = decoders.ByteLevel()
         parameters = {
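Pieced together, the updated constructor logic looks roughly like the sketch below. Note that the committed code rebinds the name `normalizers`, shadowing the imported module after its first use; the sketch deliberately renames the list to `norms` for clarity, which is not what the diff itself does. The `build_byte_level_bpe` helper and its default arguments are illustrative, not part of the commit.

```python
from tokenizers import Tokenizer, decoders, normalizers, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, Sequence, unicode_normalizer_from_str

def build_byte_level_bpe(add_prefix_space=True, unicode_normalizer=None, lowercase=False):
    # Mirrors the updated ByteLevelBPETokenizer.__init__ (empty-model branch)
    tokenizer = Tokenizer(BPE.empty())

    # The ByteLevel normalizer now heads the chain, carrying add_prefix_space
    norms = [normalizers.ByteLevel(add_prefix_space=add_prefix_space)]

    # Check for Unicode normalization first (before everything else)
    if unicode_normalizer:
        norms += [unicode_normalizer_from_str(unicode_normalizer)]
    if lowercase:
        norms += [Lowercase()]

    # Squash several normalizers into a single Sequence, or use the one directly
    if len(norms) > 1:
        tokenizer.normalizer = Sequence(norms)
    else:
        tokenizer.normalizer = norms[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()
    return tokenizer
```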