Python - Update README and implementation

This commit is contained in:
Anthony MOI
2020-03-05 17:26:53 -05:00
parent 52180a9179
commit d778ed5e0a
2 changed files with 6 additions and 3 deletions

View File

@@ -121,7 +121,7 @@ you need together:
#### Use a pre-trained tokenizer
```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers, processors
# Load a BPE Model
vocab = "./path/to/vocab.json"
@@ -135,6 +135,7 @@ tokenizer = Tokenizer(bpe)
tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
# And then encode:
encoded = tokenizer.encode("I can feel the magic, can you?")
@@ -152,7 +153,7 @@ print(encoded)
#### Train a new tokenizer
```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers, trainers, processors
# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE.empty())
@@ -161,6 +162,7 @@ tokenizer = Tokenizer(models.BPE.empty())
tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
# And then train
trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)

View File

@@ -1,5 +1,5 @@
import tokenizers
from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers, processors
from tokenizers.models import BPE
from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
from .base_tokenizer import BaseTokenizer
@@ -55,6 +55,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel()
parameters = {
"model": "ByteLevelBPE",