Mirror of https://github.com/mii443/tokenizers.git
Python - Update README and implementation
@@ -121,7 +121,7 @@ you need together:
 #### Use a pre-trained tokenizer
 
 ```python
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, normalizers, processors
 
 # Load a BPE Model
 vocab = "./path/to/vocab.json"
@@ -135,6 +135,7 @@ tokenizer = Tokenizer(bpe)
 tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
 tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
 tokenizer.decoder = decoders.ByteLevel()
+tokenizer.post_processor = processors.ByteLevel()
 
 # And then encode:
 encoded = tokenizer.encode("I can feel the magic, can you?")
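To see what the new `post_processor` line adds, here is a self-contained sketch against the current `tokenizers` API. It stands an empty, freshly trained BPE model in for the vocab/merges files, and the `trim_offsets` argument is an assumption from today's API rather than text from this commit:

```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors, trainers

# Byte-level pipeline as in the README example (models.BPE() is the
# current-API stand-in for loading vocab/merges files).
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()
# The line this commit adds; trim_offsets=True is assumed from the current API.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

# Train on a tiny in-memory corpus so the sketch runs end to end.
trainer = trainers.BpeTrainer(vocab_size=300, min_frequency=1)
tokenizer.train_from_iterator(["I can feel the magic, can you?"], trainer)

encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded.tokens)
print(encoded.offsets)  # spans of the original string, not of the byte-level pieces
```

In current releases the ByteLevel post-processor's main job is exactly this offset trimming: without it, a token's offsets include the leading-space byte that the ByteLevel pre-tokenizer folds into the token.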
@@ -152,7 +153,7 @@ print(encoded)
 #### Train a new tokenizer
 
 ```python
-from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
 
 # Initialize a tokenizer
 tokenizer = Tokenizer(models.BPE.empty())
@@ -161,6 +162,7 @@ tokenizer = Tokenizer(models.BPE.empty())
 tokenizer.normalizer = normalizers.ByteLevel(add_prefix_space=True)
 tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
 tokenizer.decoder = decoders.ByteLevel()
+tokenizer.post_processor = processors.ByteLevel()
 
 # And then train
 trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)
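Both README hunks stop at the trainer, so the training call itself sits outside the diff. A hedged completion under the current `tokenizers` API; the `train` signature, the file paths, and the `save` call are assumptions for illustration, not part of this commit:

```python
# Continuing the "Train a new tokenizer" example above (current-API sketch).
files = ["./path/to/dataset/1.txt", "./path/to/dataset/2.txt"]  # hypothetical paths
tokenizer.train(files, trainer)

# The post-processor is part of the pipeline, so it travels with the
# serialized tokenizer as well.
tokenizer.save("tokenizer.json")
```

The remaining hunks apply the same two-line change to the implementation module that defines `ByteLevelBPETokenizer`.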
@@ -1,5 +1,5 @@
 import tokenizers
-from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers
+from tokenizers import Tokenizer, pre_tokenizers, decoders, trainers, processors
 from tokenizers.models import BPE
 from tokenizers.normalizers import unicode_normalizer_from_str, Lowercase, Sequence
 from .base_tokenizer import BaseTokenizer
@@ -55,6 +55,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
 
         tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
         tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.post_processor = processors.ByteLevel()
 
         parameters = {
             "model": "ByteLevelBPE",
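After this change, constructing the high-level wrapper wires up the full byte-level pipeline (pre-tokenizer, decoder, and now the post-processor) with no manual assembly. A usage sketch against the current `tokenizers` package; `train_from_iterator` and its arguments follow today's API, which may differ from the one at the time of this commit:

```python
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()  # post-processor now attached on construction
tokenizer.train_from_iterator(
    ["I can feel the magic, can you?"], vocab_size=300, min_frequency=1
)

encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded.tokens)
print(tokenizer.decode(encoded.ids))  # byte-level decoder round-trips the text
```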