mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-31 04:29:21 +00:00
Improve python readme with training example
This commit is contained in:
@ -6,6 +6,9 @@ A fast and easy to use implementation of today's most used tokenizers.

- High Level design: [master](https://github.com/huggingface/tokenizers)

This API is currently in the process of being stabilized. We might introduce breaking changes
really often in the coming days/weeks, so use at your own risks.

### Installation

#### With pip:
@ -41,6 +44,8 @@ maturin develop --release

### Usage

+#### Use a pre-trained tokenizer

```python
from tokenizers import Tokenizer, models, pre_tokenizers, decoders
@ -56,7 +61,7 @@ tokenizer = Tokenizer(bpe)
tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new())
tokenizer.with_decoder(decoders.ByteLevel.new())

-# And then tokenize:
+# And then encode:
encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded)
@ -67,3 +72,28 @@ encoded = tokenizer.encode_batch([
])
print(encoded)
```

+#### Train a new tokenizer
+
+```python
+from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
+
+# Initialize a tokenizer
+tokenizer = Tokenizer(models.BPE.empty())
+
+# Customize pre-tokenization and decoding
+tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new())
+tokenizer.with_decoder(decoders.ByteLevel.new())
+
+# And then train
+trainer = trainers.BpeTrainer.new(vocab_size=20000, min_frequency=2)
+tokenizer.train(trainer, [
+    "./path/to/dataset/1.txt",
+    "./path/to/dataset/2.txt",
+    "./path/to/dataset/3.txt"
+])
+
+# Now we can encode
+encoded = tokenizer.encode("I can feel the magic, can you?")
+print(encoded)
+```
|
Reference in New Issue
Block a user