Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Python - Replace last BPETokenizer occurrences
```diff
@@ -42,7 +42,7 @@ Start using in a matter of seconds:
 ```python
 # Tokenizers provides ultra-fast implementations of most current tokenizers:
 >>> from tokenizers import (ByteLevelBPETokenizer,
-                            BPETokenizer,
+                            CharBPETokenizer,
                             SentencePieceBPETokenizer,
                             BertWordPieceTokenizer)
 # Ultra-fast => they can encode 1GB of text in ~20sec on a standard server's CPU
```
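For downstream code written against the old name, a small compatibility shim can bridge releases on either side of this rename. A minimal sketch, assuming only the class name changed (as this diff suggests):

```python
# Hypothetical shim: prefer the new name, fall back on tokenizers
# releases that predate this rename.
try:
    from tokenizers import CharBPETokenizer
except ImportError:  # older release: the class was still called BPETokenizer
    from tokenizers import BPETokenizer as CharBPETokenizer
```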
```diff
@@ -55,7 +55,7 @@ console.log(wpEncoded.getTypeIds());
 
 ## Provided Tokenizers
 
-- `BPETokenizer`: The original BPE
+- `CharBPETokenizer`: The original BPE
 - `ByteLevelBPETokenizer`: The byte level version of the BPE
 - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
 - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece
```
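For orientation, a minimal sketch of the classes listed above, assuming each BPE variant can be constructed without arguments in an untrained state (the training example later in this diff shows this for `CharBPETokenizer`):

```python
from tokenizers import (ByteLevelBPETokenizer,
                        CharBPETokenizer,
                        SentencePieceBPETokenizer)

# Same Rust core underneath; the classes differ in pre-tokenization
# and vocabulary conventions.
char_bpe = CharBPETokenizer()           # the original, character-level BPE
byte_bpe = ByteLevelBPETokenizer()      # BPE over raw bytes, GPT-2 style
sp_bpe   = SentencePieceBPETokenizer()  # SentencePiece-compatible BPE
# BertWordPieceTokenizer is normally constructed from a pretrained vocab file.
```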
```diff
@@ -73,12 +73,12 @@ python setup.py install
 Using a pre-trained tokenizer is really simple:
 
 ```python
-from tokenizers import BPETokenizer
+from tokenizers import CharBPETokenizer
 
 # Initialize a tokenizer
 vocab = "./path/to/vocab.json"
 merges = "./path/to/merges.txt"
-tokenizer = BPETokenizer(vocab, merges)
+tokenizer = CharBPETokenizer(vocab, merges)
 
 # And then encode:
 encoded = tokenizer.encode("I can feel the magic, can you?")
```
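The object returned by `encode` carries more than the token strings; a small sketch of inspecting it, assuming the `.ids` and `.offsets` attributes alongside the `.tokens` attribute that appears in this diff's context:

```python
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer("./path/to/vocab.json", "./path/to/merges.txt")
encoded = tokenizer.encode("I can feel the magic, can you?")

print(encoded.tokens)   # token strings, as in the README example
print(encoded.ids)      # the corresponding vocabulary ids
print(encoded.offsets)  # (start, end) character spans into the input
```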
```diff
@@ -89,10 +89,10 @@ print(encoded.tokens)
 And you can train yours just as simply:
 
 ```python
-from tokenizers import BPETokenizer
+from tokenizers import CharBPETokenizer
 
 # Initialize a tokenizer
-tokenizer = BPETokenizer()
+tokenizer = CharBPETokenizer()
 
 # Then train it!
 tokenizer.train([ "./path/to/files/1.txt", "./path/to/files/2.txt" ])
```
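After training, the model can be persisted and reloaded. The `tokenizer.save("./path/to/directory", "my-bpe")` call comes from this diff's context; the reload step below is a sketch that assumes `save` wrote a `{name}-vocab.json` / `{name}-merges.txt` pair, the layout used by this generation of the library:

```python
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer()
tokenizer.train(["./path/to/files/1.txt", "./path/to/files/2.txt"])

# Persist the trained model (directory, model name), as in the README.
tokenizer.save("./path/to/directory", "my-bpe")

# Reload later; file names assume the {name}-vocab.json /
# {name}-merges.txt layout produced by this version of `save`.
reloaded = CharBPETokenizer("./path/to/directory/my-bpe-vocab.json",
                            "./path/to/directory/my-bpe-merges.txt")
```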
```diff
@@ -106,7 +106,7 @@ tokenizer.save("./path/to/directory", "my-bpe")
 
 ### Provided Tokenizers
 
-- `BPETokenizer`: The original BPE
+- `CharBPETokenizer`: The original BPE
 - `ByteLevelBPETokenizer`: The byte level version of the BPE
 - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
 - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece
```
```diff
@@ -7,7 +7,7 @@ from .trainers import *
 
 from .implementations import (
     ByteLevelBPETokenizer as ByteLevelBPETokenizer,
-    BPETokenizer as BPETokenizer,
+    CharBPETokenizer as CharBPETokenizer,
     SentencePieceBPETokenizer as SentencePieceBPETokenizer,
     BertWordPieceTokenizer as BertWordPieceTokenizer,
)
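The `CharBPETokenizer as CharBPETokenizer` form is the redundant-alias idiom: importing a name as itself marks it as an intentional public re-export, which strict type checkers (e.g. mypy with implicit re-exports disabled) respect when resolving `from tokenizers import CharBPETokenizer`. A minimal sketch of the same idiom in a hypothetical package:

```python
# mypackage/__init__.py (hypothetical): re-export the public API explicitly.
# `X as X` tells strict type checkers that X is intentionally public,
# whereas a bare `from ._impl import X` may be treated as private.
from ._impl import Tokenizer as Tokenizer
from ._impl import train as train
```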