Python - Replace last BPETokenizer occurrences
@@ -42,7 +42,7 @@ Start using in a matter of seconds:
 ```python
 # Tokenizers provides ultra-fast implementations of most current tokenizers:
 >>> from tokenizers import (ByteLevelBPETokenizer,
-                            BPETokenizer,
+                            CharBPETokenizer,
                             SentencePieceBPETokenizer,
                             BertWordPieceTokenizer)
 # Ultra-fast => they can encode 1GB of text in ~20sec on a standard server's CPU
@@ -55,7 +55,7 @@ console.log(wpEncoded.getTypeIds());
 
 ## Provided Tokenizers
 
-- `BPETokenizer`: The original BPE
+- `CharBPETokenizer`: The original BPE
 - `ByteLevelBPETokenizer`: The byte level version of the BPE
 - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
 - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece
@@ -73,12 +73,12 @@ python setup.py install
 Using a pre-trained tokenizer is really simple:
 
 ```python
-from tokenizers import BPETokenizer
+from tokenizers import CharBPETokenizer
 
 # Initialize a tokenizer
 vocab = "./path/to/vocab.json"
 merges = "./path/to/merges.txt"
-tokenizer = BPETokenizer(vocab, merges)
+tokenizer = CharBPETokenizer(vocab, merges)
 
 # And then encode:
 encoded = tokenizer.encode("I can feel the magic, can you?")
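Assembled into one snippet, the hunk above gives the following loading example under the new name. This is a sketch pieced together from the diff: the vocab/merges paths are the README's own placeholders, and printing `encoded.ids` alongside `encoded.tokens` is an assumption about the returned `Encoding` object rather than something this commit touches.

```python
# Sketch assembled from the diff above; paths are placeholders, not real files.
from tokenizers import CharBPETokenizer

vocab = "./path/to/vocab.json"
merges = "./path/to/merges.txt"
tokenizer = CharBPETokenizer(vocab, merges)

# Encode a sentence and inspect the result; print(encoded.tokens) appears in
# the surrounding README context, .ids is assumed to exist on the Encoding.
encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded.tokens)
print(encoded.ids)
```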
@@ -89,10 +89,10 @@ print(encoded.tokens)
 And you can train yours just as simply:
 
 ```python
-from tokenizers import BPETokenizer
+from tokenizers import CharBPETokenizer
 
 # Initialize a tokenizer
-tokenizer = BPETokenizer()
+tokenizer = CharBPETokenizer()
 
 # Then train it!
 tokenizer.train([ "./path/to/files/1.txt", "./path/to/files/2.txt" ])
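The training hunk pairs with the `tokenizer.save("./path/to/directory", "my-bpe")` call visible in the next hunk's context line. A minimal end-to-end sketch, assuming the placeholder training files actually exist on disk:

```python
# Minimal training sketch under the renamed class; the file paths are the
# README's placeholders and must point at real text files to run.
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer()
tokenizer.train(["./path/to/files/1.txt", "./path/to/files/2.txt"])

# Persist the trained vocab/merges, as in the README context that follows
# this hunk ("tokenizer.save('./path/to/directory', 'my-bpe')").
tokenizer.save("./path/to/directory", "my-bpe")
```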
@@ -106,7 +106,7 @@ tokenizer.save("./path/to/directory", "my-bpe")
 
 ### Provided Tokenizers
 
-- `BPETokenizer`: The original BPE
+- `CharBPETokenizer`: The original BPE
 - `ByteLevelBPETokenizer`: The byte level version of the BPE
 - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
 - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece
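All four classes in the list above are importable from the package root after this change. A hedged sketch follows, assuming each implementation can be constructed without arguments before training; only `CharBPETokenizer()` is shown that way in this diff, the other constructors are an assumption.

```python
# Assumption: each implementation can be instantiated untrained, as the
# CharBPETokenizer example in this commit does.
from tokenizers import (ByteLevelBPETokenizer,
                        CharBPETokenizer,
                        SentencePieceBPETokenizer,
                        BertWordPieceTokenizer)

tokenizers = [
    CharBPETokenizer(),           # the original BPE
    ByteLevelBPETokenizer(),      # byte-level BPE
    SentencePieceBPETokenizer(),  # SentencePiece-compatible BPE
    BertWordPieceTokenizer(),     # Bert's WordPiece
]
```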
@@ -7,7 +7,7 @@ from .trainers import *
 
 from .implementations import (
     ByteLevelBPETokenizer as ByteLevelBPETokenizer,
-    BPETokenizer as BPETokenizer,
+    CharBPETokenizer as CharBPETokenizer,
     SentencePieceBPETokenizer as SentencePieceBPETokenizer,
     BertWordPieceTokenizer as BertWordPieceTokenizer,
 )
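Because the `__init__.py` hunk above changes the package's public re-export, any downstream code that imported the old name needs a one-line migration. A hypothetical example of affected user code:

```python
# Hypothetical downstream code affected by this rename.
# Before this commit the import was:
#   from tokenizers import BPETokenizer
# After it, only the new name is re-exported from the package root:
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer("./path/to/vocab.json", "./path/to/merges.txt")
```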