Fix a few errors in the README.md
Most notably, the convention for representing Python code (using ">>>" for code, without for output) was used the wrong way round.
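The convention being restored is the standard interactive-session style used in Python documentation and by `doctest`: lines the user types carry the `>>>` prompt, and interpreter output follows with no prefix. A minimal, hypothetical illustration (not taken from the README):

```python
# Input lines carry the ">>>" prompt; interpreter output appears unprefixed.
>>> numbers = [1, 2, 3]
>>> sum(numbers)
6
>>> [n * n for n in numbers]
[1, 4, 9]
```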
Changed file: README.md (32 changed lines)
@@ -19,7 +19,7 @@ versatility.
 
 ## Main features:
 
-- Train new vocabularies and tokenize, using todays most used tokenizers.
+- Train new vocabularies and tokenize, using today's most used tokenizers.
 - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
   less than 20 seconds to tokenize a GB of text on a server's CPU.
 - Easy to use, but also extremely versatile.
@@ -34,29 +34,29 @@ Start using in a matter of seconds:
 
 ```python
 # Tokenizers provides ultra-fast implementations of most current tokenizers:
-from tokenizers import (ByteLevelBPETokenizer,
+>>> from tokenizers import (ByteLevelBPETokenizer,
                         BPETokenizer,
                         SentencePieceBPETokenizer,
                         BertWordPieceTokenizer)
 # Ultra-fast => they can encode 1GB of text in ~20sec on a standard server's CPU
 # Tokenizers can be easily instantiated from standard files
-tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
->>> Tokenizer(vocabulary_size=30522, model=BertWordPiece, add_special_tokens=True, unk_token=[UNK],
+>>> tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
+Tokenizer(vocabulary_size=30522, model=BertWordPiece, add_special_tokens=True, unk_token=[UNK],
           sep_token=[SEP], cls_token=[CLS], clean_text=True, handle_chinese_chars=True,
           strip_accents=True, lowercase=True, wordpieces_prefix=##)
 
 # Tokenizers provide exhaustive outputs: tokens, mapping to original string, attention/special token masks.
 # They also handle model's max input lengths as well as padding (to directly encode in padded batches)
-output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
->>> Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])
-print(output.ids, output.tokens, output.offsets)
->>> [101, 7592, 1010, 1061, 1005, 2035, 999, 2129, 2024, 2017, 100, 1029, 102]
->>> ['[CLS]', 'hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '[UNK]', '?', '[SEP]']
->>> [(0, 0), (0, 5), (5, 6), (7, 8 (8, 9), (9, 12), (12, 13), (14, 17), (18, 21), (22, 25), (26, 27),
+>>> output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
+Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing, original_str, normalized_str])
+>>> print(output.ids, output.tokens, output.offsets)
+[101, 7592, 1010, 1061, 1005, 2035, 999, 2129, 2024, 2017, 100, 1029, 102]
+['[CLS]', 'hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '[UNK]', '?', '[SEP]']
+[(0, 0), (0, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13), (14, 17), (18, 21), (22, 25), (26, 27),
  (28, 29), (0, 0)]
 # Here is an example using the offsets mapping to retrieve the string coresponding to the 10th token:
-output.original_str[output.offsets[10]]
->>> '😁'
+>>> output.original_str[output.offsets[10]]
+'😁'
 ```
 
 And training an new vocabulary is just as easy:
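For readability, here is roughly how the quickstart snippet reads once this commit is applied, reassembled from the context and `+` lines of the second hunk (condensed slightly; the unprefixed lines are the return values and printed output the README reports, and `bert-base-uncased-vocab.txt` is the local vocabulary file the snippet assumes to exist):

```python
# Tokenizers provides ultra-fast implementations of most current tokenizers:
>>> from tokenizers import (ByteLevelBPETokenizer,
                            BPETokenizer,
                            SentencePieceBPETokenizer,
                            BertWordPieceTokenizer)

# Tokenizers can be easily instantiated from standard files:
>>> tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
Tokenizer(vocabulary_size=30522, model=BertWordPiece, add_special_tokens=True, unk_token=[UNK],
          sep_token=[SEP], cls_token=[CLS], clean_text=True, handle_chinese_chars=True,
          strip_accents=True, lowercase=True, wordpieces_prefix=##)

# Encodings expose ids, tokens, offsets, attention/special-token masks, etc.:
>>> output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
>>> print(output.tokens)
['[CLS]', 'hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '[UNK]', '?', '[SEP]']

# The offsets map each token back to the original string, e.g. the 10th token:
>>> output.original_str[output.offsets[10]]
'😁'
```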