mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Update bindings and typings
This commit is contained in:
@@ -134,7 +134,7 @@ tokenizer = Tokenizer(bpe)
|
||||
# Customize pre-tokenization and decoding
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
|
||||
tokenizer.decoder = decoders.ByteLevel()
|
||||
-tokenizer.post_processor = processors.ByteLevel()
|
||||
+tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
|
||||
|
||||
# And then encode:
|
||||
encoded = tokenizer.encode("I can feel the magic, can you?")
|
||||
@@ -160,7 +160,7 @@ tokenizer = Tokenizer(models.BPE.empty())
|
||||
# Customize pre-tokenization and decoding
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
|
||||
tokenizer.decoder = decoders.ByteLevel()
|
||||
-tokenizer.post_processor = processors.ByteLevel()
|
||||
+tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
|
||||
|
||||
# And then train
|
||||
trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)
|
||||
|
Reference in New Issue
Block a user