Mirror of https://github.com/mii443/tokenizers.git
Python - More updates to the new API
@@ -132,8 +132,8 @@ bpe = models.BPE.from_files(vocab, merges)
 tokenizer = Tokenizer(bpe)

 # Customize pre-tokenization and decoding
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=True)
-tokenizer.decoder = decoders.ByteLevel.new()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.decoder = decoders.ByteLevel()

 # And then encode:
 encoded = tokenizer.encode("I can feel the magic, can you?")
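For reference, the updated usage from this hunk assembled into a single runnable sketch; the vocab/merges file paths and the final print are assumed placeholders, not part of the diff.

# Sketch of the post-change API: plain constructors replace the old .new() helpers.
from tokenizers import Tokenizer, models, pre_tokenizers, decoders

# Assumed placeholder paths to an existing byte-level BPE vocabulary.
bpe = models.BPE.from_files("./vocab.json", "./merges.txt")
tokenizer = Tokenizer(bpe)

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()

# And then encode:
encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded)  # illustrative only; inspect the Encoding as needed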
@@ -157,11 +157,11 @@ from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
 tokenizer = Tokenizer(models.BPE.empty())

 # Customize pre-tokenization and decoding
-tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=True)
-tokenizer.decoder = decoders.ByteLevel.new()
+tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
+tokenizer.decoder = decoders.ByteLevel()

 # And then train
-trainer = trainers.BpeTrainer.new(vocab_size=20000, min_frequency=2)
+trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)
 tokenizer.train(trainer, [
 	"./path/to/dataset/1.txt",
 	"./path/to/dataset/2.txt",
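Similarly, a minimal sketch of the train-from-scratch flow after this change; the dataset paths are the placeholders from the README, and saving or using the trained tokenizer is not shown.

# Sketch of training a new BPE tokenizer with the constructor-style API.
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.BPE.empty())

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()

# And then train
trainer = trainers.BpeTrainer(vocab_size=20000, min_frequency=2)
tokenizer.train(trainer, [
    "./path/to/dataset/1.txt",
    "./path/to/dataset/2.txt",
])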
@@ -76,7 +76,7 @@ elif args.type == "bert":
     strip_accents=True,
     lowercase=True,
 )
-# tok_r.pre_tokenizer = pre_tokenizers.Whitespace.new()
+# tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
 tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
 tok_r.decoder = decoders.WordPiece()
 tok_r.post_processor = BertProcessing(