Enabling training parity check for tokenizers.UnigramTrainer

This commit is contained in:
Nicolas Patry
2020-09-01 18:35:22 +02:00
committed by Anthony MOI
parent 558e76f18e
commit ee3860c029
2 changed files with 59 additions and 2 deletions

View File

@ -15,14 +15,71 @@ def main():
parser.add_argument(
"--vocab-size", "-v", type=int, default=8000, help="Vocab size for spm_train",
)
parser.add_argument(
"--train",
action="store_true",
help="Instead of checking the encoder part, we check the trainer part",
)
args = parser.parse_args()
spm.SentencePieceTrainer.Train(
f"--input={args.input_file} --model_prefix={args.model_prefix}"
f" --character_coverage=1.0"
f" --max_sentence_length=40000"
f" --num_threads=1"
f" --vocab_size={args.vocab_size}"
)
if args.train:
check_train(args)
else:
check_encode(args)
def check_train(args):
    """Check the `tokenizers` Unigram trainer against the SentencePiece one.

    Loads the SentencePiece model found at ``{args.model_prefix}.model``,
    trains a ``tokenizers.SentencePieceUnigramTokenizer`` on
    ``args.input_file``, then encodes every line of that file with both
    and compares the total number of ids each of them produces. The first
    vocabulary entry that differs between the two models (if any) is
    printed for information only.

    Args:
        args: Parsed command-line namespace. Reads ``model_prefix``,
            ``input_file`` and ``vocab_size``.

    Raises:
        AssertionError: If the trained tokenizer does not produce strictly
            fewer tokens than SentencePiece over the input file.
    """
    sp = spm.SentencePieceProcessor()
    sp.Load(f"{args.model_prefix}.model")

    tokenizer = tokenizers.SentencePieceUnigramTokenizer()
    # Request the same vocabulary size the SPM model was trained with;
    # without it the trainer is never told the requested size and the two
    # models may not be comparable.
    tokenizer.train(
        args.input_file, vocab_size=args.vocab_size, show_progress=False
    )

    spm_tokens = 0
    tokenizer_tokens = 0
    # Explicit encoding so decoding does not depend on the platform locale.
    with open(args.input_file, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            spm_tokens += len(sp.EncodeAsIds(line))
            tokenizer_tokens += len(tokenizer.encode(line).ids)

    tokenizer_vocab = tokenizer.get_vocab()
    # Make room for the largest id actually present so filling the table
    # can never raise IndexError; ids that are never assigned keep the 0
    # placeholder.
    table_size = max(
        args.vocab_size, max(tokenizer_vocab.values(), default=-1) + 1
    )
    vocab = [0] * table_size
    for token, index in tokenizer_vocab.items():
        vocab[index] = token
    spm_vocab = [sp.id_to_piece(i) for i in range(args.vocab_size)]

    # 0 is unk in tokenizers, 0, 1, 2 are unk bos, eos in spm by default.
    for i, (token, spm_token) in enumerate(zip(vocab[1:], spm_vocab[3:])):
        if token != spm_token:
            print(f"First different token is token {i} ({token} != {spm_token})")
            break

    print(f"Tokenizer used {tokenizer_tokens}, where spm used {spm_tokens}")
    # Raised explicitly (rather than via `assert`) so the check still runs
    # when Python is started with -O.
    if not tokenizer_tokens < spm_tokens:
        raise AssertionError(
            "Our trainer should be at least more efficient than the SPM one"
        )
def check_encode(args):
sp = spm.SentencePieceProcessor()
model_filename = f"{args.model_prefix}.model"
sp.Load(model_filename)