Fix ByteLevel Decoder
The join was done after replacing the bytes within each subword, which prevented bytes that span a subword boundary from being merged back together correctly. We need to join the subwords first, and only then replace the bytes.
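
To see why the order matters, here is a minimal sketch of the failure mode, assuming the GPT-2-style byte-to-unicode table that ByteLevel is built on; the helpers below (bytes_to_unicode, decode_token) are illustrative stand-ins, not the library's API. A multi-byte UTF-8 character such as "é" (bytes 0xC3 0xA9) can end up split across two subword tokens: decoding each subword on its own and then joining turns each half into U+FFFD, while joining the byte-level strings first and decoding once recovers the character.

# Minimal sketch of the bug, assuming the GPT-2-style byte <-> unicode
# table that ByteLevel builds on. Helper names here are hypothetical.

def bytes_to_unicode():
    # GPT-2's reversible byte -> printable-unicode-char table.
    bs = (list(range(ord("!"), ord("~") + 1))
          + list(range(ord("¡"), ord("¬") + 1))
          + list(range(ord("®"), ord("ÿ") + 1)))
    cs = bs[:]
    n = 0
    for b in range(256):
        if b not in bs:
            bs.append(b)
            cs.append(256 + n)
            n += 1
    return dict(zip(bs, (chr(c) for c in cs)))

BYTE_ENCODER = bytes_to_unicode()
BYTE_DECODER = {v: k for k, v in BYTE_ENCODER.items()}

def decode_token(token: str) -> str:
    # Map each placeholder char back to its byte, then decode as UTF-8.
    raw = bytes(BYTE_DECODER[c] for c in token)
    return raw.decode("utf-8", errors="replace")

# "é" is 0xC3 0xA9 in UTF-8; suppose BPE split those bytes across tokens.
tokens = ["caf" + BYTE_ENCODER[0xC3], BYTE_ENCODER[0xA9]]

# Old (buggy) order: replace bytes per subword, then join. The two halves
# of "é" never meet, so each decodes to U+FFFD.
buggy = "".join(decode_token(t) for t in tokens)  # -> 'caf��'

# Fixed order: join the byte-level strings first, then decode once.
fixed = decode_token("".join(tokens))             # -> 'café'

assert fixed == "café"

The diff below also updates the benchmark script: it keeps the measured times in time_r and time_p, prints a speed-up ratio, and prints any sentence that fails the decode round-trip before asserting equality.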
@@ -58,17 +58,26 @@ print(f"Tokenizing {len(text)} lines")
 start = time.time()
 encoded_r = tokenize_r()
 end = time.time()
-print(f"Rust tokenizer took: {end - start} sec")
+time_r = end - start
+print(f"Rust tokenizer took: {time_r} sec")
 
 # Python version
 start = time.time()
 encoded_p = tokenize_p()
 end = time.time()
-print(f"Transformer tokenizer took: {end - start} sec")
+time_p = end - start
+print(f"Transformer tokenizer took: {time_p} sec")
 
+print(f"SpeedUp Ratio: {time_p / time_r}")
+
 ids_r = [ [ token.id for token in sentence ] for sentence in encoded_r ]
 assert(ids_r == encoded_p)
-
 decoded_r = tok_r.decode_batch(ids_r)
+print(f"Decoded sentences: {decoded_r}")
+for i in range(0, len(text)):
+    if decoded_r[i] != text[i]:
+        print(decoded_r[i])
+        print(text[i])
+        print("")
 
 assert(decoded_r == text)