Fix SentencePiece tokenizers conversion

2025-12-06 12:48:18 +00:00 · 2021-02-03 09:57:41 -05:00
parent fc0a50a272
commit 96b9972842
4 changed files with 33 additions and 45 deletions
--- a/bindings/python/scripts/spm_parity_check.py
+++ b/bindings/python/scripts/spm_parity_check.py
@@ -131,12 +131,12 @@ def check_diff(spm_diff, tok_diff, sp, tok):
    if spm_diff == list(reversed(tok_diff)):
        # AAA -> AA+A vs A+AA case.
        return True
-    # elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(
-    #     tok_diff
-    # ):
-    #     # Second order OK
-    #     # Barrich -> Barr + ich vs Bar + rich
-    #     return True
+    elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(
+        tok_diff
+    ):
+        # Second order OK
+        # Barrich -> Barr + ich vs Bar + rich
+        return True
    spm_reencoded = sp.encode(sp.decode(spm_diff))
    tok_reencoded = tok.encode(tok.decode(spm_diff)).ids
    if spm_reencoded != spm_diff and spm_reencoded == tok_reencoded:
@@ -265,7 +265,7 @@ def check_encode(args):
            else:
                perfect += 1

-            assert ids == encoded.ids, f"line {i}: {line} : {ids} != {encoded.ids}"
+            assert ids == encoded.ids, f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}"

    print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})")
    total = perfect + imperfect + wrong