spm_parity_check now succeeds because we use the correct pre_tokenizer.

This commit is contained in:
Nicolas Patry
2020-08-24 14:05:34 +02:00
parent e974cfb1c9
commit dd91739ba0
2 changed files with 8 additions and 17 deletions

View File

@ -46,21 +46,7 @@ def main():
if ids != encoded.ids:
# Encoding can be the same with same result AAA -> A + AA vs AA + A
# We just check this does not cover unk tokens
if len(ids) != len(encoded.ids):
N = len(ids)
M = len(encoded.ids)
first_index_error = [i for i in range(min(N, M)) if ids[i] != encoded.ids[i]][0]
last_index_error = [
min(N, M) - i
for i in range(min(N, M))
if ids[-i - 1] != encoded.ids[-i - 1]
][0]
print(ids[first_index_error : last_index_error + 1])
print(encoded.ids[first_index_error : last_index_error + 1])
import ipdb
ipdb.set_trace()
# We can check that we use at least exactly the same number of tokens.
assert len(ids) == len(encoded.ids)
continue