Updating convert scripts with Replace normalizer.

2025-12-03 11:18:29 +00:00 · 2020-09-18 12:29:13 +02:00
parent c59b216baa
commit 033b98ce59
1 changed file with 3 additions and 7 deletions
--- a/bindings/python/scripts/convert.py
+++ b/bindings/python/scripts/convert.py
@@ -11,6 +11,7 @@ from tokenizers.normalizers import (
    Sequence,
    BertNormalizer,
    Precompiled,
+    Replace,
 )
 from tokenizers.pre_tokenizers import (
    Digits,
@@ -162,8 +163,7 @@ class AlbertConverter(SpmConverter):
        ]

    def normalizer(self, proto):
-        normalizers = []
-        # TODO Missing Replace quotes
+        normalizers = [Replace("``", '"'), Replace("''", '"')]
        if not self.original_tokenizer.keep_accents:
            normalizers.append(NFKD())
            normalizers.append(StripAccents())
@@ -299,8 +299,7 @@ class XLNetConverter(SpmConverter):
        ]

    def normalizer(self, proto):
-        # TODO Missing Replace quotes
-        normalizers = []
+        normalizers = [Replace("``", '"'), Replace("''", '"')]
        if not self.original_tokenizer.keep_accents:
            normalizers.append(NFKD())
            normalizers.append(StripAccents())
@@ -384,9 +383,6 @@ def check(pretrained, filename):
        for i, line in enumerate(f):
            line = line.strip()

-            # TODO in normalizer
-            line = line.replace("``", '"').replace("''", '"')
-
            start = now()
            ids = transformer_tokenizer.encode(line)
            trans = now()