Updating convert scripts with Replace normalizer.

This commit is contained in:
Nicolas Patry
2020-09-18 12:29:13 +02:00
parent c59b216baa
commit 033b98ce59

View File

@@ -11,6 +11,7 @@ from tokenizers.normalizers import (
Sequence, Sequence,
BertNormalizer, BertNormalizer,
Precompiled, Precompiled,
Replace,
) )
from tokenizers.pre_tokenizers import ( from tokenizers.pre_tokenizers import (
Digits, Digits,
@@ -162,8 +163,7 @@ class AlbertConverter(SpmConverter):
] ]
def normalizer(self, proto): def normalizer(self, proto):
normalizers = [] normalizers = [Replace("``", '"'), Replace("''", '"')]
# TODO Missing Replace quotes
if not self.original_tokenizer.keep_accents: if not self.original_tokenizer.keep_accents:
normalizers.append(NFKD()) normalizers.append(NFKD())
normalizers.append(StripAccents()) normalizers.append(StripAccents())
@@ -299,8 +299,7 @@ class XLNetConverter(SpmConverter):
] ]
def normalizer(self, proto): def normalizer(self, proto):
# TODO Missing Replace quotes normalizers = [Replace("``", '"'), Replace("''", '"')]
normalizers = []
if not self.original_tokenizer.keep_accents: if not self.original_tokenizer.keep_accents:
normalizers.append(NFKD()) normalizers.append(NFKD())
normalizers.append(StripAccents()) normalizers.append(StripAccents())
@@ -384,9 +383,6 @@ def check(pretrained, filename):
for i, line in enumerate(f): for i, line in enumerate(f):
line = line.strip() line = line.strip()
# TODO in normalizer
line = line.replace("``", '"').replace("''", '"')
start = now() start = now()
ids = transformer_tokenizer.encode(line) ids = transformer_tokenizer.encode(line)
trans = now() trans = now()