Fix SPM conversions (#686)

* Fix SPM conversions

* Update changelog

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
Lysandre Debut
2021-05-20 15:55:55 +02:00
committed by GitHub
parent 2e2e7558f7
commit 4b0dc6b947
2 changed files with 7 additions and 2 deletions

View File

@@ -131,7 +131,7 @@ class AlbertConverter(SpmConverter):
]
def normalizer(self, proto):
normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")]
normalizers = [Replace("``", '"'), Replace("''", '"')]
if not self.original_tokenizer.keep_accents:
normalizers.append(NFKD())
normalizers.append(StripAccents())
@@ -140,6 +140,7 @@ class AlbertConverter(SpmConverter):
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
normalizers.append(Precompiled(precompiled_charsmap))
normalizers.append(Replace(Regex(" {2,}"), " "))
return Sequence(normalizers)
def post_processor(self, tokenizer):
@@ -267,7 +268,7 @@ class XLNetConverter(SpmConverter):
]
def normalizer(self, proto):
normalizers = [Replace("``", '"'), Replace("''", '"'), Replace(Regex(" {2,}"), " ")]
normalizers = [Replace("``", '"'), Replace("''", '"')]
if not self.original_tokenizer.keep_accents:
normalizers.append(NFKD())
normalizers.append(StripAccents())
@@ -276,6 +277,7 @@ class XLNetConverter(SpmConverter):
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
normalizers.append(Precompiled(precompiled_charsmap))
normalizers.append(Replace(Regex(" {2,}"), " "))
return Sequence(normalizers)
def post_processor(self, tokenizer):