Move StripAccents into the normalizer for Albert + XLNet; it now crashes

in Precompiled. Are the offsets wrong?
Nicolas Patry
2020-09-17 10:40:34 +02:00
parent 275ee6d4c4
commit b16406c900
2 changed files with 12 additions and 22 deletions

View File

@@ -5,6 +5,7 @@ from tokenizers.models import Unigram, BPE
 from tokenizers import decoders
 from tokenizers import Tokenizer
 from tokenizers.normalizers import (
+    StripAccents,
     NFKD,
     Lowercase,
     Sequence,
@@ -165,7 +166,7 @@ class AlbertConverter(SpmConverter):
         # TODO Missing Replace quotes
         if not self.original_tokenizer.keep_accents:
             normalizers.append(NFKD())
-            # TODO Missing strip accents
+            normalizers.append(StripAccents())
         if self.original_tokenizer.do_lower_case:
             normalizers.append(Lowercase())
@@ -299,8 +300,16 @@ class XLNetConverter(SpmConverter):
     def normalizer(self, proto):
         # TODO Missing Replace quotes
-        # TODO Missing strip accents
-        return super().normalizer(proto)
+        normalizers = []
+        if not self.original_tokenizer.keep_accents:
+            normalizers.append(NFKD())
+            normalizers.append(StripAccents())
+        if self.original_tokenizer.do_lower_case:
+            normalizers.append(Lowercase())
+        precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
+        normalizers.append(Precompiled(precompiled_charsmap))
+        return Sequence(normalizers)

     def post_processor(self, tokenizer):
         return TemplateProcessing(
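Both converters now build the same kind of pipeline, which can be exercised directly through the tokenizers Python API. A minimal sketch (leaving out the Precompiled step, which needs the precompiled_charsmap blob from the SentencePiece proto):

    from tokenizers.normalizers import NFKD, StripAccents, Lowercase, Sequence

    # Same ordering as the converters: decompose, drop accents, lowercase
    normalizer = Sequence([NFKD(), StripAccents(), Lowercase()])
    print(normalizer.normalize_str("Héllo"))  # -> "hello"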
@@ -375,10 +384,6 @@ def check(pretrained, filename):
     for i, line in enumerate(f):
         line = line.strip()
-        # TODO in normalizer
-        line = unicodedata.normalize("NFKD", line)
-        line = "".join([c for c in line if not unicodedata.combining(c)])
         # TODO in normalizer
         line = line.replace("``", '"').replace("''", '"')
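The lines dropped from check() were re-implementing NFKD + StripAccents by hand with unicodedata; a quick sanity check of that equivalence (a sketch, assuming the accents in question reduce to Unicode combining marks):

    import unicodedata

    from tokenizers.normalizers import NFKD, Sequence, StripAccents

    def manual_strip(line: str) -> str:
        # The old check() logic: NFKD-decompose, then drop combining marks
        line = unicodedata.normalize("NFKD", line)
        return "".join(c for c in line if not unicodedata.combining(c))

    norm = Sequence([NFKD(), StripAccents()])
    assert norm.normalize_str("café") == manual_strip("café") == "cafe"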

View File

@@ -17,13 +17,6 @@ impl Normalizer for Precompiled {
         normalized.get().graphemes(true).for_each(|grapheme| {
             if grapheme.len() < 6 {
                 if let Some(norm) = self.transform(grapheme) {
-                    // debug!(
-                    //     "Replacing {:?}({:?}) by {:?}({:?})",
-                    //     grapheme,
-                    //     grapheme.chars().count(),
-                    //     norm,
-                    //     norm.chars().count()
-                    // );
                     let old_count = grapheme.chars().count() as isize;
                     let new_count = norm.chars().count() as isize;
                     for (i, c) in norm.chars().enumerate() {
@@ -42,13 +35,6 @@ impl Normalizer for Precompiled {
             if let Some(norm) = self.transform(part) {
                 let old_count = part.chars().count() as isize;
                 let new_count = norm.chars().count() as isize;
-                // debug!(
-                //     "Replacing {:?}({:?}) by {:?}({:?})",
-                //     part,
-                //     part.chars().count(),
-                //     norm,
-                //     norm.chars().count()
-                // );
                 for (i, c) in norm.chars().enumerate() {
                     let n = if i == 0 {
                         new_count - old_count
@@ -62,7 +48,6 @@ impl Normalizer for Precompiled {
                 }
             }
         });
-        // debug!("Normalized {:?}", normalized);
         normalized.transform(transformations.into_iter(), 0);
         Ok(())
     }
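On the crash mentioned in the commit message: the loops above emit (char, isize) pairs, where the first char of each replacement carries the whole length change (new_count - old_count) and the remaining chars carry 0, before everything is applied via normalized.transform. A rough Python model of that bookkeeping, handy for reasoning about where the offsets could drift (transform_deltas is a hypothetical helper, not part of the library):

    def transform_deltas(old: str, new: str):
        """Mimic the Rust loop: the first char absorbs the whole size change."""
        old_count, new_count = len(old), len(new)
        return [(c, new_count - old_count if i == 0 else 0)
                for i, c in enumerate(new)]

    # A 1-char grapheme replaced by 3 chars puts a +2 delta on the first char
    print(transform_deltas("…", "..."))  # [('.', 2), ('.', 0), ('.', 0)]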