Moving StripAccents into the normalizer for Albert + XLNet, but it now crashes

in Precompiled. Are the offsets wrong?
2025-09-01 14:59:20 +00:00 · 2020-09-17 10:40:34 +02:00
parent 275ee6d4c4
commit b16406c900
2 changed files with 12 additions and 22 deletions
--- a/bindings/python/scripts/convert.py
+++ b/bindings/python/scripts/convert.py
@ -5,6 +5,7 @@ from tokenizers.models import Unigram, BPE
 from tokenizers import decoders
 from tokenizers import Tokenizer
 from tokenizers.normalizers import (
    StripAccents,
    NFKD,
    Lowercase,
    Sequence,
@ -165,7 +166,7 @@ class AlbertConverter(SpmConverter):
        # TODO Missing Replace quotes
        if not self.original_tokenizer.keep_accents:
            normalizers.append(NFKD())
-            # TODO Missing strip accents
+            normalizers.append(StripAccents())
        if self.original_tokenizer.do_lower_case:
            normalizers.append(Lowercase())
@ -299,8 +300,16 @@ class XLNetConverter(SpmConverter):
    def normalizer(self, proto):
        # TODO Missing Replace quotes
-        # TODO Missing strip accents
+        normalizers = []
-        return super().normalizer(proto)
+        if not self.original_tokenizer.keep_accents:
            normalizers.append(NFKD())
            normalizers.append(StripAccents())
        if self.original_tokenizer.do_lower_case:
            normalizers.append(Lowercase())
        precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
        normalizers.append(Precompiled(precompiled_charsmap))
        return Sequence(normalizers)
    def post_processor(self, tokenizer):
        return TemplateProcessing(
@ -375,10 +384,6 @@ def check(pretrained, filename):
        for i, line in enumerate(f):
            line = line.strip()
            # TODO in normalizer
            line = unicodedata.normalize("NFKD", line)
            line = "".join([c for c in line if not unicodedata.combining(c)])
            # TODO in normalizer
            line = line.replace("``", '"').replace("''", '"')
--- a/tokenizers/src/normalizers/precompiled.rs
+++ b/tokenizers/src/normalizers/precompiled.rs
@ -17,13 +17,6 @@ impl Normalizer for Precompiled {
        normalized.get().graphemes(true).for_each(|grapheme| {
            if grapheme.len() < 6 {
                if let Some(norm) = self.transform(grapheme) {
                    // debug!(
                    //     "Replacing {:?}({:?}) by {:?}({:?})",
                    //     grapheme,
                    //     grapheme.chars().count(),
                    //     norm,
                    //     norm.chars().count()
                    // );
                    let old_count = grapheme.chars().count() as isize;
                    let new_count = norm.chars().count() as isize;
                    for (i, c) in norm.chars().enumerate() {
@ -42,13 +35,6 @@ impl Normalizer for Precompiled {
                if let Some(norm) = self.transform(part) {
                    let old_count = part.chars().count() as isize;
                    let new_count = norm.chars().count() as isize;
                    // debug!(
                    //     "Replacing {:?}({:?}) by {:?}({:?})",
                    //     part,
                    //     part.chars().count(),
                    //     norm,
                    //     norm.chars().count()
                    // );
                    for (i, c) in norm.chars().enumerate() {
                        let n = if i == 0 {
                            new_count - old_count
@ -62,7 +48,6 @@ impl Normalizer for Precompiled {
                }
            }
        });
        // debug!("Normalized {:?}", normalized);
        normalized.transform(transformations.into_iter(), 0);
        Ok(())
    }