mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-01 14:59:20 +00:00
Moving StripAccents within the normalizer for Albert + XLNet, but it now crashes
in Precompiled. Are the offsets wrong?
This commit is contained in:
@ -5,6 +5,7 @@ from tokenizers.models import Unigram, BPE
|
|||||||
from tokenizers import decoders
|
from tokenizers import decoders
|
||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
from tokenizers.normalizers import (
|
from tokenizers.normalizers import (
|
||||||
|
StripAccents,
|
||||||
NFKD,
|
NFKD,
|
||||||
Lowercase,
|
Lowercase,
|
||||||
Sequence,
|
Sequence,
|
||||||
@ -165,7 +166,7 @@ class AlbertConverter(SpmConverter):
|
|||||||
# TODO Missing Replace quotes
|
# TODO Missing Replace quotes
|
||||||
if not self.original_tokenizer.keep_accents:
|
if not self.original_tokenizer.keep_accents:
|
||||||
normalizers.append(NFKD())
|
normalizers.append(NFKD())
|
||||||
# TODO Missing strip accents
|
normalizers.append(StripAccents())
|
||||||
if self.original_tokenizer.do_lower_case:
|
if self.original_tokenizer.do_lower_case:
|
||||||
normalizers.append(Lowercase())
|
normalizers.append(Lowercase())
|
||||||
|
|
||||||
@ -299,8 +300,16 @@ class XLNetConverter(SpmConverter):
|
|||||||
|
|
||||||
def normalizer(self, proto):
|
def normalizer(self, proto):
|
||||||
# TODO Missing Replace quotes
|
# TODO Missing Replace quotes
|
||||||
# TODO Missing strip accents
|
normalizers = []
|
||||||
return super().normalizer(proto)
|
if not self.original_tokenizer.keep_accents:
|
||||||
|
normalizers.append(NFKD())
|
||||||
|
normalizers.append(StripAccents())
|
||||||
|
if self.original_tokenizer.do_lower_case:
|
||||||
|
normalizers.append(Lowercase())
|
||||||
|
|
||||||
|
precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
|
||||||
|
normalizers.append(Precompiled(precompiled_charsmap))
|
||||||
|
return Sequence(normalizers)
|
||||||
|
|
||||||
def post_processor(self, tokenizer):
|
def post_processor(self, tokenizer):
|
||||||
return TemplateProcessing(
|
return TemplateProcessing(
|
||||||
@ -375,10 +384,6 @@ def check(pretrained, filename):
|
|||||||
for i, line in enumerate(f):
|
for i, line in enumerate(f):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
|
||||||
# TODO in normalizer
|
|
||||||
line = unicodedata.normalize("NFKD", line)
|
|
||||||
line = "".join([c for c in line if not unicodedata.combining(c)])
|
|
||||||
|
|
||||||
# TODO in normalizer
|
# TODO in normalizer
|
||||||
line = line.replace("``", '"').replace("''", '"')
|
line = line.replace("``", '"').replace("''", '"')
|
||||||
|
|
||||||
|
@ -17,13 +17,6 @@ impl Normalizer for Precompiled {
|
|||||||
normalized.get().graphemes(true).for_each(|grapheme| {
|
normalized.get().graphemes(true).for_each(|grapheme| {
|
||||||
if grapheme.len() < 6 {
|
if grapheme.len() < 6 {
|
||||||
if let Some(norm) = self.transform(grapheme) {
|
if let Some(norm) = self.transform(grapheme) {
|
||||||
// debug!(
|
|
||||||
// "Replacing {:?}({:?}) by {:?}({:?})",
|
|
||||||
// grapheme,
|
|
||||||
// grapheme.chars().count(),
|
|
||||||
// norm,
|
|
||||||
// norm.chars().count()
|
|
||||||
// );
|
|
||||||
let old_count = grapheme.chars().count() as isize;
|
let old_count = grapheme.chars().count() as isize;
|
||||||
let new_count = norm.chars().count() as isize;
|
let new_count = norm.chars().count() as isize;
|
||||||
for (i, c) in norm.chars().enumerate() {
|
for (i, c) in norm.chars().enumerate() {
|
||||||
@ -42,13 +35,6 @@ impl Normalizer for Precompiled {
|
|||||||
if let Some(norm) = self.transform(part) {
|
if let Some(norm) = self.transform(part) {
|
||||||
let old_count = part.chars().count() as isize;
|
let old_count = part.chars().count() as isize;
|
||||||
let new_count = norm.chars().count() as isize;
|
let new_count = norm.chars().count() as isize;
|
||||||
// debug!(
|
|
||||||
// "Replacing {:?}({:?}) by {:?}({:?})",
|
|
||||||
// part,
|
|
||||||
// part.chars().count(),
|
|
||||||
// norm,
|
|
||||||
// norm.chars().count()
|
|
||||||
// );
|
|
||||||
for (i, c) in norm.chars().enumerate() {
|
for (i, c) in norm.chars().enumerate() {
|
||||||
let n = if i == 0 {
|
let n = if i == 0 {
|
||||||
new_count - old_count
|
new_count - old_count
|
||||||
@ -62,7 +48,6 @@ impl Normalizer for Precompiled {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
// debug!("Normalized {:?}", normalized);
|
|
||||||
normalized.transform(transformations.into_iter(), 0);
|
normalized.transform(transformations.into_iter(), 0);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user