Update Tokenizer with NormalizedString & Encoding

This commit is contained in:
Anthony MOI
2019-12-28 15:28:44 -05:00
parent 4afcb1ef96
commit f4df7f5e2a

View File

@ -247,8 +247,7 @@ impl Tokenizer {
// If this is one of our added tokens, lets return an encoding directly
if let Some(id) = id {
return Ok(Encoding::new(
sentence.clone(),
sentence.clone(),
NormalizedString::from(&sentence),
vec![id],
vec![type_id],
vec![sentence.to_owned()],
@ -260,13 +259,10 @@ impl Tokenizer {
}
// 1. Normalization
// TODO: Make sure we have the offsets update necessary to go from the original text to
// the normalized one
let original = sentence.clone();
let normalized = self.normalize(sentence)?;
let normalized = self.normalize(&sentence)?;
// 2. Pre tokenization
let pre_tokenized = self.pre_tokenize(&normalized)?;
let pre_tokenized = self.pre_tokenize(&normalized.get())?;
// 3. Model
let output = self.model.tokenize(pre_tokenized)?;
@ -287,7 +283,6 @@ impl Tokenizer {
);
Ok(Encoding::new(
original,
normalized,
ids,
vec![type_id; length],
@ -395,9 +390,12 @@ impl Tokenizer {
for line in file.lines() {
let line = line?;
let normalized = self.normalize(line)?;
let pre_tokenized = self.pre_tokenize(&normalized)?;
trainer.process_tokens(&mut words, pre_tokenized);
let normalized = self.normalize(&line)?;
let pre_tokenized = self.pre_tokenize(normalized.get())?;
trainer.process_tokens(
&mut words,
pre_tokenized.into_iter().map(|(t, _)| t).collect(),
);
}
Ok(words)
@ -420,20 +418,22 @@ impl Tokenizer {
}
/// PreTokenization logic, handling the case where there is no PreTokenizer set
fn pre_tokenize(&self, sentence: &str) -> Result<Vec<String>> {
fn pre_tokenize(&self, sentence: &str) -> Result<Vec<(String, Offsets)>> {
match &self.pre_tokenizer {
None => Ok(vec![sentence.to_owned()]),
None => Ok(vec![(sentence.to_owned(), (0, sentence.len()))]),
Some(pre_tokenizer) => pre_tokenizer.pre_tokenize(sentence),
}
}
/// Normalization logic, go through all normalizers
fn normalize(&self, sentence: String) -> Result<String> {
fn normalize(&self, sequence: &str) -> Result<NormalizedString> {
let mut normalized = NormalizedString::from(sequence);
if let Some(normalizer) = &self.normalizer {
normalizer.normalize(sentence)
} else {
Ok(sentence)
normalizer.normalize(&mut normalized);
}
Ok(normalized)
}
/// Post processing logic, handling the case where there is no PostProcessor set