Update Tokenizer with NormalizedString & Encoding
@@ -247,8 +247,7 @@ impl Tokenizer {
         // If this is one of our added tokens, lets return an encoding directly
         if let Some(id) = id {
             return Ok(Encoding::new(
-                sentence.clone(),
-                sentence.clone(),
+                NormalizedString::from(&sentence),
                 vec![id],
                 vec![type_id],
                 vec![sentence.to_owned()],
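
Note on the hunk above: the added-token fast path short-circuits the full pipeline, and Encoding::new now receives a single NormalizedString in place of two raw copies of the sentence, since that type bundles the original and normalized views. A minimal sketch of why one argument can replace two; these struct shapes are assumptions for illustration, not the crate's actual definitions:

// Hypothetical minimal shapes, for illustration only.
struct NormalizedString {
    original: String,   // text as received
    normalized: String, // text after normalizers ran
}

struct Encoding {
    normalized: NormalizedString, // replaces the former (original, normalized) String pair
    ids: Vec<u32>,
    type_ids: Vec<u32>,
    tokens: Vec<String>,
}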
@@ -260,13 +259,10 @@ impl Tokenizer {
         }

         // 1. Normalization
-        // TODO: Make sure we have the offsets update necessary to go from the original text to
-        // the normalized one
-        let original = sentence.clone();
-        let normalized = self.normalize(sentence)?;
+        let normalized = self.normalize(&sentence)?;

         // 2. Pre tokenization
-        let pre_tokenized = self.pre_tokenize(&normalized)?;
+        let pre_tokenized = self.pre_tokenize(&normalized.get())?;

         // 3. Model
         let output = self.model.tokenize(pre_tokenized)?;
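
The encode path now reads as three explicit stages: normalize, pre-tokenize over the normalized view, then run the model. A self-contained sketch of that flow, where lowercasing stands in for a normalizer and whitespace splitting for a pre-tokenizer (both are stand-ins, not the crate's components):

// Rough sketch of stages 1-3; returns pass-through tokens instead of model output.
fn encode_sketch(sentence: &str) -> Vec<String> {
    // 1. Normalization
    let normalized: String = sentence.to_lowercase();

    // 2. Pre-tokenization: substrings paired with their byte offsets in `normalized`
    let pre_tokenized: Vec<(String, (usize, usize))> = normalized
        .split_whitespace()
        .map(|w| {
            let start = w.as_ptr() as usize - normalized.as_ptr() as usize;
            (w.to_owned(), (start, start + w.len()))
        })
        .collect();

    // 3. Model: a real model maps substrings to tokens/ids; pass them through here.
    pre_tokenized.into_iter().map(|(t, _)| t).collect()
}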
@@ -287,7 +283,6 @@ impl Tokenizer {
         );

         Ok(Encoding::new(
-            original,
             normalized,
             ids,
             vec![type_id; length],
@@ -395,9 +390,12 @@ impl Tokenizer {

         for line in file.lines() {
             let line = line?;
-            let normalized = self.normalize(line)?;
-            let pre_tokenized = self.pre_tokenize(&normalized)?;
-            trainer.process_tokens(&mut words, pre_tokenized);
+            let normalized = self.normalize(&line)?;
+            let pre_tokenized = self.pre_tokenize(normalized.get())?;
+            trainer.process_tokens(
+                &mut words,
+                pre_tokenized.into_iter().map(|(t, _)| t).collect(),
+            );
         }

         Ok(words)
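
Since pre_tokenize now yields (substring, offsets) pairs, the training loop strips the offsets with `.map(|(t, _)| t)` before handing plain strings to the trainer. A self-contained sketch of that word-counting shape, with a HashMap standing in for the trainer's internal state:

use std::collections::HashMap;

// Count word frequencies from (token, offsets) pairs, discarding offsets
// exactly as the diff does.
fn count_words(lines: &[&str]) -> HashMap<String, u32> {
    let mut words = HashMap::new();
    for line in lines {
        let pre_tokenized: Vec<(String, (usize, usize))> = line
            .split_whitespace()
            .map(|w| (w.to_owned(), (0, w.len()))) // offsets simplified for the sketch
            .collect();
        for token in pre_tokenized.into_iter().map(|(t, _)| t) {
            *words.entry(token).or_insert(0) += 1;
        }
    }
    words
}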
@@ -420,20 +418,22 @@ impl Tokenizer {
     }

     /// PreTokenization logic, handling the case where there is no PreTokenizer set
-    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<String>> {
+    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<(String, Offsets)>> {
         match &self.pre_tokenizer {
-            None => Ok(vec![sentence.to_owned()]),
+            None => Ok(vec![(sentence.to_owned(), (0, sentence.len()))]),
             Some(pre_tokenizer) => pre_tokenizer.pre_tokenize(sentence),
         }
     }

     /// Normalization logic, go through all normalizers
-    fn normalize(&self, sentence: String) -> Result<String> {
+    fn normalize(&self, sequence: &str) -> Result<NormalizedString> {
+        let mut normalized = NormalizedString::from(sequence);
+
         if let Some(normalizer) = &self.normalizer {
-            normalizer.normalize(sentence)
-        } else {
-            Ok(sentence)
+            normalizer.normalize(&mut normalized);
         }
+
+        Ok(normalized)
     }

     /// Post processing logic, handling the case where there is no PostProcessor set
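
The reworked normalize no longer consumes the String and branches between Ok(sentence) and the normalizer's output; it always wraps the input in a NormalizedString and lets an optional normalizer mutate it in place. A self-contained sketch of that pattern, using a stand-in type and a lowercasing normalizer (both assumptions; the crate's real NormalizedString also tracks offset alignments):

// Stand-in type: keeps both views of the text, like the diff's NormalizedString.
struct NormalizedString {
    original: String,
    normalized: String,
}

impl NormalizedString {
    fn from(s: &str) -> Self {
        // Before any normalizer runs, both views are identical.
        Self { original: s.to_owned(), normalized: s.to_owned() }
    }
    fn get(&self) -> &str {
        &self.normalized
    }
}

// Stand-in normalizer mutating the normalized view in place.
struct Lowercase;
impl Lowercase {
    fn normalize(&self, n: &mut NormalizedString) {
        n.normalized = n.normalized.to_lowercase();
    }
}

// Same shape as the new Tokenizer::normalize: build, optionally mutate, return.
fn normalize(sequence: &str, normalizer: Option<&Lowercase>) -> NormalizedString {
    let mut normalized = NormalizedString::from(sequence);
    if let Some(n) = normalizer {
        n.normalize(&mut normalized);
    }
    normalized
}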