Implement suggestions by @sebpuetz
Co-authored-by: Sebastian Pütz <sebastian.puetz@uni-tuebingen.de>
@@ -61,7 +61,7 @@ impl PreTokenizer {
             .into_py()?;

         Ok(pretokenized
-            .get_normalized(true)
+            .get_normalized(tk::OffsetReferential::Original)
             .into_iter()
             .map(|(s, o)| (s.to_owned(), o))
             .collect())
@@ -32,6 +32,7 @@ impl PreTokenizer for BertPreTokenizer {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::OffsetReferential;

     #[test]
     fn basic() {
@@ -39,7 +40,7 @@ mod tests {
         let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into();
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
-            pretokenized.get_normalized(true),
+            pretokenized.get_normalized(OffsetReferential::Original),
             vec![
                 ("Hey", (0, 3)),
                 ("", (3, 4)),
@@ -218,8 +218,8 @@ pub fn process_offsets(encoding: &mut Encoding, add_prefix_space: bool) {
 mod tests {
     use super::ByteLevel;
     use crate::tokenizer::{
-        normalizer::Range, Decoder, Encoding, NormalizedString, PostProcessor, PreTokenizedString,
-        PreTokenizer,
+        normalizer::Range, Decoder, Encoding, NormalizedString, OffsetReferential, PostProcessor,
+        PreTokenizedString, PreTokenizer,
     };

     #[test]
@@ -228,7 +228,7 @@ mod tests {
         let mut pretokenized: PreTokenizedString = "Hello my friend, how is your day going?".into();
         bytelevel.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
-            pretokenized.get_normalized(true),
+            pretokenized.get_normalized(OffsetReferential::Original),
             vec![
                 ("Hello", (0, 5)),
                 ("Ġmy", (5, 8)),
@@ -273,7 +273,7 @@ mod tests {
             let mut pretokenized = PreTokenizedString::from(*s);
             bytelevel.pre_tokenize(&mut pretokenized).unwrap();
             assert_eq!(
-                pretokenized.get_normalized(false),
+                pretokenized.get_normalized(OffsetReferential::Normalized),
                 vec![
                     ("ĠHello", (0, 6)),
                     ("Ġmy", (6, 9)),
@@ -317,7 +317,7 @@ mod tests {
         bytelevel.pre_tokenize(&mut pretokenized).unwrap();

         assert_eq!(
-            pretokenized.get_normalized(true),
+            pretokenized.get_normalized(OffsetReferential::Original),
             vec![
                 ("Hello", (0, 5)),
                 ("Ġthere", (5, 11)),
@@ -335,7 +335,7 @@ mod tests {
         bytelevel.pre_tokenize(&mut pretokenized).unwrap();

         assert_eq!(
-            pretokenized.get_normalized(true),
+            pretokenized.get_normalized(OffsetReferential::Original),
             vec![
                 ("Hello", (0, 5)),
                 ("Ġthere", (5, 11)),
@@ -352,11 +352,11 @@ mod tests {
         bytelevel.pre_tokenize(&mut pretokenized).unwrap();

         assert_eq!(
-            pretokenized.get_normalized(true),
+            pretokenized.get_normalized(OffsetReferential::Original),
             vec![("i", (0, 1)), ("âŃ¢", (1, 2)), ("j", (2, 3))]
         );
         assert_eq!(
-            pretokenized.get_normalized(false),
+            pretokenized.get_normalized(OffsetReferential::Normalized),
             vec![("i", (0, 1)), ("âŃ¢", (1, 4)), ("j", (4, 5))]
         );
         assert_eq!(
@@ -6,25 +6,18 @@ use serde::{Deserialize, Serialize};
 /// splits on this character
 pub struct Metaspace {
     replacement: char,
-    str_bytes: [u8; 4],
+    str_rep: String,
     add_prefix_space: bool,
 }

 impl Metaspace {
     pub fn new(replacement: char, add_prefix_space: bool) -> Self {
-        let mut str_bytes = [0; 4];
-        replacement.encode_utf8(&mut str_bytes);
         Self {
             replacement,
-            str_bytes,
+            str_rep: replacement.to_string(),
             add_prefix_space,
         }
     }
-
-    #[inline]
-    fn replacement(&self) -> &str {
-        unsafe { std::str::from_utf8_unchecked(&self.str_bytes[..self.replacement.len_utf8()]) }
-    }
 }

 impl Default for Metaspace {
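
Note on the Metaspace change above: the cached `[u8; 4]` buffer and the `unsafe` `from_utf8_unchecked` accessor are replaced by an owned `String` built once in `new`. A minimal standalone sketch (plain std, not the crate's code) showing that both approaches yield the same `&str`:

```rust
fn main() {
    let replacement = '▁';

    // Old approach: encode the char into a stack buffer and view it as &str.
    let mut buf = [0u8; 4];
    let from_buf: &str = replacement.encode_utf8(&mut buf);

    // New approach: build an owned String once and reuse it.
    let from_string = replacement.to_string();

    assert_eq!(from_buf, from_string);
    assert_eq!(from_string.len(), replacement.len_utf8());
}
```
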
@@ -38,14 +31,14 @@ impl PreTokenizer for Metaspace {
     fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
         pretokenized.split(|_, mut normalized| {
             if self.add_prefix_space {
-                normalized.prepend(&self.replacement());
+                normalized.prepend(&self.str_rep);
             }

             Ok(normalized
                 .split(' ', SplitDelimiterBehavior::MergedWithNext)?
                 .into_iter()
                 .map(|mut normalized| {
-                    normalized.replace(' ', self.replacement())?;
+                    normalized.replace(' ', &self.str_rep)?;
                     Ok(normalized)
                 })
                 .collect::<Result<Vec<_>>>()?)
@@ -78,6 +71,7 @@ impl Decoder for Metaspace {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use crate::OffsetReferential;

     #[test]
     fn basic() {
@@ -85,11 +79,11 @@ mod tests {
         let mut pretokenized = PreTokenizedString::from("Hey friend!");
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
-            pretokenized.get_normalized(false),
+            pretokenized.get_normalized(OffsetReferential::Normalized),
             vec![("▁Hey", (0, 4)), ("▁friend!", (4, 12))]
         );
         assert_eq!(
-            pretokenized.get_normalized(true),
+            pretokenized.get_normalized(OffsetReferential::Original),
             vec![("▁Hey", (0, 3)), ("▁friend!", (3, 11))]
         );
     }
@@ -100,7 +94,7 @@ mod tests {
         let mut pretokenized = PreTokenizedString::from("Hey friend!");
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
-            pretokenized.get_normalized(false),
+            pretokenized.get_normalized(OffsetReferential::Normalized),
             vec![
                 ("▁Hey", (0, 4)),
                 ("▁", (4, 5)),
@@ -109,7 +103,7 @@ mod tests {
             ]
         );
         assert_eq!(
-            pretokenized.get_normalized(true),
+            pretokenized.get_normalized(OffsetReferential::Original),
             vec![
                 ("▁Hey", (0, 3)),
                 ("▁", (3, 4)),
@@ -45,7 +45,7 @@ impl PreTokenizer for WhitespaceSplit {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::tokenizer::PreTokenizer;
+    use crate::{OffsetReferential, PreTokenizer};

     #[test]
     fn basic() {
@@ -77,7 +77,10 @@ mod tests {
         for (s, res) in tests {
             let mut pretokenized = PreTokenizedString::from(s);
             pretok.pre_tokenize(&mut pretokenized).unwrap();
-            assert_eq!(pretokenized.get_normalized(true), res);
+            assert_eq!(
+                pretokenized.get_normalized(OffsetReferential::Original),
+                res
+            );
         }
     }

@@ -103,7 +106,10 @@ mod tests {
         for (s, res) in tests {
             let mut pretokenized = PreTokenizedString::from(s);
             pretok.pre_tokenize(&mut pretokenized).unwrap();
-            assert_eq!(pretokenized.get_normalized(true), res);
+            assert_eq!(
+                pretokenized.get_normalized(OffsetReferential::Original),
+                res
+            );
         }
     }
 }
@@ -454,24 +454,22 @@ impl AddedVocabulary {
         pretokenized
             .split(|i, mut sequence| {
                 if let Some(id) = indices[i] {
-                    multi_indices.push(vec![Some(id)]);
+                    multi_indices.push(Some(id));
                     Ok(itertools::Either::Left(std::iter::once(sequence)))
                 } else {
                     normalizer.map(|n| n.normalize(&mut sequence));

                     let (idcs, split) =
                         self.split_with_indices(sequence, &self.split_normalized_re);
-                    multi_indices.push(idcs);
+                    multi_indices.extend(idcs);
                     Ok(itertools::Either::Right(split))
                 }
             })
             .expect("AddedVocabulary bad split");

-        let indices = multi_indices.into_iter().flatten().collect::<Vec<_>>();
-
         pretokenized
             .into_iter()
-            .zip(indices)
+            .zip(multi_indices)
             .map(|(substring, id)| {
                 (
                     substring.normalized,
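
Note on the `multi_indices` change above: since each branch now contributes either a single entry or an already-flat list, the per-element `vec![...]` wrapping and the later `flatten` pass are no longer needed. A tiny sketch of the equivalence on plain vectors (hypothetical values):

```rust
fn main() {
    let idcs: Vec<Option<u32>> = vec![None, Some(7)];

    // Old shape: wrap every entry, then flatten at the end.
    let mut nested: Vec<Vec<Option<u32>>> = vec![vec![Some(3)]];
    nested.push(idcs.clone());
    let flat_old: Vec<Option<u32>> = nested.into_iter().flatten().collect();

    // New shape: push single values and extend with whole lists, no flatten needed.
    let mut flat_new: Vec<Option<u32>> = vec![Some(3)];
    flat_new.extend(idcs);

    assert_eq!(flat_old, flat_new);
}
```
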
@@ -47,6 +47,19 @@ impl Encoding {
         }
     }

+    pub fn with_capacity(len: usize) -> Self {
+        Encoding {
+            ids: Vec::with_capacity(len),
+            type_ids: Vec::with_capacity(len),
+            tokens: Vec::with_capacity(len),
+            words: Vec::with_capacity(len),
+            offsets: Vec::with_capacity(len),
+            special_tokens_mask: Vec::with_capacity(len),
+            attention_mask: Vec::with_capacity(len),
+            overflowing: vec![],
+        }
+    }
+
     pub fn from_tokens(tokens: Vec<Token>, type_id: u32) -> Self {
         let length = tokens.len();
         let (ids, tokens, offsets) = tokens.into_iter().fold(
@@ -404,6 +417,27 @@ impl std::iter::FromIterator<Encoding> for Encoding {
     }
 }

+impl std::iter::FromIterator<(u32, String, (usize, usize), Option<u32>, u32)> for Encoding {
+    fn from_iter<I: IntoIterator<Item = (u32, String, (usize, usize), Option<u32>, u32)>>(
+        iter: I,
+    ) -> Self {
+        let items = iter.into_iter();
+        let (lower, upper) = items.size_hint();
+        let length = upper.unwrap_or(lower);
+        let mut encoding = Self::with_capacity(length);
+
+        for (id, token, offsets, word, type_id) in items {
+            encoding.ids.push(id);
+            encoding.tokens.push(token);
+            encoding.offsets.push(offsets);
+            encoding.type_ids.push(type_id);
+            encoding.words.push(word);
+        }
+
+        encoding
+    }
+}
+
 #[inline]
 fn get_current_part<T: Clone>(
     prev: &[T],
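
The two additions above work together: `with_capacity` pre-sizes every column, and the new `FromIterator` impl uses the iterator's `size_hint` to pick that capacity before pushing per-token tuples. A simplified, self-contained sketch of the same pattern (hypothetical `MiniEncoding`, not the crate's `Encoding`):

```rust
#[derive(Debug)]
struct MiniEncoding {
    ids: Vec<u32>,
    tokens: Vec<String>,
    offsets: Vec<(usize, usize)>,
}

impl std::iter::FromIterator<(u32, String, (usize, usize))> for MiniEncoding {
    fn from_iter<I: IntoIterator<Item = (u32, String, (usize, usize))>>(iter: I) -> Self {
        let items = iter.into_iter();
        // Pre-size the columns from the iterator's size hint, as the real impl does.
        let (lower, upper) = items.size_hint();
        let len = upper.unwrap_or(lower);
        let mut enc = MiniEncoding {
            ids: Vec::with_capacity(len),
            tokens: Vec::with_capacity(len),
            offsets: Vec::with_capacity(len),
        };
        for (id, token, offsets) in items {
            enc.ids.push(id);
            enc.tokens.push(token);
            enc.offsets.push(offsets);
        }
        enc
    }
}

fn main() {
    let enc: MiniEncoding = vec![
        (0, "Hey".to_string(), (0, 3)),
        (1, "friend".to_string(), (4, 10)),
    ]
    .into_iter()
    .collect();
    println!("{:?}", enc);
}
```
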
@@ -31,7 +31,7 @@ mod serialization;

 pub use added_vocabulary::*;
 pub use encoding::*;
-pub use normalizer::{NormalizedString, SplitDelimiterBehavior};
+pub use normalizer::{NormalizedString, OffsetReferential, SplitDelimiterBehavior};
 pub use pre_tokenizer::*;

 pub type Error = Box<dyn std::error::Error + Send + Sync>;
@@ -423,6 +423,7 @@ impl Tokenizer {
             .added_vocabulary
             .extract_and_normalize(self.normalizer.as_deref(), &subseq)
             .map(|(normalized, original_offsets, id)| match id {
+                // This is an added token, no need to tokenize, we have the ID
                 Some(id) => {
                     let mut encoding = Encoding::from_tokens(
                         vec![Token::new(
@@ -435,6 +436,7 @@ impl Tokenizer {
                     encoding.get_words_mut()[0] = Some(0);
                     Ok(encoding)
                 }
+                // Let's tokenize
                 None => self.do_tokenize(
                     self.do_pre_tokenize(normalized)?,
                     original_offsets,
@@ -675,45 +677,40 @@ impl Tokenizer {
     ) -> Result<Encoding> {
         let pretokenized: PreTokenizedString = pretokenized.into();

-        let mut empty_words = 0;
         pretokenized
             .into_iter()
+            .filter(|substr| !substr.normalized.is_empty())
             .enumerate()
-            .map(|(word_idx, substr)| {
-                if substr.normalized.is_empty() {
-                    empty_words += 1;
-                    return Ok(Encoding::default());
-                }
-
-                let mut tokens = self.model.tokenize(substr.normalized.get())?;
-
-                // Update the offsets to match the original input
-                tokens.iter_mut().for_each(|token| {
-                    // We convert the normalized offsets back to the original
-                    let converted_offsets = substr
-                        .normalized
-                        .convert_offsets(Range::Normalized(token.offsets.0..token.offsets.1))
-                        .map_or(token.offsets, |range| {
-                            (
-                                original_offsets.0 + substr.original_offsets.0 + range.start,
-                                original_offsets.0 + substr.original_offsets.0 + range.end,
-                            )
-                        });
-
-                    // And we update the token to these original offsets, applying the original offset
-                    // of the sequence we just tokenized.
-                    token.offsets = converted_offsets;
-                });
-
-                // Then build the encoding from these tokens, setting the `words` as relevant
-                let mut encoding = Encoding::from_tokens(tokens, type_id);
-                encoding.get_words_mut().iter_mut().for_each(|word| {
-                    // empty words are generally spaces, and other things
-                    // that were normalized out, so we dont want to count them in.
-                    *word = Some(word_idx as u32 - empty_words);
-                });
-
-                Ok(encoding)
+            .flat_map(|(word_idx, substr)| {
+                match self.model.tokenize(substr.normalized.get()) {
+                    Ok(tokens) => {
+                        itertools::Either::Left(tokens.into_iter().map(move |token| {
+                            // We convert the normalized offsets back to the original
+                            let converted_offsets = substr
+                                .normalized
+                                .convert_offsets(Range::Normalized(
+                                    token.offsets.0..token.offsets.1,
+                                ))
+                                .map_or(token.offsets, |range| {
+                                    (
+                                        original_offsets.0
+                                            + substr.original_offsets.0
+                                            + range.start,
+                                        original_offsets.0 + substr.original_offsets.0 + range.end,
+                                    )
+                                });
+
+                            Ok((
+                                token.id,
+                                token.value,
+                                converted_offsets,
+                                Some(word_idx as u32),
+                                type_id,
+                            ))
+                        }))
+                    }
+                    Err(e) => itertools::Either::Right(std::iter::once(Err(e))),
+                }
             })
             .collect()
     }
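
Note on word indices in the rewrite above: the old code enumerated every substring and subtracted a running count of empties, while the new code filters empty substrings out before `enumerate`, so both assign the same contiguous word indices. A minimal sketch of that bookkeeping on plain string slices:

```rust
fn main() {
    let substrings = ["Hey", "", "friend"];

    // Old approach: enumerate everything, subtract a running count of empties.
    let mut empty_words = 0;
    let old: Vec<(usize, &str)> = substrings
        .iter()
        .enumerate()
        .filter_map(|(word_idx, s)| {
            if s.is_empty() {
                empty_words += 1;
                None
            } else {
                Some((word_idx - empty_words, *s))
            }
        })
        .collect();

    // New approach: drop empty substrings first, then enumerate.
    let new: Vec<(usize, &str)> = substrings
        .iter()
        .filter(|s| !s.is_empty())
        .enumerate()
        .map(|(word_idx, s)| (word_idx, *s))
        .collect();

    assert_eq!(old, new);
}
```
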
@@ -5,6 +5,12 @@ use crate::{Offsets, Result};
 use std::ops::{Bound, RangeBounds};
 use unicode_normalization_alignments::UnicodeNormalization;

+/// The possible offsets referential
+pub enum OffsetReferential {
+    Original,
+    Normalized,
+}
+
 /// Represents a Range usable by the NormalizedString to index its content.
 /// A Range can use indices relative to either the `Original` or the `Normalized` string
 #[derive(Debug, Clone)]
@@ -300,11 +306,11 @@ impl NormalizedString {

         Some(Self {
             original: get_range_of(&self.original, r_original)
-                .unwrap_or("")
-                .to_owned(),
+                .unwrap_or_default()
+                .into(),
             normalized: get_range_of(&self.normalized, r_normalized.clone())
-                .unwrap_or("")
-                .to_owned(),
+                .unwrap_or_default()
+                .into(),
             alignments: self
                 .alignments
                 .get(r_normalized)?
@@ -462,7 +468,7 @@ impl NormalizedString {
     pub fn replace<P: Pattern>(&mut self, pattern: P, content: &str) -> Result<()> {
         let matches = pattern.find_matches(&self.normalized)?;

-        let (normalized, alignments): (Vec<char>, Vec<Offsets>) = matches
+        let (normalized, alignments): (String, Vec<Offsets>) = matches
             .into_iter()
             .flat_map(|((start, end), is_match)| {
                 let len = end - start;
@@ -490,7 +496,7 @@ impl NormalizedString {
             })
             .unzip();

-        self.normalized = normalized.into_iter().collect();
+        self.normalized = normalized;
         self.alignments = alignments;

         Ok(())
@@ -31,34 +31,18 @@ impl Pattern for &Regex {
             return Ok(vec![((0, 0), false)]);
         }

-        // Find initial matches
-        let matches = self
-            .find_iter(inside)
-            .map(|m| ((m.start(), m.end()), true))
-            .collect::<Vec<_>>();
-
-        // Then add missing splits inbetween
-        let mut start_offset = 0;
-        let mut splits = matches
-            .into_iter()
-            .flat_map(|((start, end), flag)| {
-                let mut splits = vec![];
-                if start_offset < start {
-                    splits.push(((start_offset, start), false));
-                }
-                splits.push(((start, end), flag));
-                start_offset = end;
-
-                splits
-            })
-            .collect::<Vec<_>>();
-
-        if let Some(((_, end), _)) = splits.iter().last().copied() {
-            if end < inside.len() {
-                splits.push(((end, inside.len()), false));
-            }
-        }
-
+        let mut prev = 0;
+        let mut splits = Vec::with_capacity(inside.len());
+        for m in self.find_iter(inside) {
+            if prev != m.start() {
+                splits.push(((prev, m.start()), false));
+            }
+            splits.push(((m.start(), m.end()), true));
+            prev = m.end();
+        }
+        if prev != inside.len() {
+            splits.push(((prev, inside.len()), false))
+        }
         Ok(splits)
     }
 }
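
The `find_matches` rewrite above replaces the collect-then-backfill approach with a single pass: emit the unmatched gap before each match, the match itself, then a trailing gap at the end. A self-contained sketch of the same control flow, using `str::match_indices` with a literal pattern in place of a regex:

```rust
/// Split `inside` into (range, is_match) pairs covering the whole string,
/// mirroring the single-pass control flow of the new `find_matches`.
fn find_matches_lit(inside: &str, pat: &str) -> Vec<((usize, usize), bool)> {
    if inside.is_empty() {
        return vec![((0, 0), false)];
    }
    let mut prev = 0;
    let mut splits = Vec::new();
    for (start, m) in inside.match_indices(pat) {
        if prev != start {
            splits.push(((prev, start), false)); // gap before this match
        }
        splits.push(((start, start + m.len()), true)); // the match itself
        prev = start + m.len();
    }
    if prev != inside.len() {
        splits.push(((prev, inside.len()), false)); // trailing gap
    }
    splits
}

fn main() {
    assert_eq!(
        find_matches_lit("a b c", " "),
        vec![
            ((0, 1), false),
            ((1, 2), true),
            ((2, 3), false),
            ((3, 4), true),
            ((4, 5), false),
        ]
    );
}
```
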
@@ -1,4 +1,4 @@
-use crate::{NormalizedString, Offsets, Result};
+use crate::{NormalizedString, OffsetReferential, Offsets, Result};

 /// Wrapper for a subpart of a `NormalizedString`.
 ///
@@ -16,7 +16,15 @@ pub struct SubString {
     pub original_offsets: Offsets,
 }

-/// A `PreTokenizedString` takes care of splitting the input string in multiple
+impl SubString {
+    pub fn new(normalized: NormalizedString, original_offsets: Offsets) -> Self {
+        Self {
+            normalized,
+            original_offsets,
+        }
+    }
+}
+
 /// sub strings, while ensuring that they form a coherend group. This let us keep
 /// track of the offsets during the whole normalization and pre-tokenization steps.
 #[derive(Debug)]
@@ -42,52 +50,28 @@ impl PreTokenizedString {
         F: FnMut(usize, NormalizedString) -> Result<U>,
         U: IntoIterator<Item = NormalizedString>,
     {
-        self.parts = self
-            .parts
-            .drain(..)
-            .enumerate()
-            .flat_map(|(i, sub)| {
-                let original_len = sub.normalized.len_original();
-                let original_offsets = sub.original_offsets;
-
-                let mut new_len = 0;
-                let res = split_fn(i, sub.normalized);
-                if let Err(e) = res {
-                    return itertools::Either::Left(std::iter::once(Err(e)));
-                }
-
-                let parts = res
-                    .unwrap()
-                    .into_iter()
-                    .map(|normalized| {
-                        let len = normalized.len_original();
-                        let new_s = SubString {
-                            normalized,
-                            original_offsets: (
-                                original_offsets.0 + new_len,
-                                original_offsets.0 + new_len + len,
-                            ),
-                        };
-                        new_len += len;
-                        new_s
-                    })
-                    .collect::<Vec<_>>();
-
-                if new_len != original_len {
-                    println!(
-                        "Original offsets: {:?}\nNew: {:?}",
-                        (0, original_len),
-                        (0, new_len)
-                    );
-                    itertools::Either::Left(std::iter::once(Err(
-                        "Split pre-tokenized string must represent the entire original string"
-                            .into(),
-                    )))
-                } else {
-                    itertools::Either::Right(parts.into_iter().map(Ok))
-                }
-            })
-            .collect::<Result<Vec<_>>>()?;
+        // new_parts is at least as big as self.parts
+        let mut new_parts = Vec::with_capacity(self.parts.len());
+        for (i, sub) in self.parts.drain(..).enumerate() {
+            let original_len = sub.normalized.len_original();
+            let original_offsets = sub.original_offsets;
+
+            let mut new_len = 0;
+            new_parts.extend(split_fn(i, sub.normalized)?.into_iter().map(|normalized| {
+                let len = normalized.len_original();
+                let start = original_offsets.0 + new_len;
+                let end = original_offsets.0 + new_len + len;
+                let new_s = SubString::new(normalized, (start, end));
+                new_len += len;
+                new_s
+            }));
+            if original_len != new_len {
+                return Err(
+                    "Split pre-tokenized string must represent the entire original string".into(),
+                );
+            }
+        }
+        self.parts = new_parts;

         Ok(())
     }
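
The rewrite above keeps the same invariant but drops the debugging `println!` and returns the error directly: after a split, the pieces' original lengths must add up to the length of the piece that was split. A minimal sketch of that invariant on plain strings (hypothetical `check_split`, not the crate's code):

```rust
// Minimal sketch of the length check enforced after a split (plain &str instead
// of NormalizedString): every byte of the original piece must be accounted for.
fn check_split(original: &str, parts: &[&str]) -> Result<(), String> {
    let new_len: usize = parts.iter().map(|p| p.len()).sum();
    if new_len != original.len() {
        return Err(
            "Split pre-tokenized string must represent the entire original string".to_string(),
        );
    }
    Ok(())
}

fn main() {
    assert!(check_split("Hey friend", &["Hey ", "friend"]).is_ok());
    // Dropping the space loses a byte of the original, so the check fails.
    assert!(check_split("Hey friend", &["Hey", "friend"]).is_err());
}
```
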
@@ -98,19 +82,20 @@ impl PreTokenizedString {

     /// Returns a list of normalized string and the associated offsets,
     /// either in original or normalized referential
-    pub fn get_normalized(&self, original: bool) -> Vec<(&str, Offsets)> {
+    pub fn get_normalized(&self, offset_type: OffsetReferential) -> Vec<(&str, Offsets)> {
         let mut offset = 0;
         self.iter()
             .map(|sub| {
-                let offsets = if original {
-                    (
+                let offsets = match offset_type {
+                    OffsetReferential::Original => (
                         sub.original_offsets.0,
                         sub.original_offsets.0 + sub.normalized.len_original(),
-                    )
-                } else {
+                    ),
+                    OffsetReferential::Normalized => {
                         let len = sub.normalized.len();
                         offset += len;
                         (offset - len, offset)
+                    }
                 };

                 (sub.normalized.get(), offsets)
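
A condensed standalone model of the new `get_normalized` above: each part carries its original offsets, while normalized offsets are accumulated from the parts' normalized lengths. The `Part` type and data here are hypothetical; the numbers mirror the Metaspace test earlier in this diff ("▁Hey", "▁friend!").

```rust
#[derive(Debug, Clone, Copy)]
enum OffsetReferential {
    Original,
    Normalized,
}

// Hypothetical stand-in for a pre-tokenized part: the normalized text, its
// offsets in the original string, and its length in the normalized string.
struct Part {
    normalized: &'static str,
    original_offsets: (usize, usize),
    normalized_len: usize,
}

fn get_normalized(parts: &[Part], offset_type: OffsetReferential) -> Vec<(&str, (usize, usize))> {
    let mut offset = 0;
    parts
        .iter()
        .map(|sub| {
            let offsets = match offset_type {
                OffsetReferential::Original => sub.original_offsets,
                OffsetReferential::Normalized => {
                    let len = sub.normalized_len;
                    offset += len;
                    (offset - len, offset)
                }
            };
            (sub.normalized, offsets)
        })
        .collect()
}

fn main() {
    // Mirrors the Metaspace test: "Hey friend!" pre-tokenized with '▁'.
    let parts = [
        Part { normalized: "▁Hey", original_offsets: (0, 3), normalized_len: 4 },
        Part { normalized: "▁friend!", original_offsets: (3, 11), normalized_len: 8 },
    ];
    assert_eq!(
        get_normalized(&parts, OffsetReferential::Original),
        vec![("▁Hey", (0, 3)), ("▁friend!", (3, 11))]
    );
    assert_eq!(
        get_normalized(&parts, OffsetReferential::Normalized),
        vec![("▁Hey", (0, 4)), ("▁friend!", (4, 12))]
    );
}
```
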
@@ -176,4 +161,3 @@ impl<'a> IntoIterator for &'a PreTokenizedString {
         self.parts.iter()
     }
 }
-