mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
NormalizedString - Fix added chars at beginning
This commit is contained in:
@ -91,10 +91,10 @@ impl NormalizedString {
|
|||||||
// This is a newly inserted character, so we use the alignment from the
|
// This is a newly inserted character, so we use the alignment from the
|
||||||
// previous one
|
// previous one
|
||||||
Ordering::Greater => {
|
Ordering::Greater => {
|
||||||
|
offset += 1;
|
||||||
if idx < 1 {
|
if idx < 1 {
|
||||||
Some((0, 0))
|
Some((0, 0))
|
||||||
} else {
|
} else {
|
||||||
offset += 1;
|
|
||||||
self.alignments.get(idx - 1).copied()
|
self.alignments.get(idx - 1).copied()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -330,4 +330,25 @@ mod tests {
|
|||||||
assert_eq!(world_n, "world");
|
assert_eq!(world_n, "world");
|
||||||
assert_eq!(world_o, "World");
|
assert_eq!(world_o, "World");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn added_around_edges() {
|
||||||
|
let mut n = NormalizedString::from("Hello");
|
||||||
|
n.transform(
|
||||||
|
vec![
|
||||||
|
(' ', 1),
|
||||||
|
('H', 0),
|
||||||
|
('e', 0),
|
||||||
|
('l', 0),
|
||||||
|
('l', 0),
|
||||||
|
('o', 0),
|
||||||
|
(' ', 1),
|
||||||
|
]
|
||||||
|
.into_iter(),
|
||||||
|
0,
|
||||||
|
);
|
||||||
|
|
||||||
|
assert_eq!(&n.normalized, " Hello ");
|
||||||
|
assert_eq!(n.get_range_original(0..n.normalized.len()), Some("Hello"));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user