mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Fixing padding_left sequence_ids. (#1233)
This commit is contained in:
@ -512,6 +512,11 @@ impl Encoding {
|
|||||||
.map(|_| (0, 0))
|
.map(|_| (0, 0))
|
||||||
.chain(self.offsets.drain(..))
|
.chain(self.offsets.drain(..))
|
||||||
.collect();
|
.collect();
|
||||||
|
self.sequence_ranges
|
||||||
|
.iter_mut()
|
||||||
|
.for_each(|(_seq_id, range)| {
|
||||||
|
*range = (range.start + pad_length)..(range.end + pad_length)
|
||||||
|
});
|
||||||
}
|
}
|
||||||
PaddingDirection::Right => {
|
PaddingDirection::Right => {
|
||||||
self.ids.extend((0..pad_length).map(|_| pad_id));
|
self.ids.extend((0..pad_length).map(|_| pad_id));
|
||||||
@ -874,4 +879,31 @@ mod tests {
|
|||||||
assert_eq!(encoding.char_to_word(2, 1), Some(0));
|
assert_eq!(encoding.char_to_word(2, 1), Some(0));
|
||||||
assert_eq!(encoding.char_to_word(9, 1), Some(2));
|
assert_eq!(encoding.char_to_word(9, 1), Some(2));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Left-padding must shift every sequence range by the number of pad
/// tokens inserted in front of the original tokens.
///
/// A one-token encoding whose only sequence covers `0..1` is padded on the
/// left up to a length of 2, so a single pad token is prepended and the
/// range for sequence 0 is expected to become `1..2`.
#[test]
fn padding() {
    // Single-token encoding; fields not listed here keep their defaults.
    let mut encoding = Encoding {
        ids: vec![1],
        type_ids: vec![0],
        tokens: vec!["Hello ".to_string()],
        words: vec![Some(0)],
        offsets: vec![(0, 6)],
        special_tokens_mask: vec![0],
        attention_mask: vec![1],
        sequence_ranges: HashMap::from([(0, 0..1)]),
        ..Default::default()
    };

    // Arguments: target length, pad id, pad type id, pad token, direction.
    encoding.pad(2, 99, 0, "[PAD]", PaddingDirection::Left);

    // One pad token was inserted in front, so the range moves right by one.
    let expected_ranges = HashMap::from([(0, 1..2)]);
    assert_eq!(encoding.sequence_ranges, expected_ranges);
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user