mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
@ -34,20 +34,26 @@ impl<'a> Serialize for OrderedVocabIter<'a> {
|
|||||||
S: Serializer,
|
S: Serializer,
|
||||||
{
|
{
|
||||||
// There could be holes so max + 1 is more correct than vocab_r.len()
|
// There could be holes so max + 1 is more correct than vocab_r.len()
|
||||||
if let Some(max) = self.vocab_r.iter().map(|(key, _)| key).max() {
|
let mut holes = vec![];
|
||||||
|
let result = if let Some(max) = self.vocab_r.iter().map(|(key, _)| key).max() {
|
||||||
let iter = (0..*max + 1).filter_map(|i| {
|
let iter = (0..*max + 1).filter_map(|i| {
|
||||||
if let Some(token) = self.vocab_r.get(&i){
|
if let Some(token) = self.vocab_r.get(&i){
|
||||||
Some((token, i))
|
Some((token, i))
|
||||||
}else{
|
}else{
|
||||||
warn!("The OrderedVocab you are attempting to save contains a hole for index {}, your vocabulary could be corrupted !", i);
|
holes.push(i);
|
||||||
println!("The OrderedVocab you are attempting to save contains a hole for index {}, your vocabulary could be corrupted !", i);
|
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
serializer.collect_map(iter)
|
serializer.collect_map(iter)
|
||||||
} else {
|
} else {
|
||||||
serializer.collect_map(std::iter::empty::<(&str, u32)>())
|
serializer.collect_map(std::iter::empty::<(&str, u32)>())
|
||||||
|
};
|
||||||
|
|
||||||
|
if !holes.is_empty(){
|
||||||
|
warn!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
|
||||||
|
println!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
|
||||||
}
|
}
|
||||||
|
result
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user