mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Fixing a bug where long tokenizer files would be incorrectly deserialized (#459)
* Fixing a bug where long tokenizer files would be incorrectly deserialized - Add a bunch of tests to check deserialization behaviour - One test also confirms the current Single deserialization of Sequence. * Better test locations for Windows + no file dependency in Python binding Rust side. * Addressing @n1t0 comments.
This commit is contained in:
@ -483,4 +483,35 @@ mod test {
|
||||
let rs_ser = serde_json::to_string(&rs_seq).unwrap();
|
||||
assert_eq!(py_wrapper_ser, rs_ser);
|
||||
}
|
||||
|
||||
/// Checks how `PyNormalizer` deserializes from JSON in two cases:
///
/// 1. A bare `{"type": "NFKC"}` object must produce
///    `PyNormalizerTypeWrapper::Single` holding a wrapped `NormalizerWrapper::NFKC`.
/// 2. A `{"type": "Sequence", ...}` object containing that same NFKC entry must
///    also produce `PyNormalizerTypeWrapper::Single`, this time holding a wrapped
///    `NormalizerWrapper::Sequence` with exactly one NFKC normalizer inside.
///
/// # Panics
///
/// Panics (failing the test) if either JSON document fails to deserialize or
/// if the resulting variant is not the expected one.
#[test]
fn deserialize_sequence() {
    let string = r#"{"type": "NFKC"}"#;
    // `string` is already a `&str`, so it is passed directly (no extra borrow).
    let normalizer: PyNormalizer = serde_json::from_str(string).unwrap();
    match normalizer.normalizer {
        PyNormalizerTypeWrapper::Single(inner) => match inner.as_ref() {
            PyNormalizerWrapper::Wrapped(NormalizerWrapper::NFKC(_)) => {}
            _ => panic!("Expected NFKC"),
        },
        _ => panic!("Expected wrapped, not sequence."),
    }

    // Embed the single-normalizer JSON as the only element of a Sequence.
    let sequence_string = format!(r#"{{"type": "Sequence", "normalizers": [{}]}}"#, string);
    let normalizer: PyNormalizer = serde_json::from_str(&sequence_string).unwrap();

    match normalizer.normalizer {
        // The serialized Sequence is expected to come back as `Single` wrapping
        // a `NormalizerWrapper::Sequence`; any other outer variant fails the test.
        PyNormalizerTypeWrapper::Single(inner) => match inner.as_ref() {
            PyNormalizerWrapper::Wrapped(NormalizerWrapper::Sequence(sequence)) => {
                let normalizers = sequence.get_normalizers();
                assert_eq!(normalizers.len(), 1);
                match normalizers[0] {
                    NormalizerWrapper::NFKC(_) => {}
                    _ => panic!("Expected NFKC"),
                }
            }
            _ => panic!("Expected sequence"),
        },
        _ => panic!("Expected single"),
    }
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user