Fixing a bug where long tokenizer files would be incorrectly deserialized (#459)

* Fixing a bug where long tokenizer files would be incorrectly
deserialized

- Add a bunch of tests to check deserialization behaviour
- One test also confirms the current Single deserialization of Sequence.

* Better test locations for Windows + no file dependency on the Rust side of the Python bindings.

* Addressing @n1t0's comments.
Author: Nicolas Patry
Date: 2020-10-13 18:44:24 +02:00 (committed by GitHub)
Parent: b3c016cf9c
Commit: 88556790e7
8 changed files with 72 additions and 11 deletions


@@ -483,4 +483,35 @@ mod test {
        let rs_ser = serde_json::to_string(&rs_seq).unwrap();
        assert_eq!(py_wrapper_ser, rs_ser);
    }

    #[test]
    fn deserialize_sequence() {
        // A plain normalizer deserializes into a Single wrapper around the wrapped NFKC.
        let string = r#"{"type": "NFKC"}"#;
        let normalizer: PyNormalizer = serde_json::from_str(&string).unwrap();
        match normalizer.normalizer {
            PyNormalizerTypeWrapper::Single(inner) => match inner.as_ref() {
                PyNormalizerWrapper::Wrapped(NormalizerWrapper::NFKC(_)) => {}
                _ => panic!("Expected NFKC"),
            },
            _ => panic!("Expected wrapped, not sequence."),
        }

        // A Sequence currently also deserializes into a Single wrapper holding the whole
        // Sequence normalizer; this confirms the current behaviour.
        let sequence_string = format!(r#"{{"type": "Sequence", "normalizers": [{}]}}"#, string);
        let normalizer: PyNormalizer = serde_json::from_str(&sequence_string).unwrap();
        match normalizer.normalizer {
            PyNormalizerTypeWrapper::Single(inner) => match inner.as_ref() {
                PyNormalizerWrapper::Wrapped(NormalizerWrapper::Sequence(sequence)) => {
                    let normalizers = sequence.get_normalizers();
                    assert_eq!(normalizers.len(), 1);
                    match normalizers[0] {
                        NormalizerWrapper::NFKC(_) => {}
                        _ => panic!("Expected NFKC"),
                    }
                }
                _ => panic!("Expected sequence"),
            },
            _ => panic!("Expected single"),
        }
    }
}
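
For contrast, here is a minimal sketch (not part of this diff) of deserializing the same Sequence JSON through the plain NormalizerWrapper on the tokenizers crate side. The import path and the assumption that NormalizerWrapper implements Deserialize directly are illustrative assumptions, not something this patch adds.

use tokenizers::normalizers::NormalizerWrapper;

fn main() {
    let json = r#"{"type": "Sequence", "normalizers": [{"type": "NFKC"}]}"#;
    // Assumption: NormalizerWrapper itself implements Deserialize.
    let normalizer: NormalizerWrapper = serde_json::from_str(json).unwrap();
    match normalizer {
        // On the crate side the Sequence variant is expected directly,
        // not behind the bindings' Single wrapper as in the test above.
        NormalizerWrapper::Sequence(seq) => assert_eq!(seq.get_normalizers().len(), 1),
        _ => panic!("Expected Sequence"),
    }
}

This mirrors the inner check of the new test and is only meant to illustrate the difference between the crate enum and the bindings' PyNormalizerTypeWrapper.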