From adc82cb49a24e52804f950bd95ff8f625755d85a Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:08:12 +0200 Subject: [PATCH] Add-legacy-tests (#1597) * add tests * decoder as well * check error * propagate * lint * rafiune the test * lint * revert decoder changes * on more? * fmt * Update tokenizers/src/pre_tokenizers/mod.rs Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> * fix commit * simplify err * fmt --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> --- tokenizers/src/pre_tokenizers/mod.rs | 39 ++++++++++++++++++++++++++++ tokenizers/src/processors/mod.rs | 38 +++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs index cf64fb87..c1264610 100644 --- a/tokenizers/src/pre_tokenizers/mod.rs +++ b/tokenizers/src/pre_tokenizers/mod.rs @@ -144,4 +144,43 @@ mod tests { PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit {}) ); } + + #[test] + fn pre_tokenizer_deserialization_no_type() { + let json = r#"{"replacement":"▁","add_prefix_space":true, "prepend_scheme":"always"}}"#; + let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PreTokenizerWrapper" + ), + _ => panic!("Expected an error here"), + } + + let json = r#"{"type":"Metaspace", "replacement":"▁" }"#; + let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); + assert_eq!( + reconstructed.unwrap(), + PreTokenizerWrapper::Metaspace(Metaspace::default()) + ); + + let json = r#"{"type":"Metaspace", "add_prefix_space":true }"#; + let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PreTokenizerWrapper" + ), + _ => panic!("Expected an error here"), + } + let json = r#"{"behavior":"default_split"}"#; + let reconstructed = serde_json::from_str::<PreTokenizerWrapper>(json); + match 
reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PreTokenizerWrapper" + ), + _ => panic!("Expected an error here"), + } + } } diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs index 130a537b..869cc689 100644 --- a/tokenizers/src/processors/mod.rs +++ b/tokenizers/src/processors/mod.rs @@ -87,4 +87,42 @@ mod tests { PostProcessorWrapper::Bert(bert) ); } + + #[test] + fn post_processor_deserialization_no_type() { + let json = r#"{"add_prefix_space": true, "trim_offsets": false, "use_regex": false}"#; + let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PostProcessorWrapper" + ), + _ => panic!("Expected an error here"), + } + + let json = r#"{"sep":["[SEP]",102],"cls":["[CLS]",101]}"#; + let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); + assert!(matches!( + reconstructed.unwrap(), + PostProcessorWrapper::Bert(_) + )); + + let json = + r#"{"sep":["</s>",2], "cls":["<s>",0], "trim_offsets":true, "add_prefix_space":true}"#; + let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); + assert!(matches!( + reconstructed.unwrap(), + PostProcessorWrapper::Roberta(_) + )); + + let json = r#"{"type":"RobertaProcessing", "sep":["</s>",2] }"#; + let reconstructed = serde_json::from_str::<PostProcessorWrapper>(json); + match reconstructed { + Err(err) => assert_eq!( + err.to_string(), + "data did not match any variant of untagged enum PostProcessorWrapper" + ), + _ => panic!("Expected an error here"), + } + } }