mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
make sure we don't warn on empty tokens (#1554)
* make sure we don't warn on empty tokens
* Testing the log is actually hard 😓
* empty
This commit is contained in:
@ -75,6 +75,8 @@ unstable_wasm = ["fancy-regex", "getrandom/js"]
|
||||
criterion = "0.5"
|
||||
tempfile = "3.10"
|
||||
assert_approx_eq = "1.1"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = "0.3.18"
|
||||
|
||||
[profile.release]
|
||||
lto = "fat"
|
||||
|
@ -155,19 +155,17 @@ where
|
||||
for token in &tokens {
|
||||
// Warn the user if the id is different than expected
|
||||
let received_id = tokenizer.token_to_id(&token.token.content);
|
||||
if received_id != Some(token.id) {
|
||||
if let Some(rid) = received_id {
|
||||
if rid != token.id {
|
||||
warn!(
|
||||
"Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'",
|
||||
token.token.content,
|
||||
token.id,
|
||||
if let Some(rid) = received_id {
|
||||
rid.to_string()
|
||||
} else {
|
||||
"None".to_string()
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
let added_tokens: Vec<_> = tokens.into_iter().map(|token| token.token).collect();
|
||||
tokenizer.add_tokens(&added_tokens[..]);
|
||||
|
||||
@ -179,6 +177,7 @@ where
|
||||
mod tests {
|
||||
use crate::tokenizer::Tokenizer;
|
||||
use std::str::FromStr;
|
||||
use tracing_subscriber::fmt;
|
||||
|
||||
#[test]
|
||||
fn test_deserialization_serialization_invariant() {
|
||||
@ -233,4 +232,15 @@ mod tests {
|
||||
// It should be exactly the same as above
|
||||
assert_eq!(tok_str, tok_json);
|
||||
}
|
||||
|
||||
#[cfg(feature = "http")]
|
||||
#[test]
|
||||
fn test_from_pretrained() {
|
||||
fmt()
|
||||
.with_max_level(tracing::Level::DEBUG)
|
||||
.with_target(false)
|
||||
.init();
|
||||
let _ = Tokenizer::from_pretrained("Qwen/Qwen2-7B-Instruct", None);
|
||||
warn!("This should be the first warning");
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user