make sure we don't warn on empty tokens (#1554)

* make sure we don't warn on empty tokens

* Testing the log is actually hard 😓

* empty
This commit is contained in:
Arthur
2024-06-20 14:33:21 +02:00
committed by GitHub
parent 3e736bbccb
commit 9441f7e8f7
2 changed files with 22 additions and 10 deletions

View File

@ -75,6 +75,8 @@ unstable_wasm = ["fancy-regex", "getrandom/js"]
criterion = "0.5"
tempfile = "3.10"
assert_approx_eq = "1.1"
tracing = "0.1"
tracing-subscriber = "0.3.18"
[profile.release]
lto = "fat"

View File

@ -155,19 +155,17 @@ where
for token in &tokens {
// Warn the user if the id is different than expected
let received_id = tokenizer.token_to_id(&token.token.content);
if received_id != Some(token.id) {
if let Some(rid) = received_id {
if rid != token.id {
warn!(
"Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'",
token.token.content,
token.id,
if let Some(rid) = received_id {
rid.to_string()
} else {
"None".to_string()
}
);
}
}
}
let added_tokens: Vec<_> = tokens.into_iter().map(|token| token.token).collect();
tokenizer.add_tokens(&added_tokens[..]);
@ -179,6 +177,7 @@ where
mod tests {
use crate::tokenizer::Tokenizer;
use std::str::FromStr;
use tracing_subscriber::fmt;
#[test]
fn test_deserialization_serialization_invariant() {
@ -233,4 +232,15 @@ mod tests {
// It should be exactly the same as above
assert_eq!(tok_str, tok_json);
}
/// Smoke test for loading a tokenizer from the hub with logging enabled.
///
/// This test makes no assertions. It installs a `tracing` subscriber at
/// DEBUG level, loads a pretrained tokenizer, and then emits a marker
/// warning so the log output can be inspected manually: no warning is
/// expected to appear before the marker.
///
/// Only compiled when the `http` feature is enabled.
#[cfg(feature = "http")]
#[test]
fn test_from_pretrained() {
    // `try_init` instead of `init`: `init` panics when a global subscriber
    // has already been installed (e.g. by another test running in the same
    // process). A pre-existing subscriber is fine for this test.
    let _ = fmt()
        .with_max_level(tracing::Level::DEBUG)
        .with_target(false)
        .try_init();
    // The result is deliberately discarded: only the log output produced
    // while loading is of interest here.
    let _ = Tokenizer::from_pretrained("Qwen/Qwen2-7B-Instruct", None);
    warn!("This should be the first warning");
}
}