mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
fix and update tests
@@ -255,36 +255,36 @@ impl AddedVocabulary {
         // Then we delegate to `add_tokens`, that will take care of refreshing added tokens too.
         let mut ignored = 0;
         for token in tokens {
-            if token.content.is_empty() {
+            if token.content.is_empty() || self.added_tokens_map_r.values().any(|val| val == token)
+            {
                 ignored += 1;
                 continue;
             }
-
             // If a token is already part of the vocabulary, we mark it as added
-            let id = if let Some(id) = self.token_to_id(&token.content, model) {
-                id
-            } else {
-                let new_id = (model.get_vocab_size() + cmp::max(self.added_tokens_map_r.keys(),0)) as u32;
+            let new_id = if let Some(new_id) = self.token_to_id(&token.content, model) {
                 new_id
-            };
-
-            if self.added_tokens_map_r.values().any(|val| val == token) {
-                // We only ignore if the AddedToken is already part of the added_tokens_map_r
-                ignored += 1;
             } else {
+                self.added_tokens_map
+                    .values()
+                    .cloned()
+                    .max()
+                    .map_or(model.get_vocab_size() as u32, |max| max.clone() + 1)
+            };
             // Make sure we modify the previous entry
             self.added_tokens_map
                 .entry(token.content.clone())
-                .and_modify(|old_id| *old_id = id)
-                .or_insert_with(|| id);
-            if !self.special_tokens_set.contains(&token.content) {
-                self.added_tokens.push(token.clone());
-            }
+                .and_modify(|old_id| *old_id = new_id)
+                .or_insert_with(|| new_id);
             // Update the current revert operation
             self.added_tokens_map_r
-                .entry(id)
+                .entry(new_id)
                 .and_modify(|t| *t = token.clone())
                 .or_insert_with(|| token.clone());
+            // Make sure to remove previous entry (if the token gets a new id)
+
+            // Finally add the token to the classic set if special
+            if !self.special_tokens_set.contains(&token.content) {
+                self.added_tokens.push(token.clone());
+            }
             }
         }
 
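The core of this hunk is the rule for picking an id for a token that is not already in the model vocabulary: instead of deriving it from the model size plus a counter, the new code takes the maximum id already present in `added_tokens_map` and adds 1, falling back to `model.get_vocab_size()` when nothing has been added yet. A minimal, self-contained sketch of that rule (the `next_added_id` helper and the literal sizes are illustrative stand-ins, not part of the library):

    use std::collections::HashMap;

    // Illustrative helper, not library code: mirrors the `map_or` logic
    // introduced in the hunk above.
    fn next_added_id(added_tokens_map: &HashMap<String, u32>, model_vocab_size: usize) -> u32 {
        added_tokens_map
            .values()
            .cloned()
            .max()
            .map_or(model_vocab_size as u32, |max| max + 1)
    }

    fn main() {
        let mut added: HashMap<String, u32> = HashMap::new();
        let model_vocab_size = 10;

        // No added tokens yet: the first added token gets id == model vocab size.
        assert_eq!(next_added_id(&added, model_vocab_size), 10);

        added.insert("added_token_1".to_string(), 10);
        added.insert("added_token_2".to_string(), 11);

        // Later tokens get max(existing added ids) + 1, independent of how many
        // entries the map holds.
        assert_eq!(next_added_id(&added, model_vocab_size), 12);
    }
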
@@ -654,13 +654,24 @@ mod tests {
         assert_eq!(
             *vocab.get_vocab_r(),
             HashMap::from([
+                (0, AddedToken::from("test", true)),
                 (2, AddedToken::from("added_token_1", true)),
                 (3, AddedToken::from("added_token_2", true)),
-                (0, AddedToken::from("test", true)),
             ])
         );
         assert!(vocab.added_tokens_map.contains_key("test"));
         assert!(vocab.added_tokens_map_r.contains_key(&0));
+
+        vocab.add_tokens(
+            &[
+                AddedToken::from("tost", true),
+                AddedToken::from("another_two", true),
+            ],
+            &model,
+            normalizer,
+        );
+        assert_eq!(vocab.len(), 5); // New token was added
+        assert_eq!(vocab.get_vocab()["another_two"], 4); // New token was added, but the index is not the length of the vocab
     }

     #[test]
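The last two assertions follow from the `max + 1` rule introduced in the first hunk. Assuming `tost` resolves to an id already present in the model (its setup is outside this hunk), `another_two` is the only token needing a fresh id, and it receives max(0, 2, 3) + 1 = 4 even though the added vocabulary then holds 5 entries. A tiny standalone sketch of that arithmetic (not library code):

    use std::collections::HashMap;

    fn main() {
        // Ids taken from the assertions above: "test" -> 0, "added_token_1" -> 2,
        // "added_token_2" -> 3.
        let added_ids: HashMap<&str, u32> =
            HashMap::from([("test", 0), ("added_token_1", 2), ("added_token_2", 3)]);

        // Under the `max + 1` rule, the next brand-new token ("another_two" in the
        // test) gets 3 + 1 = 4, not an id equal to the number of vocabulary entries.
        let next_id = added_ids.values().cloned().max().map(|max| max + 1);
        assert_eq!(next_id, Some(4));
    }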