mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Disable caching for long strings. (#1676)
This commit is contained in:
@ -1,6 +1,6 @@
|
|||||||
use super::{super::OrderedVocabIter, trainer::BpeTrainer, Error, Pair, Word};
|
use super::{super::OrderedVocabIter, trainer::BpeTrainer, Error, Pair, Word};
|
||||||
use crate::tokenizer::{Model, Result, Token};
|
use crate::tokenizer::{Model, Result, Token};
|
||||||
use crate::utils::cache::{Cache, DEFAULT_CACHE_CAPACITY};
|
use crate::utils::cache::{Cache, DEFAULT_CACHE_CAPACITY, MAX_LENGTH};
|
||||||
use crate::utils::iter::ResultShunt;
|
use crate::utils::iter::ResultShunt;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
@ -482,7 +482,9 @@ impl BPE {
|
|||||||
let word = self.merge_word(sequence)?;
|
let word = self.merge_word(sequence)?;
|
||||||
let ret = self.word_to_tokens(&word).collect();
|
let ret = self.word_to_tokens(&word).collect();
|
||||||
if let Some(ref cache) = self.cache {
|
if let Some(ref cache) = self.cache {
|
||||||
cache.set(sequence.to_owned(), word);
|
if sequence.len() < MAX_LENGTH {
|
||||||
|
cache.set(sequence.to_owned(), word);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Ok(ret)
|
Ok(ret)
|
||||||
}
|
}
|
||||||
|
@ -4,7 +4,7 @@ use super::{
|
|||||||
trie::{Trie, TrieBuilder},
|
trie::{Trie, TrieBuilder},
|
||||||
};
|
};
|
||||||
use crate::tokenizer::{Model, Result, Token};
|
use crate::tokenizer::{Model, Result, Token};
|
||||||
use crate::utils::cache::Cache;
|
use crate::utils::cache::{Cache, MAX_LENGTH};
|
||||||
|
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
@ -230,7 +230,9 @@ impl Unigram {
|
|||||||
} else {
|
} else {
|
||||||
self.encode_unoptimized(sentence)?
|
self.encode_unoptimized(sentence)?
|
||||||
};
|
};
|
||||||
self.cache.set(sentence.to_owned(), result.clone());
|
if sentence.len() < MAX_LENGTH {
|
||||||
|
self.cache.set(sentence.to_owned(), result.clone());
|
||||||
|
}
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -5,6 +5,9 @@ use std::sync::RwLock;
|
|||||||
|
|
||||||
/// The default capacity for a `BPE`'s internal cache.
|
/// The default capacity for a `BPE`'s internal cache.
|
||||||
pub static DEFAULT_CACHE_CAPACITY: usize = 10_000;
|
pub static DEFAULT_CACHE_CAPACITY: usize = 10_000;
|
||||||
|
/// Exclusive upper bound on the length (`len()`) of an input whose result a model will cache.
|
||||||
|
/// Longer inputs are not cached: strings that long have minimal chances of a cache hit anyway.
|
||||||
|
pub static MAX_LENGTH: usize = 256;
|
||||||
|
|
||||||
/// Provides a simple multithread cache to speed up BPE tokenization that will try to read values
|
/// Provides a simple multithread cache to speed up BPE tokenization that will try to read values
|
||||||
/// concurrently but won't block if another thread is writing.
|
/// concurrently but won't block if another thread is writing.
|
||||||
|
Reference in New Issue
Block a user