mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
make cache optional (#37)
This commit is contained in:
@ -43,7 +43,7 @@ impl BpeBuilder {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Set the cache's capacity.
|
/// Set the cache's capacity. If the capacity is set to 0, no cache will be used.
|
||||||
pub fn cache_capacity(mut self, capacity: usize) -> Self {
|
pub fn cache_capacity(mut self, capacity: usize) -> Self {
|
||||||
self.config.cache_capacity = Some(capacity);
|
self.config.cache_capacity = Some(capacity);
|
||||||
self
|
self
|
||||||
@ -80,8 +80,9 @@ impl BpeBuilder {
|
|||||||
};
|
};
|
||||||
let merges = self.config.merges.unwrap_or_else(HashMap::new);
|
let merges = self.config.merges.unwrap_or_else(HashMap::new);
|
||||||
let cache = match self.config.cache_capacity {
|
let cache = match self.config.cache_capacity {
|
||||||
Some(capacity) => Cache::new(capacity),
|
Some(0) => None,
|
||||||
None => Cache::default(),
|
Some(capacity) => Some(Cache::new(capacity)),
|
||||||
|
None => Some(Cache::default()),
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(BPE {
|
Ok(BPE {
|
||||||
@ -104,7 +105,7 @@ pub struct BPE {
|
|||||||
/// Contains the mapping between Pairs and their (rank, new_id).
|
/// Contains the mapping between Pairs and their (rank, new_id).
|
||||||
merges: HashMap<Pair, (u32, u32)>,
|
merges: HashMap<Pair, (u32, u32)>,
|
||||||
/// Contains the cache for optimizing the encoding step.
|
/// Contains the cache for optimizing the encoding step.
|
||||||
cache: Cache<String, Word>,
|
cache: Option<Cache<String, Word>>,
|
||||||
/// Dropout probability for merges. 0 = no dropout is the default. At 1.0, tokenization will
|
/// Dropout probability for merges. 0 = no dropout is the default. At 1.0, tokenization will
|
||||||
/// perform no merges, so the result will just be characters.
|
/// perform no merges, so the result will just be characters.
|
||||||
dropout: Option<f32>,
|
dropout: Option<f32>,
|
||||||
@ -122,11 +123,15 @@ impl Clone for BPE {
|
|||||||
// `Clone` can't be derived because it's not implemented for `Cache`.
|
// `Clone` can't be derived because it's not implemented for `Cache`.
|
||||||
// To keep things simple when we clone, the new BPE will start with a fresh cache.
|
// To keep things simple when we clone, the new BPE will start with a fresh cache.
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
|
let fresh_cache = match self.cache {
|
||||||
|
Some(ref cache) => Some(cache.fresh()),
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
Self {
|
Self {
|
||||||
vocab: self.vocab.clone(),
|
vocab: self.vocab.clone(),
|
||||||
vocab_r: self.vocab_r.clone(),
|
vocab_r: self.vocab_r.clone(),
|
||||||
merges: self.merges.clone(),
|
merges: self.merges.clone(),
|
||||||
cache: self.cache.fresh(),
|
cache: fresh_cache,
|
||||||
dropout: self.dropout,
|
dropout: self.dropout,
|
||||||
unk_token: self.unk_token,
|
unk_token: self.unk_token,
|
||||||
}
|
}
|
||||||
@ -205,7 +210,9 @@ impl BPE {
|
|||||||
|
|
||||||
/// Reset the cache.
|
/// Reset the cache.
|
||||||
pub fn clear_cache(&self) {
|
pub fn clear_cache(&self) {
|
||||||
self.cache.clear()
|
if let Some(ref cache) = self.cache {
|
||||||
|
cache.clear()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_word(&self, w: &str) -> Word {
|
fn merge_word(&self, w: &str) -> Word {
|
||||||
@ -289,10 +296,11 @@ impl Model for BPE {
|
|||||||
|
|
||||||
let mut encoded: Vec<Token> = Vec::with_capacity(sentence.len());
|
let mut encoded: Vec<Token> = Vec::with_capacity(sentence.len());
|
||||||
let mut cached_words = match self.dropout {
|
let mut cached_words = match self.dropout {
|
||||||
None => self
|
None => match self.cache {
|
||||||
.cache
|
Some(ref cache) => cache.get_values(sentence.iter().map(|(s, _)| s.clone())),
|
||||||
.get_values(sentence.iter().map(|(s, _)| s.clone())),
|
None => None,
|
||||||
Some(_) => None, // If using dropout we don't want to use a cached.
|
},
|
||||||
|
Some(_) => None, // If using dropout we don't want to use the cache.
|
||||||
};
|
};
|
||||||
|
|
||||||
for (i, (w, initial_offsets)) in sentence.iter().enumerate() {
|
for (i, (w, initial_offsets)) in sentence.iter().enumerate() {
|
||||||
@ -323,7 +331,10 @@ impl Model for BPE {
|
|||||||
// Also update cache
|
// Also update cache
|
||||||
if let Some(cache) = cached_words {
|
if let Some(cache) = cached_words {
|
||||||
let keys_iter = sentence.into_iter().map(|(s, _)| s);
|
let keys_iter = sentence.into_iter().map(|(s, _)| s);
|
||||||
self.cache.set_values(keys_iter, cache.into_iter());
|
self.cache
|
||||||
|
.as_ref()
|
||||||
|
.unwrap()
|
||||||
|
.set_values(keys_iter, cache.into_iter());
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(encoded)
|
Ok(encoded)
|
||||||
|
Reference in New Issue
Block a user