mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Expose vocabulary size
This commit is contained in:
@ -30,6 +30,11 @@ impl Tokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the vocabulary size by delegating to the wrapped `self.tokenizer`.
///
/// NOTE(review): the `#[getter]` attribute presumably exposes this method as a
/// read-only property on the binding side — confirm against the binding framework.
#[getter]
|
||||
fn get_vocab_size(&self) -> usize {
|
||||
self.tokenizer.get_vocab_size()
|
||||
}
|
||||
|
||||
fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
|
||||
if let Some(model) = model.model.to_pointer() {
|
||||
self.tokenizer.with_model(model);
|
||||
|
@ -96,6 +96,10 @@ impl BPE {
|
||||
}
|
||||
|
||||
impl Model for BPE {
|
||||
/// Returns the vocabulary size, i.e. the number of entries held in `self.vocab`.
fn get_vocab_size(&self) -> usize {
|
||||
self.vocab.len()
|
||||
}
|
||||
|
||||
fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
|
||||
if sentence.len() == 0 {
|
||||
return vec![];
|
||||
|
@ -48,6 +48,10 @@ impl WordPiece {
|
||||
}
|
||||
|
||||
impl Model for WordPiece {
|
||||
/// Returns the vocabulary size, i.e. the number of entries held in `self.vocab`.
fn get_vocab_size(&self) -> usize {
|
||||
self.vocab.len()
|
||||
}
|
||||
|
||||
fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
|
||||
let mut output_tokens = vec![];
|
||||
|
||||
|
@ -37,6 +37,7 @@ pub trait Model {
|
||||
fn decode(&self, ids: Vec<u32>) -> Vec<String>;
|
||||
fn token_to_id(&self, token: &str) -> Option<u32>;
|
||||
fn id_to_token(&self, id: u32) -> Option<String>;
|
||||
fn get_vocab_size(&self) -> usize;
|
||||
}
|
||||
|
||||
/// A PostProcessor has the responsibility to post-process an encoded output of the Tokenizer.
|
||||
@ -166,6 +167,11 @@ impl Tokenizer {
|
||||
self
|
||||
}
|
||||
|
||||
/// Get the size of the vocabulary, as reported by the underlying `self.model`.
|
||||
pub fn get_vocab_size(&self) -> usize {
|
||||
self.model.get_vocab_size()
|
||||
}
|
||||
|
||||
/// Converts a token into the corresponding id.
|
||||
pub fn token_to_id(&self, token: &str) -> Option<u32> {
|
||||
self.model.token_to_id(token)
|
||||
|
Reference in New Issue
Block a user