mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Expose vocabulary size
This commit is contained in:
@ -30,6 +30,11 @@ impl Tokenizer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the vocabulary size reported by the wrapped `self.tokenizer`.
///
/// Marked with `#[getter]`, so it is exposed as a getter rather than a regular method.
#[getter]
|
||||||
|
fn get_vocab_size(&self) -> usize {
|
||||||
|
self.tokenizer.get_vocab_size()
|
||||||
|
}
|
||||||
|
|
||||||
fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
|
fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
|
||||||
if let Some(model) = model.model.to_pointer() {
|
if let Some(model) = model.model.to_pointer() {
|
||||||
self.tokenizer.with_model(model);
|
self.tokenizer.with_model(model);
|
||||||
|
@ -96,6 +96,10 @@ impl BPE {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Model for BPE {
|
impl Model for BPE {
|
||||||
|
/// Returns the vocabulary size of this BPE model, i.e. `self.vocab.len()`.
fn get_vocab_size(&self) -> usize {
|
||||||
|
self.vocab.len()
|
||||||
|
}
|
||||||
|
|
||||||
fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
|
fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
|
||||||
if sentence.len() == 0 {
|
if sentence.len() == 0 {
|
||||||
return vec![];
|
return vec![];
|
||||||
|
@ -48,6 +48,10 @@ impl WordPiece {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Model for WordPiece {
|
impl Model for WordPiece {
|
||||||
|
/// Returns the vocabulary size of this WordPiece model, i.e. `self.vocab.len()`.
fn get_vocab_size(&self) -> usize {
|
||||||
|
self.vocab.len()
|
||||||
|
}
|
||||||
|
|
||||||
fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
|
fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
|
||||||
let mut output_tokens = vec![];
|
let mut output_tokens = vec![];
|
||||||
|
|
||||||
|
@ -37,6 +37,7 @@ pub trait Model {
|
|||||||
fn decode(&self, ids: Vec<u32>) -> Vec<String>;
|
fn decode(&self, ids: Vec<u32>) -> Vec<String>;
|
||||||
fn token_to_id(&self, token: &str) -> Option<u32>;
|
fn token_to_id(&self, token: &str) -> Option<u32>;
|
||||||
fn id_to_token(&self, id: u32) -> Option<String>;
|
fn id_to_token(&self, id: u32) -> Option<String>;
|
||||||
|
/// Returns the size of the model's vocabulary.
fn get_vocab_size(&self) -> usize;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
|
/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
|
||||||
@ -166,6 +167,11 @@ impl Tokenizer {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Get the size of the vocabulary, as reported by the underlying `self.model`.
|
||||||
|
pub fn get_vocab_size(&self) -> usize {
|
||||||
|
self.model.get_vocab_size()
|
||||||
|
}
|
||||||
|
|
||||||
/// Converts a token to the corresponding id.
|
/// Converts a token to the corresponding id.
|
||||||
pub fn token_to_id(&self, token: &str) -> Option<u32> {
|
pub fn token_to_id(&self, token: &str) -> Option<u32> {
|
||||||
self.model.token_to_id(token)
|
self.model.token_to_id(token)
|
||||||
|
Reference in New Issue
Block a user