mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Models handles offsets
This commit is contained in:
@ -32,6 +32,7 @@ pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + S
|
|||||||
pub trait Normalizer {
|
pub trait Normalizer {
|
||||||
fn normalize(&self, s: String) -> Result<String>;
|
fn normalize(&self, s: String) -> Result<String>;
|
||||||
}
|
}
|
||||||
|
pub type Offsets = (usize, usize);
|
||||||
|
|
||||||
/// A PreTokenizer takes care of pre-tokenizing strings before this goes to the model
|
/// A PreTokenizer takes care of pre-tokenizing strings before this goes to the model
|
||||||
pub trait PreTokenizer {
|
pub trait PreTokenizer {
|
||||||
@ -41,7 +42,7 @@ pub trait PreTokenizer {
|
|||||||
|
|
||||||
/// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
|
/// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
|
||||||
pub trait Model {
|
pub trait Model {
|
||||||
fn tokenize(&self, tokens: Vec<String>) -> Result<Vec<Token>>;
|
fn tokenize(&self, tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>>;
|
||||||
fn token_to_id(&self, token: &str) -> Option<u32>;
|
fn token_to_id(&self, token: &str) -> Option<u32>;
|
||||||
fn id_to_token(&self, id: u32) -> Option<String>;
|
fn id_to_token(&self, id: u32) -> Option<String>;
|
||||||
fn get_vocab_size(&self) -> usize;
|
fn get_vocab_size(&self) -> usize;
|
||||||
|
Reference in New Issue
Block a user