Models handle offsets

This commit is contained in:
Anthony MOI
2019-12-28 15:21:29 -05:00
parent 839239d3b4
commit 5d9848ad6c

View File

@@ -32,6 +32,7 @@ pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + S
pub trait Normalizer { pub trait Normalizer {
fn normalize(&self, s: String) -> Result<String>; fn normalize(&self, s: String) -> Result<String>;
} }
pub type Offsets = (usize, usize);
/// A PreTokenizer takes care of pre-tokenizing strings before this goes to the model /// A PreTokenizer takes care of pre-tokenizing strings before this goes to the model
pub trait PreTokenizer { pub trait PreTokenizer {
@@ -41,7 +42,7 @@ pub trait PreTokenizer {
/// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram) /// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
pub trait Model { pub trait Model {
fn tokenize(&self, tokens: Vec<String>) -> Result<Vec<Token>>; fn tokenize(&self, tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>>;
fn token_to_id(&self, token: &str) -> Option<u32>; fn token_to_id(&self, token: &str) -> Option<u32>;
fn id_to_token(&self, id: u32) -> Option<String>; fn id_to_token(&self, id: u32) -> Option<String>;
fn get_vocab_size(&self) -> usize; fn get_vocab_size(&self) -> usize;