mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
WordPiece handles continuing-subword prefix customization
This commit is contained in:
@ -245,6 +245,10 @@ impl BPE {
|
||||
&self.unk_token
|
||||
}
|
||||
|
||||
/// Returns a shared reference to this model's `continuing_subword_prefix` field.
///
/// The field is an `Option<String>`, so the result is `None` when it holds no prefix.
pub fn get_continuing_subword_prefix(&self) -> &Option<String> {
|
||||
&self.continuing_subword_prefix
|
||||
}
|
||||
|
||||
fn merge_word(&self, w: &str) -> Word {
|
||||
let mut word = Word::new();
|
||||
for (is_first, is_last, c) in w.chars().with_first_and_last() {
|
||||
|
@ -28,6 +28,7 @@ impl fmt::Display for Error {
|
||||
|
||||
pub struct WordPiece {
|
||||
unk_token: String,
|
||||
continuing_subword_prefix: String,
|
||||
max_input_chars_per_word: usize,
|
||||
vocab: HashMap<String, u32>,
|
||||
vocab_r: HashMap<u32, String>,
|
||||
@ -38,6 +39,7 @@ impl Default for WordPiece {
|
||||
WordPiece {
|
||||
vocab: HashMap::new(),
|
||||
vocab_r: HashMap::new(),
|
||||
continuing_subword_prefix: String::from("##"),
|
||||
unk_token: String::from("[UNK]"),
|
||||
max_input_chars_per_word: 100,
|
||||
}
|
||||
@ -64,6 +66,7 @@ impl WordPiece {
|
||||
vocab_r: vocab.into_iter().map(|(token, id)| (id, token)).collect(),
|
||||
unk_token,
|
||||
max_input_chars_per_word: max_input_chars_per_word.unwrap_or(100),
|
||||
..Default::default()
|
||||
})
|
||||
}
|
||||
|
||||
@ -86,6 +89,9 @@ impl WordPiece {
|
||||
wp.unk_token = unk_token.to_owned();
|
||||
}
|
||||
}
|
||||
if let Some(prefix) = bpe.get_continuing_subword_prefix() {
|
||||
wp.continuing_subword_prefix = prefix.to_owned();
|
||||
}
|
||||
|
||||
wp
|
||||
}
|
||||
@ -125,7 +131,7 @@ impl Model for WordPiece {
|
||||
while start < end {
|
||||
let mut substr = chars[start..end].iter().collect::<String>();
|
||||
if start > 0 {
|
||||
substr = format!("##{}", substr);
|
||||
substr = format!("{}{}", self.continuing_subword_prefix, substr);
|
||||
}
|
||||
if self.vocab.contains_key(&substr) {
|
||||
cur_str = Some(Token {
|
||||
|
Reference in New Issue
Block a user