Wordpiece handles prefix customization

This commit is contained in:
Anthony MOI
2020-01-03 15:30:55 -05:00
parent dc8266236d
commit 02a89bb07f
2 changed files with 11 additions and 1 deletions

View File

@ -245,6 +245,10 @@ impl BPE {
&self.unk_token
}
/// Returns a reference to this model's `continuing_subword_prefix` field.
///
/// The field is handed back as-is (`&Option<String>`), so it is `None` when
/// no prefix is stored. Nothing is cloned here; callers that need an owned
/// `String` must copy the inner value themselves.
pub fn get_continuing_subword_prefix(&self) -> &Option<String> {
&self.continuing_subword_prefix
}
fn merge_word(&self, w: &str) -> Word {
let mut word = Word::new();
for (is_first, is_last, c) in w.chars().with_first_and_last() {

View File

@ -28,6 +28,7 @@ impl fmt::Display for Error {
pub struct WordPiece {
unk_token: String,
continuing_subword_prefix: String,
max_input_chars_per_word: usize,
vocab: HashMap<String, u32>,
vocab_r: HashMap<u32, String>,
@ -38,6 +39,7 @@ impl Default for WordPiece {
WordPiece {
vocab: HashMap::new(),
vocab_r: HashMap::new(),
continuing_subword_prefix: String::from("##"),
unk_token: String::from("[UNK]"),
max_input_chars_per_word: 100,
}
@ -64,6 +66,7 @@ impl WordPiece {
vocab_r: vocab.into_iter().map(|(token, id)| (id, token)).collect(),
unk_token,
max_input_chars_per_word: max_input_chars_per_word.unwrap_or(100),
..Default::default()
})
}
@ -86,6 +89,9 @@ impl WordPiece {
wp.unk_token = unk_token.to_owned();
}
}
if let Some(prefix) = bpe.get_continuing_subword_prefix() {
wp.continuing_subword_prefix = prefix.to_owned();
}
wp
}
@ -125,7 +131,7 @@ impl Model for WordPiece {
while start < end {
let mut substr = chars[start..end].iter().collect::<String>();
if start > 0 {
substr = format!("##{}", substr);
substr = format!("{}{}", self.continuing_subword_prefix, substr);
}
if self.vocab.contains_key(&substr) {
cur_str = Some(Token {