BPE handles offsets

This commit is contained in:
Anthony MOI
2019-12-28 15:21:50 -05:00
parent 5d9848ad6c
commit 5682627223

View File

@ -1,5 +1,5 @@
use super::{Cache, Error, Pair, Word}; use super::{Cache, Error, Pair, Word};
use crate::tokenizer::{Model, Result, Token}; use crate::tokenizer::{Model, Offsets, Result, Token};
use serde_json::Value; use serde_json::Value;
use std::{ use std::{
collections::HashMap, collections::HashMap,
@ -103,15 +103,20 @@ impl Model for BPE {
self.vocab.len() self.vocab.len()
} }
fn tokenize(&self, sentence: Vec<String>) -> Result<Vec<Token>> { fn tokenize(&self, sentence: Vec<(String, Offsets)>) -> Result<Vec<Token>> {
if sentence.is_empty() { if sentence.is_empty() {
return Ok(vec![]); return Ok(vec![]);
} }
let mut encoded: Vec<Token> = Vec::with_capacity(sentence.len()); let mut encoded: Vec<Token> = Vec::with_capacity(sentence.len());
let mut cached_words = self.cache.get_values(&sentence); let mut cached_words = self.cache.get_values(
&sentence
.iter()
.map(|(s, _)| s.to_owned())
.collect::<Vec<_>>(),
);
for (i, w) in sentence.iter().enumerate() { for (i, (w, initial_offsets)) in sentence.iter().enumerate() {
if cached_words[i].is_none() { if cached_words[i].is_none() {
let mut word = Word::new(); let mut word = Word::new();
for c in w.chars() { for c in w.chars() {
@ -155,9 +160,6 @@ impl Model for BPE {
cached_words[i] = Some(word); cached_words[i] = Some(word);
} }
// Offsets are word-based, we need to translate them to be sentence-based
let last_offset = encoded.last().map(|token| token.offsets.1).unwrap_or(0);
let word = cached_words[i].as_ref().unwrap(); let word = cached_words[i].as_ref().unwrap();
let tokens = word let tokens = word
.get_chars() .get_chars()
@ -167,7 +169,7 @@ impl Model for BPE {
Token::new( Token::new(
*id, *id,
self.vocab_r[id].clone(), self.vocab_r[id].clone(),
(last_offset + offsets.0, last_offset + offsets.1), (initial_offsets.0 + offsets.0, initial_offsets.0 + offsets.1),
) )
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
@ -180,7 +182,7 @@ impl Model for BPE {
.into_iter() .into_iter()
.zip(cached_words) .zip(cached_words)
.filter(|(_, v)| v.is_some()) .filter(|(_, v)| v.is_some())
.map(|(k, v)| (k, v.unwrap())) .map(|(k, v)| (k.0, v.unwrap()))
.unzip::<_, _, Vec<String>, Vec<Word>>(); .unzip::<_, _, Vec<String>, Vec<Word>>();
self.cache.set_values(keys, values); self.cache.set_values(keys, values);