Whitespace PreTokenizer handles offsets

This commit is contained in:
Anthony MOI
2019-12-28 15:50:42 -05:00
parent be00a1e45e
commit 35a8dfdd55

View File

@ -1,9 +1,9 @@
use crate::tokenizer::{PreTokenizer, Result};
use crate::tokenizer::{Offsets, PreTokenizer, Result};
use regex::Regex;
pub struct Whitespace;
impl PreTokenizer for Whitespace {
fn pre_tokenize(&self, s: &str) -> Result<Vec<String>> {
fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
lazy_static! {
static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
}
@ -13,11 +13,15 @@ impl PreTokenizer for Whitespace {
captures
.iter()
.map(|m| {
m.map(|capture| s[capture.start()..capture.end()].to_owned())
.unwrap_or_else(|| String::from(""))
m.map(|capture| {
let (start, end) = (capture.start(), capture.end());
(s[start..end].to_owned(), (start, end))
})
.collect()
.unwrap_or_else(|| (String::from(""), (0, 0)))
})
.collect::<Vec<(String, Offsets)>>()
})
.flatten()
.collect())
}
}
@ -30,10 +34,23 @@ mod tests {
#[test]
fn basic() {
let tests = vec![
("Hey man!", vec!["Hey", "man", "!"]),
(
"Hey man!",
vec![
("Hey".into(), (0, 3)),
("man".into(), (4, 7)),
("!".into(), (7, 8)),
],
),
(
"How are you doing?",
vec!["How", "are", "you", "doing", "?"],
vec![
("How".into(), (0, 3)),
("are".into(), (4, 7)),
("you".into(), (8, 11)),
("doing".into(), (12, 17)),
("?".into(), (17, 18)),
],
),
];
let pretok = Whitespace;