mirror of https://github.com/mii443/tokenizers.git
Whitespace PreTokenizer handles offsets
@@ -1,9 +1,9 @@
-use crate::tokenizer::{PreTokenizer, Result};
+use crate::tokenizer::{Offsets, PreTokenizer, Result};
 use regex::Regex;
 
 pub struct Whitespace;
 impl PreTokenizer for Whitespace {
-    fn pre_tokenize(&self, s: &str) -> Result<Vec<String>> {
+    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
         lazy_static! {
             static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
         }
@@ -13,11 +13,15 @@ impl PreTokenizer for Whitespace {
                 captures
                     .iter()
                     .map(|m| {
-                        m.map(|capture| s[capture.start()..capture.end()].to_owned())
-                            .unwrap_or_else(|| String::from(""))
+                        m.map(|capture| {
+                            let (start, end) = (capture.start(), capture.end());
+                            (s[start..end].to_owned(), (start, end))
+                        })
+                        .unwrap_or_else(|| (String::from(""), (0, 0)))
                     })
-                    .collect()
+                    .collect::<Vec<(String, Offsets)>>()
             })
             .flatten()
             .collect())
     }
 }
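For context, here is a minimal standalone sketch of the same tokenization logic outside the crate. It assumes Offsets is a (usize, usize) byte-range alias (as the (start, end) tuples above suggest), uses regex::Regex::find_iter instead of captures_iter (equivalent here because the pattern has no capture groups), and the function name whitespace_pre_tokenize is purely illustrative; it requires the regex crate as a dependency.

use regex::Regex;

// Assumed stand-in for crate::tokenizer::Offsets: a (start, end) byte range.
type Offsets = (usize, usize);

// Same pattern as in the diff above: runs of word characters, or runs of
// non-word, non-whitespace characters (punctuation).
fn whitespace_pre_tokenize(s: &str) -> Vec<(String, Offsets)> {
    let re = Regex::new(r"\w+|[^\w\s]+").unwrap();
    re.find_iter(s)
        .map(|m| (m.as_str().to_owned(), (m.start(), m.end())))
        .collect()
}

fn main() {
    let s = "Hey man!";
    for (token, (start, end)) in whitespace_pre_tokenize(s) {
        // Each offset pair indexes back into the original string.
        assert_eq!(&s[start..end], token);
        println!("{:?} -> ({}, {})", token, start, end);
    }
}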
@@ -30,10 +34,23 @@ mod tests {
     #[test]
     fn basic() {
         let tests = vec![
-            ("Hey man!", vec!["Hey", "man", "!"]),
+            (
+                "Hey man!",
+                vec![
+                    ("Hey".into(), (0, 3)),
+                    ("man".into(), (4, 7)),
+                    ("!".into(), (7, 8)),
+                ],
+            ),
             (
                 "How are you doing?",
-                vec!["How", "are", "you", "doing", "?"],
+                vec![
+                    ("How".into(), (0, 3)),
+                    ("are".into(), (4, 7)),
+                    ("you".into(), (8, 11)),
+                    ("doing".into(), (12, 17)),
+                    ("?".into(), (17, 18)),
+                ],
             ),
         ];
         let pretok = Whitespace;
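As a quick sanity check on the expected offsets in the updated test data, assuming they are byte indices into the input string, slicing the input with each pair should recover the token:

fn main() {
    let s = "How are you doing?";
    // (token, (start, end)) pairs taken from the test expectations above.
    let expected = [
        ("How", (0, 3)),
        ("are", (4, 7)),
        ("you", (8, 11)),
        ("doing", (12, 17)),
        ("?", (17, 18)),
    ];
    for (token, (start, end)) in expected {
        // Slicing the input with each offset pair recovers the token.
        assert_eq!(&s[start..end], token);
    }
}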