From 3bdb849bb37e07ecc0851253e0d03111e94134ea Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Wed, 11 Dec 2019 07:31:28 -0500
Subject: [PATCH] Fix cli + whitespace

---
 tokenizers/src/cli.rs                       | 10 +++++-----
 tokenizers/src/pre_tokenizers/whitespace.rs | 11 ++++++-----
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/tokenizers/src/cli.rs b/tokenizers/src/cli.rs
index 5ef60825..33905e91 100644
--- a/tokenizers/src/cli.rs
+++ b/tokenizers/src/cli.rs
@@ -4,11 +4,11 @@
 
 use clap::{App, AppSettings, Arg, ArgMatches, SubCommand};
 use std::io::{self, BufRead, Write};
-use tokenizers::models::bpe::{Error, BPE};
+use tokenizers::models::bpe::BPE;
 use tokenizers::pre_tokenizers::byte_level::ByteLevel;
-use tokenizers::tokenizer::{EncodeInput, Tokenizer};
+use tokenizers::tokenizer::{EncodeInput, Result, Tokenizer};
 
-fn shell(matches: &ArgMatches) -> Result<(), Error> {
+fn shell(matches: &ArgMatches) -> Result<()> {
     let vocab = matches
         .value_of("vocab")
         .expect("Must give a vocab.json file");
@@ -33,7 +33,7 @@ fn shell(matches: &ArgMatches) -> Result<(), Error> {
         let buffer = buffer.trim_end();
 
         let timer = std::time::Instant::now();
-        let encoded = tokenizer.encode(EncodeInput::Single(buffer.to_owned()));
+        let encoded = tokenizer.encode(EncodeInput::Single(buffer.to_owned()))?;
         let elapsed = timer.elapsed();
         println!("\nInput:\t\t{}", buffer);
         println!("Tokens:\t\t{:?}", encoded.get_tokens());
@@ -43,7 +43,7 @@ fn shell(matches: &ArgMatches) -> Result<(), Error> {
     }
 }
 
-fn main() -> Result<(), Error> {
+fn main() -> Result<()> {
     let matches = App::new("tokenizers")
         .version("0.0.1")
         .author("Anthony M. ")
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index 1e948c40..a3ed1232 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -1,13 +1,14 @@
-use crate::tokenizer::PreTokenizer;
+use crate::tokenizer::{PreTokenizer, Result};
 use regex::Regex;
 
 pub struct Whitespace;
 impl PreTokenizer for Whitespace {
-    fn pre_tokenize(&self, s: &str) -> Vec<String> {
+    fn pre_tokenize(&self, s: &str) -> Result<Vec<String>> {
         lazy_static! {
             static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
         }
-        RE.captures_iter(s)
+        Ok(RE
+            .captures_iter(s)
             .map(|captures| {
                 captures
                     .iter()
@@ -17,7 +18,7 @@ impl PreTokenizer for Whitespace {
                     })
                     .collect()
             })
-            .collect()
+            .collect())
     }
 }
 
@@ -37,7 +38,7 @@ mod tests {
         ];
         let pretok = Whitespace;
         for (s, res) in tests {
-            assert_eq!(pretok.pre_tokenize(s), res);
+            assert_eq!(pretok.pre_tokenize(s).unwrap(), res);
         }
     }
 }
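
Note on the pattern (not part of the patch): both files move from the
BPE-specific `Error` type to the crate-wide `Result` alias exported by
`tokenizers::tokenizer`, so failures from `encode` and `pre_tokenize`
are propagated with `?` instead of unwrapped or silently ignored. The
sketch below shows what an implementor of the new fallible
`PreTokenizer` signature could look like. `CommaSplit` is a
hypothetical example type, and the concrete error type behind the
`Result` alias is assumed here, not taken from this diff.

use tokenizers::tokenizer::{PreTokenizer, Result};

// Hypothetical pre-tokenizer, used only to illustrate the new signature.
pub struct CommaSplit;

impl PreTokenizer for CommaSplit {
    // The trait method now returns `Result<Vec<String>>` rather than
    // `Vec<String>`, so implementations can report errors instead of
    // panicking.
    fn pre_tokenize(&self, s: &str) -> Result<Vec<String>> {
        Ok(s.split(',')
            .map(|piece| piece.trim().to_owned())
            .filter(|piece| !piece.is_empty())
            .collect())
    }
}

fn main() -> Result<()> {
    let pretok = CommaSplit;
    // `?` propagates any error upward, mirroring how cli.rs now handles
    // the fallible `tokenizer.encode(...)` call.
    let words = pretok.pre_tokenize("hello, world")?;
    println!("{:?}", words); // ["hello", "world"]
    Ok(())
}

Returning the shared alias rather than a model-specific error lets every
stage of the pipeline flow into a single `main() -> Result<()>`, which is
exactly what the cli.rs hunks rely on.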