From 872aa86b71a361941f820be94e9e299c21465a86 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Mon, 18 Nov 2019 15:47:35 -0500
Subject: [PATCH] Basic cli for testing

---
 tokenizers/Cargo.lock |  1 +
 tokenizers/Cargo.toml |  1 +
 tokenizers/src/cli.rs | 92 ++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 85 insertions(+), 9 deletions(-)

diff --git a/tokenizers/Cargo.lock b/tokenizers/Cargo.lock
index 2e025672..3edbb8da 100644
--- a/tokenizers/Cargo.lock
+++ b/tokenizers/Cargo.lock
@@ -469,6 +469,7 @@ dependencies = [
 name = "tokenizers-lib"
 version = "0.0.1"
 dependencies = [
+ "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "onig 5.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index ec667da9..b30f0bf6 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -12,6 +12,7 @@ lazy_static = "1.3.0"
 onig = "5.0.0"
 rayon = "1.2.0"
 serde_json = "1.0"
+clap = "2.33.0"
 
 [lib]
 name = "tokenizers"
diff --git a/tokenizers/src/cli.rs b/tokenizers/src/cli.rs
index d936e287..1dbd3ccd 100644
--- a/tokenizers/src/cli.rs
+++ b/tokenizers/src/cli.rs
@@ -1,11 +1,85 @@
-/// This is the CLI binary for the Tokenizers project
-use tokenizers::WhitespaceTokenizer;
+//!
+//! This is the CLI binary for the Tokenizers project
+//!
 
-fn main() {
-    let s = "Hey man!";
-    println!(
-        "Tokenizing {:?} gives {:?}",
-        s,
-        WhitespaceTokenizer::tokenize(&s)
-    );
+use clap::{App, AppSettings, Arg, ArgMatches, SubCommand};
+use std::io::{self, BufRead, Write};
+use tokenizers::models::bpe::{Error, BPE};
+use tokenizers::pre_tokenizers::byte_level::ByteLevel;
+use tokenizers::tokenizer::Tokenizer;
+
+fn shell(matches: &ArgMatches) -> Result<(), Error> {
+    let vocab = matches
+        .value_of("vocab")
+        .expect("Must give a vocab.json file");
+    let merges = matches
+        .value_of("merges")
+        .expect("Must give a merges.txt file");
+
+    let bpe = BPE::from_files(vocab, merges)?;
+    let mut tokenizer = Tokenizer::new(Box::new(bpe));
+    tokenizer.with_pre_tokenizer(Box::new(ByteLevel));
+
+    let stdin = io::stdin();
+    let mut handle = stdin.lock();
+    let mut buffer = String::new();
+
+    loop {
+        buffer.clear();
+
+        print!("\nEnter some text to tokenize:\n> ");
+        io::stdout().flush()?;
+        handle.read_line(&mut buffer)?;
+        let buffer = buffer.trim_end();
+
+        let timer = std::time::Instant::now();
+        let encoded = tokenizer.encode(buffer);
+        let elapsed = timer.elapsed();
+        println!("\nInput:\t\t{}", buffer);
+        println!(
+            "Tokens:\t\t{:?}",
+            encoded.iter().map(|t| &t.value).collect::<Vec<_>>()
+        );
+        println!(
+            "IDs:\t\t{:?}",
+            encoded.iter().map(|t| t.id).collect::<Vec<_>>()
+        );
+        println!(
+            "Offsets:\t{:?}",
+            encoded.iter().map(|t| t.offsets).collect::<Vec<_>>()
+        );
+        println!("Tokenized in {:?}", elapsed);
+    }
+}
+
+fn main() -> Result<(), Error> {
+    let matches = App::new("tokenizers")
+        .version("0.0.1")
+        .author("Anthony M.")
+        .about("Generate custom Tokenizers or use existing ones")
+        .setting(AppSettings::SubcommandRequiredElseHelp)
+        .subcommand(
+            SubCommand::with_name("shell")
+                .about("Interactively test a tokenizer")
+                .arg(
+                    Arg::with_name("vocab")
+                        .long("vocab")
+                        .value_name("VOCAB_FILE")
+                        .help("Path to the vocab.json file")
+                        .required(true),
+                )
+                .arg(
+                    Arg::with_name("merges")
+                        .long("merges")
+                        .value_name("MERGES_FILE")
+                        .help("Path to the merges.txt file")
+                        .required(true),
+                ),
+        )
+        .get_matches();
+
+    match matches.subcommand() {
+        ("shell", matches) => shell(matches.unwrap()),
+        (subcommand, _) => panic!("Unknown subcommand {}", subcommand),
+    }
 }