Basic CLI for testing
tokenizers/Cargo.lock (generated)
@@ -469,6 +469,7 @@ dependencies = [
 name = "tokenizers-lib"
 version = "0.0.1"
 dependencies = [
+ "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "onig 5.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
tokenizers/Cargo.toml
@@ -12,6 +12,7 @@ lazy_static = "1.3.0"
 onig = "5.0.0"
 rayon = "1.2.0"
 serde_json = "1.0"
+clap = "2.33.0"
 
 [lib]
 name = "tokenizers"
tokenizers/src/main.rs
@@ -1,11 +1,85 @@
-/// This is the CLI binary for the Tokenizers project
-use tokenizers::WhitespaceTokenizer;
-
-fn main() {
-    let s = "Hey man!";
-    println!(
-        "Tokenizing {:?} gives {:?}",
-        s,
-        WhitespaceTokenizer::tokenize(&s)
-    );
-}
+//!
+//! This is the CLI binary for the Tokenizers project
+//!
+
+use clap::{App, AppSettings, Arg, ArgMatches, SubCommand};
+use std::io::{self, BufRead, Write};
+use tokenizers::models::bpe::{Error, BPE};
+use tokenizers::pre_tokenizers::byte_level::ByteLevel;
+use tokenizers::tokenizer::Tokenizer;
+
+fn shell(matches: &ArgMatches) -> Result<(), Error> {
+    let vocab = matches
+        .value_of("vocab")
+        .expect("Must give a vocab.json file");
+    let merges = matches
+        .value_of("merges")
+        .expect("Must give a merges.txt file");
+
+    let bpe = BPE::from_files(vocab, merges)?;
+    let mut tokenizer = Tokenizer::new(Box::new(bpe));
+    tokenizer.with_pre_tokenizer(Box::new(ByteLevel));
+
+    let stdin = io::stdin();
+    let mut handle = stdin.lock();
+    let mut buffer = String::new();
+
+    loop {
+        buffer.clear();
+
+        print!("\nEnter some text to tokenize:\n> ");
+        io::stdout().flush()?;
+        handle.read_line(&mut buffer)?;
+        let buffer = buffer.trim_end();
+
+        let timer = std::time::Instant::now();
+        let encoded = tokenizer.encode(buffer);
+        let elapsed = timer.elapsed();
+        println!("\nInput:\t\t{}", buffer);
+        println!(
+            "Tokens:\t\t{:?}",
+            encoded.iter().map(|t| &t.value).collect::<Vec<_>>()
+        );
+        println!(
+            "IDs:\t\t{:?}",
+            encoded.iter().map(|t| t.id).collect::<Vec<_>>()
+        );
+        println!(
+            "Offsets:\t{:?}",
+            encoded.iter().map(|t| t.offsets).collect::<Vec<_>>()
+        );
+        println!("Tokenized in {:?}", elapsed);
+    }
+}
+
+fn main() -> Result<(), Error> {
+    let matches = App::new("tokenizers")
+        .version("0.0.1")
+        .author("Anthony M. <anthony@huggingface.co>")
+        .about("Generate custom Tokenizers or use existing ones")
+        .setting(AppSettings::SubcommandRequiredElseHelp)
+        .subcommand(
+            SubCommand::with_name("shell")
+                .about("Interactively test a tokenizer")
+                .arg(
+                    Arg::with_name("vocab")
+                        .long("vocab")
+                        .value_name("VOCAB_FILE")
+                        .help("Path to the vocab.json file")
+                        .required(true),
+                )
+                .arg(
+                    Arg::with_name("merges")
+                        .long("merges")
+                        .value_name("MERGES_FILE")
+                        .help("Path to the merges.txt file")
+                        .required(true),
+                ),
+        )
+        .get_matches();
+
+    match matches.subcommand() {
+        ("shell", matches) => shell(matches.unwrap()),
+        (subcommand, _) => panic!("Unknown subcommand {}", subcommand),
+    }
+}
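For context, here is a minimal non-interactive sketch of the same pipeline the `shell` subcommand builds, using only the calls that appear in this diff (`BPE::from_files`, `Tokenizer::new`, `with_pre_tokenizer`, `encode`) against the 0.0.1 API of this commit, not the current tokenizers crate. The vocab/merges paths and the input string are placeholders; the `Token` fields (`value`, `id`, `offsets`) are read exactly as the shell loop prints them.

// A minimal sketch, assuming the 0.0.1 API shown in the diff above.
use tokenizers::models::bpe::{Error, BPE};
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::tokenizer::Tokenizer;

fn main() -> Result<(), Error> {
    // Build the same pipeline as the `shell` subcommand: a BPE model
    // loaded from its vocab/merges files, wrapped in a Tokenizer with
    // byte-level pre-tokenization. The file paths are placeholders.
    let bpe = BPE::from_files("vocab.json", "merges.txt")?;
    let mut tokenizer = Tokenizer::new(Box::new(bpe));
    tokenizer.with_pre_tokenizer(Box::new(ByteLevel));

    // Encode a fixed string instead of reading lines from stdin.
    let encoded = tokenizer.encode("Hey there!");
    for token in &encoded {
        // `value`, `id`, and `offsets` are the fields the shell loop prints.
        println!("{:?}\t{:?}\t{:?}", token.value, token.id, token.offsets);
    }
    Ok(())
}

The binary itself would be exercised with something like `cargo run -- shell --vocab vocab.json --merges merges.txt`, matching the subcommand and required flags defined in `main`.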