chore: Remove CLI - this was originally intended for local development (#1442)

Author: Bryant Biggs
Date: 2024-02-12 22:05:43 -05:00
Committed by: GitHub
Parent: 7f49f20ab0
Commit: 72a1973cd1
3 changed files with 3 additions and 84 deletions

RELEASE.md

@@ -86,5 +86,5 @@ Simple checklist on how to make releases for `tokenizers`.
 If you want to make modifications to the CI/CD of the release GH actions, you need
 to :
 - **Comment the part that uploads the artifacts** to `crates.io`, `PyPi` or `npm`.
-- Change the trigger mecanism so it can trigger every time you push to your branch.
+- Change the trigger mechanism so it can trigger every time you push to your branch.
 - Keep pushing your changes until the artifacts are properly created.
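
The trigger change described in the checklist above usually means swapping the workflow's tag/release trigger for a plain branch push while testing. A generic sketch of such a trigger, assuming a standard GitHub Actions workflow; the file path and branch name are placeholders, not taken from this repository:

```yaml
# .github/workflows/release.yml (hypothetical path)
# Temporary trigger for testing: run on every push to the working branch
# instead of on tags/releases. Revert before merging.
on:
  push:
    branches:
      - my-release-test-branch  # placeholder branch name
```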

tokenizers/Cargo.toml

@@ -20,12 +20,6 @@ name = "tokenizers"
 path = "src/lib.rs"
 bench = false
-
-[[bin]]
-name = "cli"
-path = "src/cli.rs"
-bench = false
-required-features = ["cli"]
 
 [[bench]]
 name = "bpe_benchmark"
 harness = false
@@ -52,14 +46,13 @@ rayon = "1.8"
 rayon-cond = "0.3"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
-clap = { version = "4.4", features=["derive"], optional = true }
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
 unicode-segmentation = "1.10"
 indicatif = {version = "0.17", optional = true}
 itertools = "0.12"
 log = "0.4"
-derive_builder = "0.12"
+derive_builder = "0.13"
 spm_precompiled = "0.1"
 hf-hub = { version = "0.3.2", optional = true }
 aho-corasick = "1.1"
@@ -72,11 +65,10 @@ esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
 monostate = "0.1.9"
 
 [features]
-default = ["progressbar", "cli", "onig", "esaxx_fast"]
+default = ["progressbar", "onig", "esaxx_fast"]
 esaxx_fast = ["esaxx-rs/cpp"]
 progressbar = ["indicatif"]
 http = ["hf-hub"]
-cli = ["clap"]
 unstable_wasm = ["fancy-regex", "getrandom/js"]
 
 [dev-dependencies]
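
Because the `cli` feature is deleted outright rather than just dropped from the defaults, any downstream crate that opted into it will now fail to resolve with an unknown-feature error. A sketch of the consumer-side fix; the version number is a placeholder:

```toml
[dependencies]
# Before this commit a consumer could opt in to the CLI feature:
# tokenizers = { version = "0.15", features = ["cli"] }
# After it, the feature no longer exists, so the flag must be dropped:
tokenizers = { version = "0.15" }
```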

tokenizers/src/cli.rs

@@ -1,73 +0,0 @@
-//!
-//! This is the CLI binary for the Tokenizers project
-//!
-use clap::{Parser, Subcommand};
-use std::io::{self, BufRead, Write};
-use tokenizers::models::bpe::BPE;
-use tokenizers::pre_tokenizers::byte_level::ByteLevel;
-use tokenizers::tokenizer::{AddedToken, Result};
-use tokenizers::Tokenizer;
-
-/// Generate custom Tokenizers or use existing ones
-#[derive(Parser, Debug)]
-#[command(author, version)]
-struct Args {
-    #[command(subcommand)]
-    command: Command,
-}
-
-#[derive(Subcommand, Debug)]
-enum Command {
-    Shell {
-        /// Path to the vocab.json file
-        vocab: String,
-        /// Path to the merges.txt file
-        merges: String,
-    },
-}
-
-fn shell(vocab: &str, merges: &str) -> Result<()> {
-    let bpe = BPE::from_file(vocab, merges).build()?;
-    let mut tokenizer = Tokenizer::new(bpe);
-    tokenizer
-        .with_pre_tokenizer(ByteLevel::default())
-        .with_decoder(ByteLevel::default());
-
-    tokenizer.add_tokens(&[AddedToken::from(String::from("ing"), false).single_word(false)]);
-    tokenizer
-        .add_special_tokens(&[AddedToken::from(String::from("[ENT]"), true).single_word(true)]);
-
-    let stdin = io::stdin();
-    let mut handle = stdin.lock();
-    let mut buffer = String::new();
-
-    loop {
-        buffer.clear();
-        print!("\nEnter some text to tokenize:\n> ");
-        io::stdout().flush()?;
-        handle.read_line(&mut buffer)?;
-        let buffer = buffer.trim_end();
-
-        let timer = std::time::Instant::now();
-        let encoded = tokenizer.encode(buffer.to_owned(), false)?;
-        let elapsed = timer.elapsed();
-
-        println!("\nInput:\t\t{}", buffer);
-        println!("Tokens:\t\t{:?}", encoded.get_tokens());
-        println!("IDs:\t\t{:?}", encoded.get_ids());
-        println!("Offsets:\t{:?}", encoded.get_offsets());
-        println!(
-            "Decoded:\t{}",
-            tokenizer.decode(encoded.get_ids(), true).unwrap()
-        );
-        println!("Tokenized in {:?}", elapsed);
-    }
-}
-
-fn main() -> Result<()> {
-    let args = Args::parse();
-
-    match args.command {
-        Command::Shell { vocab, merges } => shell(&vocab, &merges),
-    }
-}
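
For anyone who relied on the removed binary, the same interactive shell can live on as a small standalone program that depends on the published crate. A minimal sketch based on the deleted file, with `clap` swapped for plain `std::env::args`; it assumes the same crate API the deleted code compiled against:

```rust
//! Standalone reproduction of the removed `tokenizers` shell.
use std::env;
use std::io::{self, BufRead, Write};

use tokenizers::models::bpe::BPE;
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::tokenizer::Result;
use tokenizers::Tokenizer;

fn main() -> Result<()> {
    // Positional arguments replace the old `clap` subcommand.
    let mut args = env::args().skip(1);
    let usage = "usage: shell <vocab.json> <merges.txt>";
    let vocab = args.next().expect(usage);
    let merges = args.next().expect(usage);

    // Same setup as the deleted `shell` function: byte-level BPE.
    let bpe = BPE::from_file(&vocab, &merges).build()?;
    let mut tokenizer = Tokenizer::new(bpe);
    tokenizer
        .with_pre_tokenizer(ByteLevel::default())
        .with_decoder(ByteLevel::default());

    let stdin = io::stdin();
    let mut handle = stdin.lock();
    let mut buffer = String::new();
    loop {
        buffer.clear();
        print!("\nEnter some text to tokenize:\n> ");
        io::stdout().flush()?;
        handle.read_line(&mut buffer)?;
        let input = buffer.trim_end();

        let encoded = tokenizer.encode(input.to_owned(), false)?;
        println!("Tokens:\t{:?}", encoded.get_tokens());
        println!("IDs:\t{:?}", encoded.get_ids());
        println!("Decoded:\t{}", tokenizer.decode(encoded.get_ids(), true)?);
    }
}
```

Run it with the vocab and merges paths as arguments, e.g. `cargo run -- vocab.json merges.txt`.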