Mirror of https://github.com/mii443/tokenizers.git
Synced 2025-08-22 08:15:49 +00:00
chore: Remove CLI - this was originally intended for local development (#1442)
@@ -86,5 +86,5 @@ Simple checklist on how to make releases for `tokenizers`.
 If you want to make modifications to the CI/CD of the release GH actions, you need
 to :
 - **Comment the part that uploads the artifacts** to `crates.io`, `PyPi` or `npm`.
-- Change the trigger mecanism so it can trigger every time you push to your branch.
+- Change the trigger mechanism so it can trigger every time you push to your branch.
 - Keep pushing your changes until the artifacts are properly created.
@@ -20,12 +20,6 @@ name = "tokenizers"
 path = "src/lib.rs"
 bench = false
 
-[[bin]]
-name = "cli"
-path = "src/cli.rs"
-bench = false
-required-features = ["cli"]
-
 [[bench]]
 name = "bpe_benchmark"
 harness = false
@@ -52,14 +46,13 @@ rayon = "1.8"
 rayon-cond = "0.3"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
-clap = { version = "4.4", features=["derive"], optional = true }
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
 unicode-segmentation = "1.10"
 indicatif = {version = "0.17", optional = true}
 itertools = "0.12"
 log = "0.4"
-derive_builder = "0.12"
+derive_builder = "0.13"
 spm_precompiled = "0.1"
 hf-hub = { version = "0.3.2", optional = true }
 aho-corasick = "1.1"
@@ -72,11 +65,10 @@ esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
 monostate = "0.1.9"
 
 [features]
-default = ["progressbar", "cli", "onig", "esaxx_fast"]
+default = ["progressbar", "onig", "esaxx_fast"]
 esaxx_fast = ["esaxx-rs/cpp"]
 progressbar = ["indicatif"]
 http = ["hf-hub"]
-cli = ["clap"]
 unstable_wasm = ["fancy-regex", "getrandom/js"]
 
 [dev-dependencies]
@@ -1,73 +0,0 @@
-//!
-//! This is the CLI binary for the Tokenizers project
-//!
-
-use clap::{Parser, Subcommand};
-use std::io::{self, BufRead, Write};
-use tokenizers::models::bpe::BPE;
-use tokenizers::pre_tokenizers::byte_level::ByteLevel;
-use tokenizers::tokenizer::{AddedToken, Result};
-use tokenizers::Tokenizer;
-
-/// Generate custom Tokenizers or use existing ones
-#[derive(Parser, Debug)]
-#[command(author, version)]
-struct Args {
-    #[command(subcommand)]
-    command: Command,
-}
-
-#[derive(Subcommand, Debug)]
-enum Command {
-    Shell {
-        /// Path to the vocab.json file
-        vocab: String,
-        /// Path to the merges.txt file
-        merges: String,
-    },
-}
-
-fn shell(vocab: &str, merges: &str) -> Result<()> {
-    let bpe = BPE::from_file(vocab, merges).build()?;
-    let mut tokenizer = Tokenizer::new(bpe);
-    tokenizer
-        .with_pre_tokenizer(ByteLevel::default())
-        .with_decoder(ByteLevel::default());
-
-    tokenizer.add_tokens(&[AddedToken::from(String::from("ing"), false).single_word(false)]);
-    tokenizer
-        .add_special_tokens(&[AddedToken::from(String::from("[ENT]"), true).single_word(true)]);
-
-    let stdin = io::stdin();
-    let mut handle = stdin.lock();
-    let mut buffer = String::new();
-
-    loop {
-        buffer.clear();
-
-        print!("\nEnter some text to tokenize:\n> ");
-        io::stdout().flush()?;
-        handle.read_line(&mut buffer)?;
-        let buffer = buffer.trim_end();
-
-        let timer = std::time::Instant::now();
-        let encoded = tokenizer.encode(buffer.to_owned(), false)?;
-        let elapsed = timer.elapsed();
-        println!("\nInput:\t\t{}", buffer);
-        println!("Tokens:\t\t{:?}", encoded.get_tokens());
-        println!("IDs:\t\t{:?}", encoded.get_ids());
-        println!("Offsets:\t{:?}", encoded.get_offsets());
-        println!(
-            "Decoded:\t{}",
-            tokenizer.decode(encoded.get_ids(), true).unwrap()
-        );
-        println!("Tokenized in {:?}", elapsed);
-    }
-}
-
-fn main() -> Result<()> {
-    let args = Args::parse();
-    match args.command {
-        Command::Shell { vocab, merges } => shell(&vocab, &merges),
-    }
-}
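With this change the `cli` binary target, the `clap` dependency, and the `cli` feature are gone, so the interactive shell above can no longer be built from the workspace (before this commit it could presumably be run with something like `cargo run --bin cli --features cli -- shell vocab.json merges.txt`, given the `[[bin]]` and `required-features` entries removed above). Anyone who still wants that local tooling can recreate its core in a separate crate that depends on `tokenizers`. Below is a minimal, non-interactive sketch of that flow; the file paths and sample sentence are placeholders, and the calls simply mirror the deleted `src/cli.rs`.

```rust
// Minimal sketch, assuming a separate crate with `tokenizers` as a dependency
// and local BPE files; paths and the sample string are hypothetical.
use tokenizers::models::bpe::BPE;
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::tokenizer::Result;
use tokenizers::Tokenizer;

fn main() -> Result<()> {
    // Build a BPE model from local vocab/merges files (placeholder paths).
    let bpe = BPE::from_file("vocab.json", "merges.txt").build()?;
    let mut tokenizer = Tokenizer::new(bpe);
    tokenizer
        .with_pre_tokenizer(ByteLevel::default())
        .with_decoder(ByteLevel::default());

    // Encode a sample string, then round-trip the ids back through the decoder,
    // as the removed shell loop did for each line read from stdin.
    let encoded = tokenizer.encode("Hello, tokenizers!", false)?;
    println!("Tokens:  {:?}", encoded.get_tokens());
    println!("IDs:     {:?}", encoded.get_ids());
    println!("Decoded: {}", tokenizer.decode(encoded.get_ids(), true)?);
    Ok(())
}
```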