Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 08:15:49 +00:00
chore: Remove CLI - this was originally intended for local development (#1442)
@@ -86,5 +86,5 @@ Simple checklist on how to make releases for `tokenizers`.
 If you want to make modifications to the CI/CD of the release GH actions, you need
 to :
 - **Comment the part that uploads the artifacts** to `crates.io`, `PyPi` or `npm`.
-- Change the trigger mecanism so it can trigger every time you push to your branch.
+- Change the trigger mechanism so it can trigger every time you push to your branch.
 - Keep pushing your changes until the artifacts are properly created.
@@ -20,12 +20,6 @@ name = "tokenizers"
 path = "src/lib.rs"
 bench = false

-[[bin]]
-name = "cli"
-path = "src/cli.rs"
-bench = false
-required-features = ["cli"]
-
 [[bench]]
 name = "bpe_benchmark"
 harness = false
@@ -52,14 +46,13 @@ rayon = "1.8"
 rayon-cond = "0.3"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
-clap = { version = "4.4", features=["derive"], optional = true }
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
 unicode-segmentation = "1.10"
 indicatif = {version = "0.17", optional = true}
 itertools = "0.12"
 log = "0.4"
-derive_builder = "0.12"
+derive_builder = "0.13"
 spm_precompiled = "0.1"
 hf-hub = { version = "0.3.2", optional = true }
 aho-corasick = "1.1"
@@ -72,11 +65,10 @@ esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
 monostate = "0.1.9"

 [features]
-default = ["progressbar", "cli", "onig", "esaxx_fast"]
+default = ["progressbar", "onig", "esaxx_fast"]
 esaxx_fast = ["esaxx-rs/cpp"]
 progressbar = ["indicatif"]
 http = ["hf-hub"]
-cli = ["clap"]
 unstable_wasm = ["fancy-regex", "getrandom/js"]

 [dev-dependencies]
@@ -1,73 +0,0 @@
-//!
-//! This is the CLI binary for the Tokenizers project
-//!
-
-use clap::{Parser, Subcommand};
-use std::io::{self, BufRead, Write};
-use tokenizers::models::bpe::BPE;
-use tokenizers::pre_tokenizers::byte_level::ByteLevel;
-use tokenizers::tokenizer::{AddedToken, Result};
-use tokenizers::Tokenizer;
-
-/// Generate custom Tokenizers or use existing ones
-#[derive(Parser, Debug)]
-#[command(author, version)]
-struct Args {
-    #[command(subcommand)]
-    command: Command,
-}
-
-#[derive(Subcommand, Debug)]
-enum Command {
-    Shell {
-        /// Path to the vocab.json file
-        vocab: String,
-        /// Path to the merges.txt file
-        merges: String,
-    },
-}
-
-fn shell(vocab: &str, merges: &str) -> Result<()> {
-    let bpe = BPE::from_file(vocab, merges).build()?;
-    let mut tokenizer = Tokenizer::new(bpe);
-    tokenizer
-        .with_pre_tokenizer(ByteLevel::default())
-        .with_decoder(ByteLevel::default());
-
-    tokenizer.add_tokens(&[AddedToken::from(String::from("ing"), false).single_word(false)]);
-    tokenizer
-        .add_special_tokens(&[AddedToken::from(String::from("[ENT]"), true).single_word(true)]);
-
-    let stdin = io::stdin();
-    let mut handle = stdin.lock();
-    let mut buffer = String::new();
-
-    loop {
-        buffer.clear();
-
-        print!("\nEnter some text to tokenize:\n> ");
-        io::stdout().flush()?;
-        handle.read_line(&mut buffer)?;
-        let buffer = buffer.trim_end();
-
-        let timer = std::time::Instant::now();
-        let encoded = tokenizer.encode(buffer.to_owned(), false)?;
-        let elapsed = timer.elapsed();
-        println!("\nInput:\t\t{}", buffer);
-        println!("Tokens:\t\t{:?}", encoded.get_tokens());
-        println!("IDs:\t\t{:?}", encoded.get_ids());
-        println!("Offsets:\t{:?}", encoded.get_offsets());
-        println!(
-            "Decoded:\t{}",
-            tokenizer.decode(encoded.get_ids(), true).unwrap()
-        );
-        println!("Tokenized in {:?}", elapsed);
-    }
-}
-
-fn main() -> Result<()> {
-    let args = Args::parse();
-    match args.command {
-        Command::Shell { vocab, merges } => shell(&vocab, &merges),
-    }
-}
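The removed `shell` subcommand can still be reproduced as a small standalone program built against the `tokenizers` library. Below is a minimal sketch based on the deleted CLI source (the `src/cli.rs` referenced by the removed `[[bin]]` entry): it drops the clap argument parsing in favour of plain `std::env::args`, so no extra dependencies are needed, and the method calls mirror the library API as used in that file at the time of this commit; signatures may differ in newer releases, and the crate/binary setup is left to the reader.

```rust
// Minimal standalone replacement for the removed `shell` subcommand (a sketch,
// not part of the tokenizers crate). Paths come from argv instead of clap.
use std::io::{self, BufRead, Write};

use tokenizers::models::bpe::BPE;
use tokenizers::pre_tokenizers::byte_level::ByteLevel;
use tokenizers::tokenizer::Result;
use tokenizers::Tokenizer;

fn main() -> Result<()> {
    let mut args = std::env::args().skip(1);
    let vocab = args.next().expect("usage: shell <vocab.json> <merges.txt>");
    let merges = args.next().expect("usage: shell <vocab.json> <merges.txt>");

    // Same setup the removed binary used: a BPE model with byte-level
    // pre-tokenization and decoding.
    let bpe = BPE::from_file(&vocab, &merges).build()?;
    let mut tokenizer = Tokenizer::new(bpe);
    tokenizer
        .with_pre_tokenizer(ByteLevel::default())
        .with_decoder(ByteLevel::default());

    let stdin = io::stdin();
    let mut handle = stdin.lock();
    let mut buffer = String::new();

    loop {
        buffer.clear();
        print!("\nEnter some text to tokenize:\n> ");
        io::stdout().flush()?;
        if handle.read_line(&mut buffer)? == 0 {
            return Ok(()); // EOF: exit cleanly instead of looping forever
        }
        let input = buffer.trim_end();

        let encoded = tokenizer.encode(input.to_owned(), false)?;
        println!("Tokens:  {:?}", encoded.get_tokens());
        println!("IDs:     {:?}", encoded.get_ids());
        println!("Offsets: {:?}", encoded.get_offsets());
        println!("Decoded: {}", tokenizer.decode(encoded.get_ids(), true)?);
    }
}
```

From a crate that depends on `tokenizers`, this can be run against a local vocab/merges pair, e.g. `cargo run -- vocab.json merges.txt`.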