Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Adding rust audit. (#1099)

* Adding rust audit.
* Update clap version + derive_builder (they clashed).
* Ignoring specific CVE which can be ignored https://github.com/Azure/iot-identity-service/issues/481
* Updating python lock.
* Revert `derive-builder` update.
* Adding back help msg.
.github/workflows/python.yml (vendored, 8 changed lines):
@@ -95,6 +95,14 @@ jobs:
           command: clippy
           args: --manifest-path ./bindings/python/Cargo.toml --all-targets --all-features -- -D warnings

+      - name: Run Audit
+        uses: actions-rs/cargo@v1
+        with:
+          command: audit
+          # ignoring specific CVE which probably isn't affecting this crate
+          # https://github.com/chronotope/chrono/issues/602
+          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2020-0071
+
       - name: Install
         working-directory: ./bindings/python
         run: |
.github/workflows/rust.yml (vendored, 8 changed lines):
@@ -81,6 +81,14 @@ jobs:
           command: test
           args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc

+      - name: Run Audit
+        uses: actions-rs/cargo@v1
+        with:
+          command: audit
+          # ignoring specific CVE which probably isn't affecting this crate
+          # https://github.com/chronotope/chrono/issues/602
+          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2020-0071
+
       # Verify that Readme.md is up to date.
       - name: Make sure, Readme generated from lib.rs matches actual Readme
         if: matrix.os == 'ubuntu-latest'
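Both workflows gain the same audit step: `cargo audit` is run against the pinned lock file, `-D warnings` turns any advisory into a hard failure, and `--ignore RUSTSEC-2020-0071` whitelists the one advisory the comments flag as likely not affecting this crate (it enters the tree through chrono, per the linked issue). Assuming `cargo-audit` is installed locally (`cargo install cargo-audit`), the same check can be reproduced with the exact `args` shown, e.g. `cargo audit -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2020-0071`.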
bindings/node/native/Cargo.lock (generated, 870 changed lines): diff suppressed because it is too large.
bindings/python/Cargo.lock (generated, 878 changed lines): diff suppressed because it is too large.
tokenizers/Cargo.toml:

@@ -1,5 +1,5 @@
 [package]
-authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
+authors = ["Anthony MOI <m.anthony.moi@gmail.com>", "Nicolas Patry <patry.nicolas@protonmail.com>"]
 edition = "2018"
 name = "tokenizers"
 version = "0.13.2"
@@ -48,7 +48,7 @@ rayon = "1.3"
 rayon-cond = "0.1"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
-clap = { version = "2.33", optional = true }
+clap = { version = "4.0", features=["derive"], optional = true }
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
 unicode-segmentation = "1.6"
@@ -77,6 +77,6 @@ cli = ["clap"]
 unstable_wasm = ["fancy-regex", "getrandom/js"]

 [dev-dependencies]
-criterion = "0.3"
+criterion = "0.4"
 tempfile = "3.1"
 assert_approx_eq = "1.1"
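The criterion bump from 0.3 to 0.4 is a dev-only change affecting the bench suite. As a reference point, a minimal criterion 0.4 benchmark looks like the sketch below; the benchmark name and workload are made up for illustration, not taken from this repository.

use criterion::{criterion_group, criterion_main, Criterion};

// Hypothetical benchmark for illustration; not part of this commit.
fn bench_split(c: &mut Criterion) {
    c.bench_function("split_whitespace", |b| {
        b.iter(|| "the quick brown fox".split_whitespace().count())
    });
}

criterion_group!(benches, bench_split);
criterion_main!(benches);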
tokenizers/src/cli.rs:

@@ -2,21 +2,32 @@
 //! This is the CLI binary for the Tokenizers project
 //!

-use clap::{App, AppSettings, Arg, ArgMatches, SubCommand};
+use clap::{Parser, Subcommand};
 use std::io::{self, BufRead, Write};
 use tokenizers::models::bpe::BPE;
 use tokenizers::pre_tokenizers::byte_level::ByteLevel;
 use tokenizers::tokenizer::{AddedToken, Result};
 use tokenizers::Tokenizer;

-fn shell(matches: &ArgMatches) -> Result<()> {
-    let vocab = matches
-        .value_of("vocab")
-        .expect("Must give a vocab.json file");
-    let merges = matches
-        .value_of("merges")
-        .expect("Must give a merges.txt file");
-
+/// Generate custom Tokenizers or use existing ones
+#[derive(Parser, Debug)]
+#[command(author, version)]
+struct Args {
+    #[command(subcommand)]
+    command: Command,
+}
+
+#[derive(Subcommand, Debug)]
+enum Command {
+    Shell {
+        /// Path to the vocab.json file
+        vocab: String,
+        /// Path to the merges.txt file
+        merges: String,
+    },
+}
+
+fn shell(vocab: &str, merges: &str) -> Result<()> {
     let bpe = BPE::from_file(vocab, merges).build()?;
     let mut tokenizer = Tokenizer::new(bpe);
     tokenizer
@@ -55,33 +66,8 @@ fn shell(matches: &ArgMatches) -> Result<()> {
 }

 fn main() -> Result<()> {
-    let matches = App::new("tokenizers")
-        .version("0.0.1")
-        .author("Anthony M. <anthony@huggingface.co>")
-        .about("Generate custom Tokenizers or use existing ones")
-        .setting(AppSettings::SubcommandRequiredElseHelp)
-        .subcommand(
-            SubCommand::with_name("shell")
-                .about("Interactively test a tokenizer")
-                .arg(
-                    Arg::with_name("vocab")
-                        .long("vocab")
-                        .value_name("VOCAB_FILE")
-                        .help("Path to the vocab.json file")
-                        .required(true),
-                )
-                .arg(
-                    Arg::with_name("merges")
-                        .long("merges")
-                        .value_name("MERGES_FILE")
-                        .help("Path to the merges.txt file")
-                        .required(true),
-                ),
-        )
-        .get_matches();
-
-    match matches.subcommand() {
-        ("shell", matches) => shell(matches.unwrap()),
-        (subcommand, _) => panic!("Unknown subcommand {}", subcommand),
-    }
+    let args = Args::parse();
+    match args.command {
+        Command::Shell { vocab, merges } => shell(&vocab, &merges),
+    }
 }
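One behavioral consequence of the derive rewrite above: bare String fields in a clap 4 subcommand become required positional arguments, whereas the old builder code declared named --vocab/--merges flags, so an invocation would now look like `tokenizers shell vocab.json merges.txt`. A minimal sketch, assuming one wanted to keep the old flag-style interface instead, would add #[arg(long)] to each field:

// Hypothetical variant, not part of this commit: keeping the clap 2 style
// named flags (--vocab / --merges) under the clap 4 derive API.
use clap::{Parser, Subcommand};

#[derive(Parser, Debug)]
#[command(author, version)]
struct Args {
    #[command(subcommand)]
    command: Command,
}

#[derive(Subcommand, Debug)]
enum Command {
    /// Interactively test a tokenizer
    Shell {
        /// Path to the vocab.json file
        #[arg(long)]
        vocab: String,
        /// Path to the merges.txt file
        #[arg(long)]
        merges: String,
    },
}

fn main() {
    let args = Args::parse();
    match args.command {
        Command::Shell { vocab, merges } => {
            println!("would load {vocab} and {merges}");
        }
    }
}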