Adding rust audit. (#1099)
* Adding rust audit.
* Update clap version + derive_builder (they clashed).
* Ignoring specific CVE which can be ignored: https://github.com/Azure/iot-identity-service/issues/481
* Updating python lock.
* Revert `derive-builder` update.
* Adding back help msg.

.github/workflows/python.yml (vendored, 8 changed lines)

@@ -95,6 +95,14 @@ jobs:
           command: clippy
           args: --manifest-path ./bindings/python/Cargo.toml --all-targets --all-features -- -D warnings
 
+      - name: Run Audit
+        uses: actions-rs/cargo@v1
+        with:
+          command: audit
+          # ignoring specific CVE which probably isn't affecting this crate
+          # https://github.com/chronotope/chrono/issues/602
+          args: -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2020-0071
+
       - name: Install
         working-directory: ./bindings/python
         run: |
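
The new step runs cargo-audit over the Python binding's lock file on every CI run. As a sketch for reproducing it locally (assuming cargo-audit is installed, e.g. via `cargo install cargo-audit`; the flags below are taken verbatim from the step):

    cargo audit -D warnings -f ./bindings/python/Cargo.lock --ignore RUSTSEC-2020-0071

Here `-D warnings` promotes audit warnings to hard errors, `-f` points at the Cargo.lock to scan, and `--ignore` skips the one advisory called out in the comments.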
|
.github/workflows/rust.yml (vendored, 8 changed lines)

@@ -81,6 +81,14 @@ jobs:
           command: test
           args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc
 
+      - name: Run Audit
+        uses: actions-rs/cargo@v1
+        with:
+          command: audit
+          # ignoring specific CVE which probably isn't affecting this crate
+          # https://github.com/chronotope/chrono/issues/602
+          args: -D warnings -f ./tokenizers/Cargo.lock --ignore RUSTSEC-2020-0071
+
       # Verify that Readme.md is up to date.
       - name: Make sure, Readme generated from lib.rs matches actual Readme
         if: matrix.os == 'ubuntu-latest'
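
For background on the ignored advisory (context, not part of the diff): RUSTSEC-2020-0071 is the potential-segfault advisory against the old time 0.1 series, which chrono 0.4 pulls in as a transitive dependency; the linked chrono issue tracks that dependency, and as the comment says, the affected code path probably isn't reachable from this crate.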
|
bindings/node/native/Cargo.lock (generated, 870 changed lines): file diff suppressed because it is too large.

bindings/python/Cargo.lock (generated, 878 changed lines): file diff suppressed because it is too large.

tokenizers/Cargo.toml

@@ -1,5 +1,5 @@
 [package]
-authors = ["Anthony MOI <m.anthony.moi@gmail.com>"]
+authors = ["Anthony MOI <m.anthony.moi@gmail.com>", "Nicolas Patry <patry.nicolas@protonmail.com>"]
 edition = "2018"
 name = "tokenizers"
 version = "0.13.2"
@@ -48,7 +48,7 @@ rayon = "1.3"
 rayon-cond = "0.1"
 serde = { version = "1.0", features = [ "derive" ] }
 serde_json = "1.0"
-clap = { version = "2.33", optional = true }
+clap = { version = "4.0", features=["derive"], optional = true }
 unicode-normalization-alignments = "0.1"
 unicode_categories = "0.1"
 unicode-segmentation = "1.6"
@@ -77,6 +77,6 @@ cli = ["clap"]
 unstable_wasm = ["fancy-regex", "getrandom/js"]
 
 [dev-dependencies]
-criterion = "0.3"
+criterion = "0.4"
 tempfile = "3.1"
 assert_approx_eq = "1.1"
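
Two dependency bumps do the real work here: clap 2.33 to 4.0 with the `derive` feature (the CLI rewrite below is the matching code change), and criterion 0.3 to 0.4 in dev-dependencies. The criterion bump is routine, since the macro-based bench entry points keep the same shape across these versions; a minimal self-contained sketch of that API (a hypothetical no-op bench, not one from this repo):

    use criterion::{black_box, criterion_group, criterion_main, Criterion};

    // Hypothetical no-op benchmark; a real bench would call into tokenizers.
    fn bench_noop(c: &mut Criterion) {
        c.bench_function("noop", |b| b.iter(|| black_box(1 + 1)));
    }

    criterion_group!(benches, bench_noop);
    criterion_main!(benches);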
|

tokenizers/src/cli.rs

@@ -2,21 +2,32 @@
 //! This is the CLI binary for the Tokenizers project
 //!
 
-use clap::{App, AppSettings, Arg, ArgMatches, SubCommand};
+use clap::{Parser, Subcommand};
 use std::io::{self, BufRead, Write};
 use tokenizers::models::bpe::BPE;
 use tokenizers::pre_tokenizers::byte_level::ByteLevel;
 use tokenizers::tokenizer::{AddedToken, Result};
 use tokenizers::Tokenizer;
 
-fn shell(matches: &ArgMatches) -> Result<()> {
-    let vocab = matches
-        .value_of("vocab")
-        .expect("Must give a vocab.json file");
-    let merges = matches
-        .value_of("merges")
-        .expect("Must give a merges.txt file");
-
+/// Generate custom Tokenizers or use existing ones
+#[derive(Parser, Debug)]
+#[command(author, version)]
+struct Args {
+    #[command(subcommand)]
+    command: Command,
+}
+
+#[derive(Subcommand, Debug)]
+enum Command {
+    Shell {
+        /// Path to the vocab.json file
+        vocab: String,
+        /// Path to the merges.txt file
+        merges: String,
+    },
+}
+
+fn shell(vocab: &str, merges: &str) -> Result<()> {
     let bpe = BPE::from_file(vocab, merges).build()?;
     let mut tokenizer = Tokenizer::new(bpe);
     tokenizer
@@ -55,33 +66,8 @@ fn shell(matches: &ArgMatches) -> Result<()> {
 }
 
 fn main() -> Result<()> {
-    let matches = App::new("tokenizers")
-        .version("0.0.1")
-        .author("Anthony M. <anthony@huggingface.co>")
-        .about("Generate custom Tokenizers or use existing ones")
-        .setting(AppSettings::SubcommandRequiredElseHelp)
-        .subcommand(
-            SubCommand::with_name("shell")
-                .about("Interactively test a tokenizer")
-                .arg(
-                    Arg::with_name("vocab")
-                        .long("vocab")
-                        .value_name("VOCAB_FILE")
-                        .help("Path to the vocab.json file")
-                        .required(true),
-                )
-                .arg(
-                    Arg::with_name("merges")
-                        .long("merges")
-                        .value_name("MERGES_FILE")
-                        .help("Path to the merges.txt file")
-                        .required(true),
-                ),
-        )
-        .get_matches();
-
-    match matches.subcommand() {
-        ("shell", matches) => shell(matches.unwrap()),
-        (subcommand, _) => panic!("Unknown subcommand {}", subcommand),
-    }
+    let args = Args::parse();
+    match args.command {
+        Command::Shell { vocab, merges } => shell(&vocab, &merges),
+    }
 }
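
Two practical consequences of the derive rewrite are worth spelling out. First, doc comments on the fields (`/// Path to the vocab.json file`) become the generated help text, which is how the help messages survive the migration (the "Adding back help msg" bullet above). Second, bare `String` fields in a clap derive struct are required positional arguments, so the old `--vocab`/`--merges` flags become positionals: the binary is now invoked as `shell vocab.json merges.txt` instead of `shell --vocab vocab.json --merges merges.txt`. A standalone sketch of the same pattern (compilable without the tokenizers crate, assuming clap 4 with the `derive` feature; names mirror the diff):

    use clap::{Parser, Subcommand};

    /// Generate custom Tokenizers or use existing ones
    #[derive(Parser, Debug)]
    #[command(author, version)]
    struct Args {
        #[command(subcommand)]
        command: Command,
    }

    #[derive(Subcommand, Debug)]
    enum Command {
        /// Interactively test a tokenizer
        Shell {
            /// Path to the vocab.json file
            vocab: String,
            /// Path to the merges.txt file
            merges: String,
        },
    }

    fn main() {
        let args = Args::parse();
        match args.command {
            // `vocab` and `merges` are positional: `shell vocab.json merges.txt`
            Command::Shell { vocab, merges } => {
                println!("shell: vocab={vocab}, merges={merges}");
            }
        }
    }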
|