diff --git a/tokenizers/Cargo.lock b/tokenizers/Cargo.lock index c9b2f499..79c664a5 100644 --- a/tokenizers/Cargo.lock +++ b/tokenizers/Cargo.lock @@ -25,6 +25,11 @@ dependencies = [ "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "autocfg" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "backtrace" version = "0.3.40" @@ -119,6 +124,60 @@ dependencies = [ "vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "crossbeam-deque" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "memoffset 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-queue" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "crossbeam-utils" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)", + "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "either" +version = "1.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "env_logger" version = "0.6.2" @@ -152,6 +211,14 @@ name = "glob" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "hermit-abi" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "humantime" version = "1.3.0" @@ -192,6 +259,14 @@ name = "memchr" version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "memoffset" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "nom" version = "4.2.3" @@ -201,6 +276,15 @@ dependencies = [ "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "num_cpus" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "hermit-abi 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "onig" version = "5.0.0" @@ -253,6 +337,28 @@ dependencies = [ "proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "rayon" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", + "either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon-core 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "rayon-core" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "num_cpus 1.11.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "regex" version = "1.3.1" @@ -274,6 +380,32 @@ name = "rustc-demangle" version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "rustc_version" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "scopeguard" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "semver" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "semver-parser" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "shlex" version = "0.1.1" @@ -310,10 +442,11 @@ dependencies = [ [[package]] name = "tokenizers-lib" -version = "0.1.0" +version = "0.0.1" dependencies = [ "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", "onig 5.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -385,6 +518,7 @@ dependencies = [ "checksum aho-corasick 0.7.6 (registry+https://github.com/rust-lang/crates.io-index)" = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d" "checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" "checksum atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)" = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" +"checksum autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" "checksum backtrace 0.3.40 (registry+https://github.com/rust-lang/crates.io-index)" = "924c76597f0d9ca25d762c25a4d369d51267536465dc5064bdf0eb073ed477ea" "checksum backtrace-sys 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6575f128516de27e3ce99689419835fce9643a9b215a14d2b5b685be018491" "checksum bindgen 0.50.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cb0e5a5f74b2bafe0b39379f616b5975e08bcaca4e779c078d5c31324147e9ba" @@ -395,17 +529,26 @@ dependencies = [ "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" "checksum clang-sys 0.28.1 (registry+https://github.com/rust-lang/crates.io-index)" = "81de550971c976f176130da4b2978d3b524eaa0fd9ac31f3ceb5ae1231fb4853" "checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9" +"checksum crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c3aa945d63861bfe624b55d153a39684da1e8c0bc8fba932f7ee3a3c16cea3ca" +"checksum crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5064ebdbf05ce3cb95e45c8b086f72263f4166b29b97f6baff7ef7fe047b55ac" +"checksum crossbeam-queue 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7c979cd6cfe72335896575c6b5688da489e420d36a27a0b9eb0c73db574b4a4b" +"checksum crossbeam-utils 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "04973fa96e96579258a5091af6003abde64af786b860f18622b82e026cca60e6" +"checksum crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ce446db02cdc3165b94ae73111e570793400d0794e46125cc4056c81cbb039f4" +"checksum either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3" "checksum env_logger 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "aafcde04e90a5226a6443b7aabdb016ba2f8307c847d524724bd9b346dd1a2d3" "checksum failure 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "f8273f13c977665c5db7eb2b99ae520952fe5ac831ae4cd09d80c4c7042b5ed9" "checksum fxhash 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" "checksum glob 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" +"checksum hermit-abi 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "307c3c9f937f38e3534b1d6447ecf090cafcc9744e4a6360e8b037b2cf5af120" "checksum humantime 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" "checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" "checksum libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)" = "1a31a0627fdf1f6a39ec0dd577e101440b7db22672c0901fe00a9a6fbb5c24e8" "checksum libloading 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753" "checksum log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7" "checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e" +"checksum memoffset 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "75189eb85871ea5c2e2c15abbdd541185f63b408415e5051f5cac122d8c774b9" "checksum nom 4.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2ad2a91a8e869eeb30b9cb3119ae87773a8f4ae617f41b1eb9c154b2905f7bd6" +"checksum num_cpus 1.11.1 (registry+https://github.com/rust-lang/crates.io-index)" = "76dac5ed2a876980778b8b85f75a71b6cbf0db0b1232ee12f826bccb00d09d72" "checksum onig 5.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e4e723fc996fff1aeab8f62205f3e8528bf498bdd5eadb2784d2d31f30077947" "checksum onig_sys 69.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0a8d4efbf5f59cece01f539305191485b651acb3785b9d5eef05749f0496514e" "checksum peeking_take_while 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099" @@ -413,9 +556,15 @@ dependencies = [ "checksum proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)" = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759" "checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0" "checksum quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1" +"checksum rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "83a27732a533a1be0a0035a111fe76db89ad312f6f0347004c220c57f209a123" +"checksum rayon-core 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "98dcf634205083b17d0861252431eb2acbfb698ab7478a2d20de07954f47ec7b" "checksum regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dc220bd33bdce8f093101afe22a037b8eb0e5af33592e6a9caafff0d4cb81cbd" "checksum regex-syntax 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)" = "11a7e20d1cce64ef2fed88b66d347f88bd9babb82845b2b858f3edbf59a4f716" "checksum rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783" +"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a" +"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d" +"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403" +"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3" "checksum shlex 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2" "checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" "checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e" diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index 9f595c55..39613ca4 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -1,17 +1,17 @@ [package] -name = "tokenizers-lib" -version = "0.0.1" authors = ["Anthony MOI "] edition = "2018" - -[lib] -name = "tokenizers" -path = "src/lib.rs" - +name = "tokenizers-lib" +version = "0.0.1" [[bin]] name = "cli" path = "src/cli.rs" [dependencies] -onig = "5.0.0" lazy_static = "1.3.0" +onig = "5.0.0" +rayon = "1.2.0" + +[lib] +name = "tokenizers" +path = "src/lib.rs" diff --git a/tokenizers/src/tokenizer.rs b/tokenizers/src/tokenizer.rs new file mode 100644 index 00000000..d1625bf9 --- /dev/null +++ b/tokenizers/src/tokenizer.rs @@ -0,0 +1,124 @@ +//! +//! # Tokenizer module +//! +//! Represents a tokenization pipeline. +//! +//! A Tokenizer is composed of some of the following parts. +//! - Normalizer: Takes care of the text normalization (like unicode normalization). +//! - PreTokenizer: Takes care of the pre tokenization (ie. How to split tokens and pre-process +//! them. +//! - Model: A model encapsulates the tokenization algorithm. (Like BPE, Word base, character +//! based, ...) +//! - PostProcessor: Takes care of the processing after tokenization. (Like truncating, padding, +//! ...) +//! +use rayon::prelude::*; + +/// A Normalizer takes care of pre-processing strings +pub trait Normalizer { + fn normalize(&self, s: &str) -> String; +} + +/// A PreTokenizer takes care of pre-tokenizing strings before this goes to the model +pub trait PreTokenizer { + fn pre_tokenize(&self, s: &str) -> Vec; +} + +/// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram) +pub trait Model { + fn tokenize(&self, tokens: Vec) -> Vec; +} + +/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer. +/// Truncating, Padding, etc... are PostProcessor steps +pub trait PostProcessor { + fn process(&self, tokens: Vec) -> Vec; +} + +/// A Token represents the output of the Tokenizer +#[derive(Debug, PartialEq)] +pub struct Token { + pub id: u32, + pub value: String, + pub offsets: (usize, usize), + // TODO: Find out the best way to define the customizable part (For post processing steps) +} +impl Token { + pub fn new(id: u32, value: String, offsets: (usize, usize)) -> Self { + Token { id, value, offsets } + } +} + +/// +/// ## Tokenizer +/// +/// A Tokenizer is capable of encoding/decoding any text +/// +pub struct Tokenizer { + normalizers: Vec>, + pre_tokenizer: Option>, + model: Box, + post_processors: Vec>, +} + +impl Tokenizer { + /// Instanciate a new Tokenizer, with the given Model + pub fn new(model: Box) -> Self { + Tokenizer { + normalizers: vec![], + pre_tokenizer: None, + model, + post_processors: vec![], + } + } + + /// Set the normalizers + pub fn with_normalizers(&mut self, normalizers: Vec>) -> &Self { + self.normalizers = normalizers; + self + } + + /// Set the pre tokenizer + pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box) -> &Self { + self.pre_tokenizer = Some(pre_tokenizer); + self + } + + /// Set the post processors + pub fn with_post_processors( + &mut self, + post_processors: Vec>, + ) -> &Self { + self.post_processors = post_processors; + self + } + + /// Set the model + pub fn with_model(&mut self, model: Box) -> &Self { + self.model = model; + self + } + + /// Encode the given sentence + pub fn encode(&self, sentence: &str) -> Vec { + let pre_tokenized = match &self.pre_tokenizer { + None => vec![sentence.to_owned()], + Some(pre_tokenizer) => pre_tokenizer.pre_tokenize(sentence), + }; + + self.model.tokenize(pre_tokenized) + } + + /// Encode all the sentences in parallel, using multiple threads + pub fn encode_batch(&self, sentences: Vec<&str>) -> Vec> { + sentences + .par_iter() + .map(|sentence| self.encode(sentence)) + .collect() + } + + /// Decode the given ids, back to a String + pub fn decode(&self, tokens: Vec) -> String { + unimplemented!("Decode is not implemented yet"); + } +}