Node - Basic Tokenizer + BPE

This commit is contained in:
Anthony MOI
2020-01-09 00:04:53 -05:00
parent 13f3fbed30
commit 156d86d91e
8 changed files with 510 additions and 233 deletions

View File

@ -0,0 +1,8 @@
const tokenizers = require('..');
let bpe = tokenizers.models.BPE.from_files(
"../../data/gpt2-vocab.json",
"../../data/gpt2-merges.txt"
);
let tokenizer = new tokenizers.Tokenizer(bpe);
console.log(bpe, tokenizer);

View File

@ -1,8 +1,10 @@
var addon = require('../native');
let s = "Hey man!";
if (typeof process.argv[2] == 'string') {
s = process.argv[2];
exports.Tokenizer = addon.tokenizer_Tokenizer;
exports.models = {
BPE: {
from_files: addon.models_create_BPE_from_files,
empty: addon.models_create_BPE_empty,
},
WordPiece: addon.models_WordPiece,
}
console.log(addon.WhitespaceTokenizer.tokenize(s));

View File

@ -26,46 +26,9 @@ dependencies = [
]
[[package]]
name = "backtrace"
version = "0.3.40"
name = "autocfg"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"backtrace-sys 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
"rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "backtrace-sys"
version = "0.1.32"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cc 1.0.46 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "bindgen"
version = "0.50.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"cexpr 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"clang-sys 0.28.1 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
"fxhash 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"peeking_take_while 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
"proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"shlex 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
"which 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "bitflags"
@ -73,38 +36,23 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "byteorder"
version = "1.3.2"
name = "c2-chacha"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cc"
version = "1.0.46"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "cexpr"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"nom 4.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cfg-if"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "clang-sys"
version = "0.28.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"glob 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
"libloading 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "clap"
version = "2.33.0"
@ -119,52 +67,122 @@ dependencies = [
"vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "clicolors-control"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "console"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"clicolors-control 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encode_unicode 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"termios 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-deque"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-epoch"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memoffset 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-queue"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "crossbeam-utils"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cslice"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "env_logger"
version = "0.6.2"
name = "either"
version = "1.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "encode_unicode"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "getrandom"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)",
"humantime 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)",
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
"wasi 0.9.0+wasi-snapshot-preview1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "failure"
name = "hermit-abi"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"backtrace 0.3.40 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "fxhash"
version = "0.2.1"
name = "indicatif"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"console 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"number_prefix 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "glob"
version = "0.3.0"
name = "itoa"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "humantime"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "lazy_static"
version = "1.4.0"
@ -175,28 +193,19 @@ name = "libc"
version = "0.2.65"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libloading"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cc 1.0.46 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "log"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "memchr"
version = "2.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "memoffset"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "neon"
version = "0.3.3"
@ -241,68 +250,85 @@ version = "0.1.0"
dependencies = [
"neon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"neon-build 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"tokenizers-lib 0.1.0",
"tokenizers 0.0.13",
]
[[package]]
name = "nom"
version = "4.2.3"
name = "num_cpus"
version = "1.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "onig"
version = "5.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"hermit-abi 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
"onig_sys 69.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "onig_sys"
version = "69.2.0"
name = "number_prefix"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "ppv-lite86"
version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rand"
version = "0.7.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bindgen 0.50.1 (registry+https://github.com/rust-lang/crates.io-index)",
"cc 1.0.46 (registry+https://github.com/rust-lang/crates.io-index)",
"pkg-config 0.3.16 (registry+https://github.com/rust-lang/crates.io-index)",
"getrandom 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_chacha 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_hc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "peeking_take_while"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "pkg-config"
version = "0.3.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "proc-macro2"
version = "0.4.30"
name = "rand_chacha"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"c2-chacha 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "quick-error"
version = "1.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "quote"
version = "0.6.13"
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)",
"getrandom 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rayon"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
"either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon-core 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "rayon-core"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-queue 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"num_cpus 1.11.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -322,8 +348,21 @@ version = "0.6.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "rustc-demangle"
version = "0.1.16"
name = "rustc_version"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "ryu"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "scopeguard"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@ -340,8 +379,23 @@ version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "shlex"
version = "0.1.1"
name = "serde"
version = "1.0.104"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "serde_json"
version = "1.0.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"ryu 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.104 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "smallvec"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@ -350,11 +404,11 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "termcolor"
version = "1.0.5"
name = "termios"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"wincolor 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -374,11 +428,27 @@ dependencies = [
]
[[package]]
name = "tokenizers-lib"
version = "0.1.0"
name = "tokenizers"
version = "0.0.13"
dependencies = [
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
"indicatif 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"onig 5.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"regex-syntax 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.44 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-normalization-alignments 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "unicode-normalization-alignments"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"smallvec 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@ -387,8 +457,8 @@ version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "unicode-xid"
version = "0.1.0"
name = "unicode_categories"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@ -397,19 +467,10 @@ version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "version_check"
version = "0.1.5"
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "which"
version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"failure 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi"
version = "0.3.8"
@ -424,82 +485,70 @@ name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "winapi-util"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "wincolor"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
]
[metadata]
"checksum aho-corasick 0.7.6 (registry+https://github.com/rust-lang/crates.io-index)" = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d"
"checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b"
"checksum atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)" = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90"
"checksum backtrace 0.3.40 (registry+https://github.com/rust-lang/crates.io-index)" = "924c76597f0d9ca25d762c25a4d369d51267536465dc5064bdf0eb073ed477ea"
"checksum backtrace-sys 0.1.32 (registry+https://github.com/rust-lang/crates.io-index)" = "5d6575f128516de27e3ce99689419835fce9643a9b215a14d2b5b685be018491"
"checksum bindgen 0.50.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cb0e5a5f74b2bafe0b39379f616b5975e08bcaca4e779c078d5c31324147e9ba"
"checksum autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2"
"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
"checksum c2-chacha 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "214238caa1bf3a496ec3392968969cab8549f96ff30652c9e56885329315f6bb"
"checksum cc 1.0.46 (registry+https://github.com/rust-lang/crates.io-index)" = "0213d356d3c4ea2c18c40b037c3be23cd639825c18f25ee670ac7813beeef99c"
"checksum cexpr 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "a7fa24eb00d5ffab90eaeaf1092ac85c04c64aaf358ea6f84505b8116d24c6af"
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
"checksum clang-sys 0.28.1 (registry+https://github.com/rust-lang/crates.io-index)" = "81de550971c976f176130da4b2978d3b524eaa0fd9ac31f3ceb5ae1231fb4853"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum clicolors-control 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "90082ee5dcdd64dc4e9e0d37fbf3ee325419e39c0092191e0393df65518f741e"
"checksum console 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)" = "f5d540c2d34ac9dd0deb5f3b5f54c36c79efa78f6b3ad19106a554d07a7b5d9f"
"checksum crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c3aa945d63861bfe624b55d153a39684da1e8c0bc8fba932f7ee3a3c16cea3ca"
"checksum crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5064ebdbf05ce3cb95e45c8b086f72263f4166b29b97f6baff7ef7fe047b55ac"
"checksum crossbeam-queue 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c695eeca1e7173472a32221542ae469b3e9aac3a4fc81f7696bcad82029493db"
"checksum crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ce446db02cdc3165b94ae73111e570793400d0794e46125cc4056c81cbb039f4"
"checksum cslice 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "697c714f50560202b1f4e2e09cd50a421881c83e9025db75d15f276616f04f40"
"checksum env_logger 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "aafcde04e90a5226a6443b7aabdb016ba2f8307c847d524724bd9b346dd1a2d3"
"checksum failure 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "f8273f13c977665c5db7eb2b99ae520952fe5ac831ae4cd09d80c4c7042b5ed9"
"checksum fxhash 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
"checksum glob 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574"
"checksum humantime 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f"
"checksum either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
"checksum encode_unicode 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
"checksum getrandom 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)" = "7abc8dd8451921606d809ba32e95b6111925cd2906060d2dcc29c070220503eb"
"checksum hermit-abi 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "eff2656d88f158ce120947499e971d743c05dbcbed62e5bd2f38f1698bbc3772"
"checksum indicatif 0.13.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8572bccfb0665e70b7faf44ee28841b8e0823450cd4ad562a76b5a3c4bf48487"
"checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f"
"checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
"checksum libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)" = "1a31a0627fdf1f6a39ec0dd577e101440b7db22672c0901fe00a9a6fbb5c24e8"
"checksum libloading 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f2b111a074963af1d37a139918ac6d49ad1d0d5e47f72fd55388619691a7d753"
"checksum log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)" = "14b6052be84e6b71ab17edffc2eeabf5c2c3ae1fdb464aae35ac50c67a44e1f7"
"checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e"
"checksum memoffset 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "75189eb85871ea5c2e2c15abbdd541185f63b408415e5051f5cac122d8c774b9"
"checksum neon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "53b85accbbd250627f899a6fc1f220bbb4c8c2ff6dc71830dc6b752b39c2eb97"
"checksum neon-build 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ae406bf1065c4399e69d328a3bd8d4f088f2a205dc3881bf68c0ac775bfef337"
"checksum neon-runtime 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d8465ac4ed3f340dead85e053b75a5f639f48ac6343b3523eff90a751758eead"
"checksum neon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "8ae4cf3871ca5a395077e68144c1754e94e9e1e3329e7f8399d999ca573ed89a"
"checksum nom 4.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2ad2a91a8e869eeb30b9cb3119ae87773a8f4ae617f41b1eb9c154b2905f7bd6"
"checksum onig 5.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e4e723fc996fff1aeab8f62205f3e8528bf498bdd5eadb2784d2d31f30077947"
"checksum onig_sys 69.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0a8d4efbf5f59cece01f539305191485b651acb3785b9d5eef05749f0496514e"
"checksum peeking_take_while 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "19b17cddbe7ec3f8bc800887bab5e717348c95ea2ca0b1bf0837fb964dc67099"
"checksum pkg-config 0.3.16 (registry+https://github.com/rust-lang/crates.io-index)" = "72d5370d90f49f70bd033c3d75e87fc529fbfff9d6f7cccef07d6170079d91ea"
"checksum proc-macro2 0.4.30 (registry+https://github.com/rust-lang/crates.io-index)" = "cf3d2011ab5c909338f7887f4fc896d35932e29146c12c8d01da6b22a80ba759"
"checksum quick-error 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "9274b940887ce9addde99c4eee6b5c44cc494b182b97e73dc8ffdcb3397fd3f0"
"checksum quote 0.6.13 (registry+https://github.com/rust-lang/crates.io-index)" = "6ce23b6b870e8f94f81fb0a363d65d86675884b34a09043c81e5562f11c1f8e1"
"checksum num_cpus 1.11.1 (registry+https://github.com/rust-lang/crates.io-index)" = "76dac5ed2a876980778b8b85f75a71b6cbf0db0b1232ee12f826bccb00d09d72"
"checksum number_prefix 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
"checksum ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "74490b50b9fbe561ac330df47c08f3f33073d2d00c150f719147d7c54522fa1b"
"checksum rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3ae1b169243eaf61759b8475a998f0a385e42042370f3a7dbaf35246eacc8412"
"checksum rand_chacha 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "03a2a90da8c7523f554344f921aa97283eadf6ac484a6d2a7d0212fa7f8d6853"
"checksum rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
"checksum rand_hc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
"checksum rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098"
"checksum rayon-core 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9"
"checksum regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dc220bd33bdce8f093101afe22a037b8eb0e5af33592e6a9caafff0d4cb81cbd"
"checksum regex-syntax 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)" = "11a7e20d1cce64ef2fed88b66d347f88bd9babb82845b2b858f3edbf59a4f716"
"checksum rustc-demangle 0.1.16 (registry+https://github.com/rust-lang/crates.io-index)" = "4c691c0e608126e00913e33f0ccf3727d5fc84573623b8d65b2df340b5201783"
"checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
"checksum ryu 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "bfa8506c1de11c9c4e4c38863ccbe02a305c8188e85a05a784c9e11e1c3910c8"
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
"checksum shlex 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7fdf1b9db47230893d76faad238fd6097fd6d6a9245cd7a4d90dbd639536bbd2"
"checksum serde 1.0.104 (registry+https://github.com/rust-lang/crates.io-index)" = "414115f25f818d7dfccec8ee535d76949ae78584fc4f79a6f45a904bf8ab4449"
"checksum serde_json 1.0.44 (registry+https://github.com/rust-lang/crates.io-index)" = "48c575e0cc52bdd09b47f330f646cf59afc586e9c4e3ccd6fc1f625b8ea1dad7"
"checksum smallvec 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "44e59e0c9fa00817912ae6e4e6e3c4fe04455e75699d06eedc7d85917ed8e8f4"
"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
"checksum termcolor 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "96d6098003bde162e4277c70665bd87c326f5a0c3f3fbfb285787fa482d54e6e"
"checksum termios 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "72b620c5ea021d75a735c943269bb07d30c9b77d6ac6b236bc8b5c496ef05625"
"checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b"
"checksum unicode-normalization-alignments 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de"
"checksum unicode-width 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7007dbd421b92cc6e28410fe7362e2e0a2503394908f417b68ec8d1c364c4e20"
"checksum unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc"
"checksum unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a"
"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd"
"checksum which 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b57acb10231b9493c8472b20cb57317d0679a49e0bdbee44b3b803a6473af164"
"checksum wasi 0.9.0+wasi-snapshot-preview1 (registry+https://github.com/rust-lang/crates.io-index)" = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
"checksum winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
"checksum winapi-util 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7168bab6e1daee33b4557efd0e95d5ca70a03706d39fa5f3fe7a236f584b03c9"
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
"checksum wincolor 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "96f5016b18804d24db43cebf3c77269e7569b8954a8464501c216cc5e070eaa9"

View File

@ -15,4 +15,4 @@ neon-build = "0.3.3"
[dependencies]
neon = "0.3.3"
tokenizers-lib = { path = "../../../tokenizers" }
tokenizers = { path = "../../../tokenizers" }

View File

@ -1,27 +1,17 @@
#[macro_use]
extern crate neon;
extern crate tokenizers as tk;
mod models;
mod tokenizer;
mod utils;
use neon::prelude::*;
fn tokenize(mut cx: FunctionContext) -> JsResult<JsArray> {
let s = cx.argument::<JsString>(0)?.value();
println!("Tokenizing in rust: {:?}", s);
let result = tk::WhitespaceTokenizer::tokenize(&s);
let js_array = JsArray::new(&mut cx, result.len() as u32);
for (i, token) in result.iter().enumerate() {
let n = cx.string(token);
js_array.set(&mut cx, i as u32, n)?;
}
Ok(js_array)
}
register_module!(mut m, {
// Tokenizer
tokenizer::register(&mut m, "tokenizer")?;
// Models
models::register(&mut m, "models")?;
register_module!(mut cx, {
let whitespace_tokenizer = JsObject::new(&mut cx);
let tokenize = JsFunction::new(&mut cx, tokenize).unwrap();
whitespace_tokenizer
.set(&mut cx, "tokenize", tokenize)
.unwrap();
cx.export_value("WhitespaceTokenizer", whitespace_tokenizer);
Ok(())
});

View File

@ -0,0 +1,97 @@
extern crate tokenizers as tk;
use crate::utils::Container;
use neon::prelude::*;
/// Model
pub struct Model {
pub model: Container<dyn tk::tokenizer::Model + Sync>,
}
declare_types! {
pub class JsModel for Model {
init(_) {
// This should not be called from JS
Ok(Model {
model: Container::Empty
})
}
}
}
/// create_BPE(vocab: String, merges: String, options?: {
/// cache_capacity?: number,
/// dropout?: number,
/// unk_token?: String,
/// continuing_subword_prefix?: String,
/// end_of_word_suffix?: String
/// })
pub fn create_bpe_from_files(mut cx: FunctionContext) -> JsResult<JsModel> {
let vocab = cx.argument::<JsString>(0)?.value() as String;
let merges = cx.argument::<JsString>(1)?.value() as String;
let options = cx.argument_opt(2);
let mut model = JsModel::new::<_, JsModel, _>(&mut cx, vec![])?;
let mut builder = tk::models::bpe::BPE::from_files(&vocab, &merges)
.or_else(|e| cx.throw_error(format!("{}", e)))?;
if let Some(options) = options {
if let Ok(options) = options.downcast::<JsObject>() {
if let Ok(cache_capacity) = options.get(&mut cx, "cache_capacity") {
let cache_capacity = cache_capacity
.downcast::<JsNumber>()
.or_throw(&mut cx)?
.value() as usize;
builder = builder.cache_capacity(cache_capacity);
}
if let Ok(dropout) = options.get(&mut cx, "dropout") {
let dropout = dropout.downcast::<JsNumber>().or_throw(&mut cx)?.value() as f32;
builder = builder.dropout(dropout);
}
if let Ok(unk_token) = options.get(&mut cx, "unk_token") {
let unk_token =
unk_token.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
builder = builder.unk_token(unk_token);
}
if let Ok(prefix) = options.get(&mut cx, "continuing_subword_prefix") {
let prefix = prefix.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
builder = builder.continuing_subword_prefix(prefix);
}
if let Ok(suffix) = options.get(&mut cx, "end_of_word_suffix") {
let suffix = suffix.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
builder = builder.end_of_word_suffix(suffix);
}
}
}
match builder.build() {
Ok(bpe) => {
let guard = cx.lock();
model.borrow_mut(&guard).model.to_owned(Box::new(bpe));
}
Err(e) => return cx.throw_error(format!("{}", e)),
};
Ok(model)
}
/// create_bpe_empty()
pub fn create_bpe_empty(mut cx: FunctionContext) -> JsResult<JsModel> {
let mut model = JsModel::new::<_, JsModel, _>(&mut cx, vec![])?;
let bpe = tk::models::bpe::BPE::default();
let guard = cx.lock();
model.borrow_mut(&guard).model.to_owned(Box::new(bpe));
Ok(model)
}
/// Register everything here
pub fn register(m: &mut ModuleContext, prefix: &str) -> Result<(), neon::result::Throw> {
m.export_function(
&format!("{}_create_BPE_from_files", prefix),
create_bpe_from_files,
)?;
m.export_function(&format!("{}_create_BPE_empty", prefix), create_bpe_empty)?;
Ok(())
}

View File

@ -0,0 +1,54 @@
extern crate tokenizers as tk;
use crate::models::*;
use neon::prelude::*;
/// Tokenizer
pub struct Tokenizer {
tokenizer: tk::tokenizer::Tokenizer,
}
declare_types! {
pub class JsTokenizer for Tokenizer {
// Create
init(mut cx) {
let mut model = cx.argument::<JsModel>(0)?;
if let Some(instance) = {
let guard = cx.lock();
let mut model = model.borrow_mut(&guard);
model.model.to_pointer()
} {
Ok(Tokenizer {
tokenizer: tk::tokenizer::Tokenizer::new(instance)
})
} else {
cx.throw_error("The Model is already being used in another Tokenizer")
}
}
method with_model(mut cx) {
let mut model = cx.argument::<JsModel>(0)?;
if let Some(instance) = {
let guard = cx.lock();
let mut model = model.borrow_mut(&guard);
model.model.to_pointer()
} {
let mut this = cx.this();
{
let guard = cx.lock();
let mut tokenizer = this.borrow_mut(&guard);
tokenizer.tokenizer.with_model(instance);
}
Ok(cx.undefined().upcast())
} else {
cx.throw_error("The Model is already being used in another Tokenizer")
}
}
}
}
pub fn register(m: &mut ModuleContext, prefix: &str) -> Result<(), neon::result::Throw> {
m.export_class::<JsTokenizer>(&format!("{}_Tokenizer", prefix))?;
Ok(())
}

View File

@ -0,0 +1,77 @@
/// A Container type
///
/// Provides an interface to allow transfer of ownership between Python and Rust.
/// It either contains a Box with full ownership of the content, or a pointer to the content.
///
/// The main goal here is to allow Python calling into Rust to initialize some objects. Later
/// these objects may need to be used by Rust who will expect to take ownership. Since Python
/// does not allow any sort of ownership transfer, it will keep a reference to this object
/// until it gets cleaned up by the GC. In this case, we actually give the ownership to Rust,
/// and just keep a pointer in the Python object.
pub enum Container<T: ?Sized> {
Owned(Box<T>),
Pointer(*mut T),
Empty,
}
impl<T> Container<T>
where
T: ?Sized,
{
pub fn from_ref(reference: &Box<T>) -> Self {
let content: *const T = &**reference;
Container::Pointer(content as *mut _)
}
/// Consumes ourself and return the Boxed element if we have the ownership, None otherwise.
pub fn take(self) -> Option<Box<T>> {
match self {
Container::Owned(obj) => Some(obj),
Container::Pointer(_) => None,
Container::Empty => None,
}
}
/// Replace an empty content by the new provided owned one, otherwise do nothing
pub fn to_owned(&mut self, o: Box<T>) {
if let Container::Empty = self {
unsafe {
let new_container = Container::Owned(o);
std::ptr::write(self, new_container);
}
}
}
/// Return the owned T, keeping a Pointer to it if we currently own it. None otherwise
pub fn to_pointer(&mut self) -> Option<Box<T>> {
if let Container::Owned(_) = self {
unsafe {
let old_container = std::ptr::read(self);
let ptr = Box::into_raw(old_container.take().unwrap());
let new_container = Container::Pointer(ptr);
std::ptr::write(self, new_container);
Some(Box::from_raw(ptr))
}
} else {
None
}
}
pub fn execute<F, U>(&self, closure: F) -> U
where
F: FnOnce(Option<&Box<T>>) -> U,
{
match self {
Container::Owned(val) => closure(Some(val)),
Container::Pointer(ptr) => unsafe {
let val = Box::from_raw(*ptr);
let res = closure(Some(&val));
// We call this to make sure we don't drop the Box
Box::into_raw(val);
res
},
Container::Empty => closure(None),
}
}
}