mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Adding unstable_wasm
feature + example to run tokenizers
on wasm. (#1009)
* Adding `unstable_wasm` feature + example to run `tokenizers` on wasm. Co-Authored-By: josephrocca <1167575+josephrocca@users.noreply.github.com> Co-Authored-By: Matthias Brunel <matthias.brunel@mithrilsecurity.io> * Adding some serialization tests. * Updating with comments. Co-authored-by: josephrocca <1167575+josephrocca@users.noreply.github.com> Co-authored-by: Matthias Brunel <matthias.brunel@mithrilsecurity.io>
This commit is contained in:
87
bindings/node/native/Cargo.lock
generated
87
bindings/node/native/Cargo.lock
generated
@ -162,7 +162,7 @@ dependencies = [
|
||||
"glob",
|
||||
"indicatif 0.16.2",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"rand",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@ -439,9 +439,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "esaxx-rs"
|
||||
version = "0.1.7"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f4617b351b734e97a3dd32022a721471349aa3038d4132beee8568cdfa7e716"
|
||||
checksum = "1f748b253ceca9fed5f42f8b5ceb3851e93102199bc25b64b65369f76e5c0a35"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
@ -586,24 +586,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.1.16"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
|
||||
checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"libc",
|
||||
"wasi 0.9.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d39cd93900197114fa1fcb7ae84ca742095eed9442088988ae74fa744e930e77"
|
||||
dependencies = [
|
||||
"cfg-if 1.0.0",
|
||||
"libc",
|
||||
"wasi 0.10.2+wasi-snapshot-preview1",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -843,9 +832,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.119"
|
||||
version = "0.2.126"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1bf2e165bb3457c8e098ea76f3e3bc9db55f87aa90d52d0e6be741470916aaa4"
|
||||
checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
@ -1251,19 +1240,6 @@ version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
"libc",
|
||||
"rand_chacha 0.2.2",
|
||||
"rand_core 0.5.1",
|
||||
"rand_hc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
@ -1271,18 +1247,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha 0.3.1",
|
||||
"rand_core 0.6.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.5.1",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1292,16 +1258,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.6.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1310,16 +1267,7 @@ version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
|
||||
dependencies = [
|
||||
"getrandom 0.2.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
|
||||
dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1373,7 +1321,7 @@ version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "528532f3d801c87aec9def2add9ca802fe569e44a544afe633765267840abe64"
|
||||
dependencies = [
|
||||
"getrandom 0.2.5",
|
||||
"getrandom",
|
||||
"redox_syscall",
|
||||
]
|
||||
|
||||
@ -1727,6 +1675,7 @@ dependencies = [
|
||||
"derive_builder",
|
||||
"dirs",
|
||||
"esaxx-rs",
|
||||
"getrandom",
|
||||
"indicatif 0.15.0",
|
||||
"itertools 0.9.0",
|
||||
"lazy_static",
|
||||
@ -1734,7 +1683,7 @@ dependencies = [
|
||||
"macro_rules_attribute",
|
||||
"onig",
|
||||
"paste",
|
||||
"rand 0.7.3",
|
||||
"rand",
|
||||
"rayon",
|
||||
"rayon-cond",
|
||||
"regex",
|
||||
@ -1915,12 +1864,6 @@ dependencies = [
|
||||
"try-lock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.9.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.10.2+wasi-snapshot-preview1"
|
||||
|
81
bindings/python/Cargo.lock
generated
81
bindings/python/Cargo.lock
generated
@ -138,7 +138,7 @@ dependencies = [
|
||||
"glob",
|
||||
"indicatif 0.16.2",
|
||||
"log",
|
||||
"rand 0.8.5",
|
||||
"rand",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
@ -407,9 +407,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "esaxx-rs"
|
||||
version = "0.1.7"
|
||||
version = "0.1.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f4617b351b734e97a3dd32022a721471349aa3038d4132beee8568cdfa7e716"
|
||||
checksum = "1f748b253ceca9fed5f42f8b5ceb3851e93102199bc25b64b65369f76e5c0a35"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
@ -554,20 +554,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.1.16"
|
||||
version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi 0.9.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d39cd93900197114fa1fcb7ae84ca742095eed9442088988ae74fa744e930e77"
|
||||
checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
@ -1288,19 +1277,6 @@ version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "941ba9d78d8e2f7ce474c015eea4d9c6d25b6a3327f9832ee29a4de27f91bbb8"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
"libc",
|
||||
"rand_chacha 0.2.2",
|
||||
"rand_core 0.5.1",
|
||||
"rand_hc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
@ -1308,18 +1284,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha 0.3.1",
|
||||
"rand_core 0.6.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.5.1",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1329,16 +1295,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.6.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1347,16 +1304,7 @@ version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
|
||||
dependencies = [
|
||||
"getrandom 0.2.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
|
||||
dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -1416,7 +1364,7 @@ version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7776223e2696f1aa4c6b0170e83212f47296a00424305117d013dfe86fb0fe55"
|
||||
dependencies = [
|
||||
"getrandom 0.2.5",
|
||||
"getrandom",
|
||||
"redox_syscall",
|
||||
"thiserror",
|
||||
]
|
||||
@ -1759,6 +1707,7 @@ dependencies = [
|
||||
"derive_builder",
|
||||
"dirs",
|
||||
"esaxx-rs",
|
||||
"getrandom",
|
||||
"indicatif 0.15.0",
|
||||
"itertools 0.9.0",
|
||||
"lazy_static",
|
||||
@ -1766,7 +1715,7 @@ dependencies = [
|
||||
"macro_rules_attribute",
|
||||
"onig",
|
||||
"paste",
|
||||
"rand 0.7.3",
|
||||
"rand",
|
||||
"rayon",
|
||||
"rayon-cond",
|
||||
"regex",
|
||||
@ -1971,12 +1920,6 @@ dependencies = [
|
||||
"try-lock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.9.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.10.2+wasi-snapshot-preview1"
|
||||
|
@ -36,8 +36,8 @@ harness = false
|
||||
|
||||
[dependencies]
|
||||
lazy_static = "1.4"
|
||||
rand = "0.7"
|
||||
onig = { version = "6.0", default-features = false }
|
||||
rand = "0.8"
|
||||
onig = { version = "6.0", default-features = false, optional = true }
|
||||
regex = "1.3"
|
||||
regex-syntax = "0.6"
|
||||
rayon = "1.3"
|
||||
@ -51,7 +51,6 @@ unicode-segmentation = "1.6"
|
||||
indicatif = {version = "0.15", optional = true}
|
||||
itertools = "0.9"
|
||||
log = "0.4"
|
||||
esaxx-rs = "0.1"
|
||||
derive_builder = "0.9"
|
||||
spm_precompiled = "0.1"
|
||||
dirs = "3.0"
|
||||
@ -61,12 +60,17 @@ aho-corasick = "0.7"
|
||||
paste = "1.0.6"
|
||||
macro_rules_attribute = "0.0.2"
|
||||
thiserror = "1.0.30"
|
||||
fancy-regex = { version = "0.10", optional = true}
|
||||
getrandom = { version = "0.2.6" }
|
||||
esaxx-rs = { version = "0.1", default-features = false, features=[]}
|
||||
|
||||
[features]
|
||||
default = ["progressbar", "http", "cli"]
|
||||
default = ["progressbar", "http", "cli", "onig", "esaxx_fast"]
|
||||
esaxx_fast = ["esaxx-rs/cpp"]
|
||||
progressbar = ["indicatif"]
|
||||
http = ["reqwest", "cached-path"]
|
||||
cli = ["clap"]
|
||||
unstable_wasm = ["fancy-regex", "getrandom/js"]
|
||||
|
||||
[dev-dependencies]
|
||||
criterion = "0.3"
|
||||
|
6
tokenizers/examples/unstable_wasm/.gitignore
vendored
Normal file
6
tokenizers/examples/unstable_wasm/.gitignore
vendored
Normal file
@ -0,0 +1,6 @@
|
||||
/target
|
||||
**/*.rs.bk
|
||||
Cargo.lock
|
||||
bin/
|
||||
pkg/
|
||||
wasm-pack.log
|
36
tokenizers/examples/unstable_wasm/Cargo.toml
Normal file
36
tokenizers/examples/unstable_wasm/Cargo.toml
Normal file
@ -0,0 +1,36 @@
|
||||
[package]
|
||||
name = "unstable_wasm"
|
||||
version = "0.1.0"
|
||||
authors = ["Nicolas Patry"]
|
||||
edition = "2018"
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib", "rlib"]
|
||||
|
||||
[features]
|
||||
default = ["console_error_panic_hook"]
|
||||
|
||||
[dependencies]
|
||||
wasm-bindgen = "0.2.63"
|
||||
|
||||
# The `console_error_panic_hook` crate provides better debugging of panics by
|
||||
# logging them with `console.error`. This is great for development, but requires
|
||||
# all the `std::fmt` and `std::panicking` infrastructure, so isn't great for
|
||||
# code size when deploying.
|
||||
console_error_panic_hook = { version = "0.1.6", optional = true }
|
||||
|
||||
# `wee_alloc` is a tiny allocator for wasm that is only ~1K in code size
|
||||
# compared to the default allocator's ~10K. It is slower than the default
|
||||
# allocator, however.
|
||||
#
|
||||
# Unfortunately, `wee_alloc` requires nightly Rust when targeting wasm for now.
|
||||
wee_alloc = { version = "0.4.5", optional = true }
|
||||
|
||||
tokenizers = { path = "../../", default-features=false, features = ["unstable_wasm"]}
|
||||
|
||||
[dev-dependencies]
|
||||
wasm-bindgen-test = "0.3.13"
|
||||
|
||||
[profile.release]
|
||||
# Tell `rustc` to optimize for small code size.
|
||||
opt-level = "s"
|
73
tokenizers/examples/unstable_wasm/README.md
Normal file
73
tokenizers/examples/unstable_wasm/README.md
Normal file
@ -0,0 +1,73 @@
|
||||
<div align="center">
|
||||
|
||||
<h1><code>wasm-pack-template</code></h1>
|
||||
|
||||
<strong>A template for kick starting a Rust and WebAssembly project using <a href="https://github.com/rustwasm/wasm-pack">wasm-pack</a>.</strong>
|
||||
|
||||
<p>
|
||||
<a href="https://travis-ci.org/rustwasm/wasm-pack-template"><img src="https://img.shields.io/travis/rustwasm/wasm-pack-template.svg?style=flat-square" alt="Build Status" /></a>
|
||||
</p>
|
||||
|
||||
<h3>
|
||||
<a href="https://rustwasm.github.io/docs/wasm-pack/tutorials/npm-browser-packages/index.html">Tutorial</a>
|
||||
<span> | </span>
|
||||
<a href="https://discordapp.com/channels/442252698964721669/443151097398296587">Chat</a>
|
||||
</h3>
|
||||
|
||||
<sub>Built with 🦀🕸 by <a href="https://rustwasm.github.io/">The Rust and WebAssembly Working Group</a></sub>
|
||||
</div>
|
||||
|
||||
## About
|
||||
|
||||
|
||||
This is an example project showing a very basic way to use `tokenizers` compiled
|
||||
to `wasm`.
|
||||
|
||||
[**📚 Read this template tutorial! 📚**][template-docs]
|
||||
|
||||
This template is designed for compiling Rust libraries into WebAssembly and
|
||||
publishing the resulting package to NPM.
|
||||
|
||||
Be sure to check out [other `wasm-pack` tutorials online][tutorials] for other
|
||||
templates and usages of `wasm-pack`.
|
||||
|
||||
[tutorials]: https://rustwasm.github.io/docs/wasm-pack/tutorials/index.html
|
||||
[template-docs]: https://rustwasm.github.io/docs/wasm-pack/tutorials/npm-browser-packages/index.html
|
||||
|
||||
## 🚴 Usage
|
||||
|
||||
### 🐑 Use `cargo generate` to Clone this Template
|
||||
|
||||
[Learn more about `cargo generate` here.](https://github.com/ashleygwilliams/cargo-generate)
|
||||
|
||||
```
|
||||
cargo generate --git https://github.com/rustwasm/wasm-pack-template.git --name my-project
|
||||
cd my-project
|
||||
```
|
||||
|
||||
### 🛠️ Build with `wasm-pack build`
|
||||
|
||||
```
|
||||
wasm-pack build
|
||||
```
|
||||
|
||||
### 🔬 Test in Headless Browsers with `wasm-pack test`
|
||||
|
||||
```
|
||||
wasm-pack test --headless --firefox
|
||||
```
|
||||
|
||||
### 🎁 Publish to NPM with `wasm-pack publish`
|
||||
|
||||
```
|
||||
wasm-pack publish
|
||||
```
|
||||
|
||||
## 🔋 Batteries Included
|
||||
|
||||
* [`wasm-bindgen`](https://github.com/rustwasm/wasm-bindgen) for communicating
|
||||
between WebAssembly and JavaScript.
|
||||
* [`console_error_panic_hook`](https://github.com/rustwasm/console_error_panic_hook)
|
||||
for logging panic messages to the developer console.
|
||||
* [`wee_alloc`](https://github.com/rustwasm/wee_alloc), an allocator optimized
|
||||
for small code size.
|
44
tokenizers/examples/unstable_wasm/src/lib.rs
Normal file
44
tokenizers/examples/unstable_wasm/src/lib.rs
Normal file
@ -0,0 +1,44 @@
|
||||
mod utils;
|
||||
use tokenizers::models::bpe::{Vocab, BPE};
|
||||
use tokenizers::Tokenizer;
|
||||
|
||||
use wasm_bindgen::prelude::*;
|
||||
|
||||
// When the `wee_alloc` feature is enabled, use `wee_alloc` as the global
|
||||
// allocator.
|
||||
#[cfg(feature = "wee_alloc")]
|
||||
#[global_allocator]
|
||||
static ALLOC: wee_alloc::WeeAlloc = wee_alloc::WeeAlloc::INIT;
|
||||
|
||||
/// Tokenizes `string` with a tiny, hard-coded BPE model and returns the token ids.
///
/// The vocabulary, the merge rules and the tokenizer itself are rebuilt on every
/// call, so this is a demo entry point rather than a reusable tokenizer handle.
///
/// # Panics
///
/// Panics if building the BPE model or encoding `string` fails, because both
/// results are unwrapped.
///
/// NOTE(review): the unknown token "[UNK]" configured below is not an entry of
/// the vocabulary defined here. Confirm whether input containing pieces outside
/// that vocabulary makes `encode` return an error (and therefore panic at
/// `unwrap()`) instead of producing an unknown-token id.
#[wasm_bindgen]
|
||||
pub fn tokenize(string: &str) -> Vec<u32> {
|
||||
// Token -> id table. Entries starting with "##" use the continuing-subword
// prefix that is configured on the builder further down.
let vocab: Vocab = vec![
|
||||
("a".to_string(), 0),
|
||||
("##b".to_string(), 1),
|
||||
("##c".to_string(), 2),
|
||||
("ab".to_string(), 3),
|
||||
("abc".to_string(), 4),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
// Merge rules as (left, right) pairs: "a" + "##b", then "ab" + "##c".
// The merged forms "ab" and "abc" are present in the vocabulary above.
let merges = vec![
|
||||
("a".to_string(), "##b".to_string()),
|
||||
("ab".to_string(), "##c".to_string()),
|
||||
];
|
||||
|
||||
// Assemble the model; `build()` returns a result that is unwrapped here.
let bpe = BPE::builder()
|
||||
.vocab_and_merges(vocab, merges)
|
||||
.unk_token("[UNK]".to_string())
|
||||
.continuing_subword_prefix("##".to_string())
|
||||
.build()
|
||||
.unwrap();
|
||||
let tokenizer = Tokenizer::new(bpe);
|
||||
// The `false` argument is presumably `add_special_tokens` — TODO confirm
// against the `Tokenizer::encode` signature.
tokenizer
|
||||
.encode(string, false)
|
||||
.unwrap()
|
||||
.get_ids()
|
||||
// Copy the ids out of the encoding into an owned `Vec` for the caller.
.into_iter()
|
||||
.cloned()
|
||||
.collect()
|
||||
}
|
10
tokenizers/examples/unstable_wasm/src/utils.rs
Normal file
10
tokenizers/examples/unstable_wasm/src/utils.rs
Normal file
@ -0,0 +1,10 @@
|
||||
/// Installs the `console_error_panic_hook` panic hook when that feature is enabled.
///
/// With the `console_error_panic_hook` feature disabled the body compiles to
/// nothing, so calling this function is a no-op.
pub fn set_panic_hook() {
|
||||
// When the `console_error_panic_hook` feature is enabled, we can call the
|
||||
// `set_panic_hook` function at least once during initialization, and then
|
||||
// we will get better error messages if our code ever panics.
|
||||
//
|
||||
// For more details see
|
||||
// https://github.com/rustwasm/console_error_panic_hook#readme
|
||||
#[cfg(feature = "console_error_panic_hook")]
|
||||
console_error_panic_hook::set_once();
|
||||
}
|
13
tokenizers/examples/unstable_wasm/tests/web.rs
Normal file
13
tokenizers/examples/unstable_wasm/tests/web.rs
Normal file
@ -0,0 +1,13 @@
|
||||
//! Test suite for the Web and headless browsers.
|
||||
|
||||
#![cfg(target_arch = "wasm32")]
|
||||
|
||||
extern crate wasm_bindgen_test;
|
||||
use wasm_bindgen_test::*;
|
||||
|
||||
wasm_bindgen_test_configure!(run_in_browser);
|
||||
|
||||
/// Placeholder test: it only asserts `1 + 1 == 2`, so it checks that the
/// `wasm_bindgen_test` harness runs but does not exercise any crate code.
#[wasm_bindgen_test]
|
||||
fn pass() {
|
||||
assert_eq!(1 + 1, 2);
|
||||
}
|
24
tokenizers/examples/unstable_wasm/www/.bin/create-wasm-app.js
Executable file
24
tokenizers/examples/unstable_wasm/www/.bin/create-wasm-app.js
Executable file
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
const { spawn } = require("child_process");
|
||||
const fs = require("fs");
|
||||
|
||||
// Clones the rustwasm `create-wasm-app` template with `git clone`.
// The target directory defaults to the current directory and can be overridden
// by the first command-line argument.
let folderName = '.';
|
||||
|
||||
// argv[0] is the node binary and argv[1] the script path, so argv[2] is the
// first user-supplied argument.
if (process.argv.length >= 3) {
|
||||
folderName = process.argv[2];
|
||||
// Create the target folder when it does not exist yet.
// NOTE(review): `mkdirSync` is called without `{ recursive: true }`, so a
// nested path with missing parents presumably throws — confirm if that matters.
if (!fs.existsSync(folderName)) {
|
||||
fs.mkdirSync(folderName);
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): no 'error' listener is attached to the child process, so a
// failure to start `git` (e.g. binary not installed) is presumably reported as
// an unhandled 'error' event rather than through the 'close' handler — confirm.
const clone = spawn("git", ["clone", "https://github.com/rustwasm/create-wasm-app.git", folderName]);
|
||||
|
||||
// Report the outcome once git has finished: propagate a non-zero exit code,
// otherwise print the success banner.
clone.on("close", code => {
|
||||
if (code !== 0) {
|
||||
console.error("cloning the template failed!")
|
||||
process.exit(code);
|
||||
} else {
|
||||
console.log("🦀 Rust + 🕸 Wasm = ❤");
|
||||
}
|
||||
});
|
2
tokenizers/examples/unstable_wasm/www/.gitignore
vendored
Normal file
2
tokenizers/examples/unstable_wasm/www/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
node_modules
|
||||
dist
|
5
tokenizers/examples/unstable_wasm/www/.travis.yml
Normal file
5
tokenizers/examples/unstable_wasm/www/.travis.yml
Normal file
@ -0,0 +1,5 @@
|
||||
language: node_js
|
||||
node_js: "10"
|
||||
|
||||
script:
|
||||
- ./node_modules/.bin/webpack
|
201
tokenizers/examples/unstable_wasm/www/LICENSE-APACHE
Normal file
201
tokenizers/examples/unstable_wasm/www/LICENSE-APACHE
Normal file
@ -0,0 +1,201 @@
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
25
tokenizers/examples/unstable_wasm/www/LICENSE-MIT
Normal file
25
tokenizers/examples/unstable_wasm/www/LICENSE-MIT
Normal file
@ -0,0 +1,25 @@
|
||||
Copyright (c) [year] [name]
|
||||
|
||||
Permission is hereby granted, free of charge, to any
|
||||
person obtaining a copy of this software and associated
|
||||
documentation files (the "Software"), to deal in the
|
||||
Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge,
|
||||
publish, distribute, sublicense, and/or sell copies of
|
||||
the Software, and to permit persons to whom the Software
|
||||
is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice
|
||||
shall be included in all copies or substantial portions
|
||||
of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
||||
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
||||
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
||||
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
||||
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
||||
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
67
tokenizers/examples/unstable_wasm/www/README.md
Normal file
67
tokenizers/examples/unstable_wasm/www/README.md
Normal file
@ -0,0 +1,67 @@
|
||||
<div align="center">
|
||||
|
||||
<h1><code>create-wasm-app</code></h1>
|
||||
|
||||
<strong>An <code>npm init</code> template for kick starting a project that uses NPM packages containing Rust-generated WebAssembly and bundles them with Webpack.</strong>
|
||||
|
||||
<p>
|
||||
<a href="https://travis-ci.org/rustwasm/create-wasm-app"><img src="https://img.shields.io/travis/rustwasm/create-wasm-app.svg?style=flat-square" alt="Build Status" /></a>
|
||||
</p>
|
||||
|
||||
<h3>
|
||||
<a href="#usage">Usage</a>
|
||||
<span> | </span>
|
||||
<a href="https://discordapp.com/channels/442252698964721669/443151097398296587">Chat</a>
|
||||
</h3>
|
||||
|
||||
<sub>Built with 🦀🕸 by <a href="https://rustwasm.github.io/">The Rust and WebAssembly Working Group</a></sub>
|
||||
</div>
|
||||
|
||||
## About
|
||||
|
||||
This template is designed for depending on NPM packages that contain
|
||||
Rust-generated WebAssembly and using them to create a Website.
|
||||
|
||||
* Want to create an NPM package with Rust and WebAssembly? [Check out
|
||||
`wasm-pack-template`.](https://github.com/rustwasm/wasm-pack-template)
|
||||
* Want to make a monorepo-style Website without publishing to NPM? Check out
|
||||
[`rust-webpack-template`](https://github.com/rustwasm/rust-webpack-template)
|
||||
and/or
|
||||
[`rust-parcel-template`](https://github.com/rustwasm/rust-parcel-template).
|
||||
|
||||
## 🚴 Usage
|
||||
|
||||
```
|
||||
npm init wasm-app
|
||||
```
|
||||
|
||||
## 🔋 Batteries Included
|
||||
|
||||
- `.gitignore`: ignores `node_modules`
|
||||
- `LICENSE-APACHE` and `LICENSE-MIT`: most Rust projects are licensed this way, so these are included for you
|
||||
- `README.md`: the file you are reading now!
|
||||
- `index.html`: a bare bones html document that includes the webpack bundle
|
||||
- `index.js`: example js file with a comment showing how to import and use a wasm pkg
|
||||
- `package.json` and `package-lock.json`:
|
||||
- pulls in devDependencies for using webpack:
|
||||
- [`webpack`](https://www.npmjs.com/package/webpack)
|
||||
- [`webpack-cli`](https://www.npmjs.com/package/webpack-cli)
|
||||
- [`webpack-dev-server`](https://www.npmjs.com/package/webpack-dev-server)
|
||||
- defines a `start` script to run `webpack-dev-server`
|
||||
- `webpack.config.js`: configuration file for bundling your js with webpack
|
||||
|
||||
## License
|
||||
|
||||
Licensed under either of
|
||||
|
||||
* Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0)
|
||||
* MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT)
|
||||
|
||||
at your option.
|
||||
|
||||
### Contribution
|
||||
|
||||
Unless you explicitly state otherwise, any contribution intentionally
|
||||
submitted for inclusion in the work by you, as defined in the Apache-2.0
|
||||
license, shall be dual licensed as above, without any additional terms or
|
||||
conditions.
|
5
tokenizers/examples/unstable_wasm/www/bootstrap.js
vendored
Normal file
5
tokenizers/examples/unstable_wasm/www/bootstrap.js
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
// A dependency graph that contains any wasm must all be imported
|
||||
// asynchronously. This `bootstrap.js` file does the single async import, so
|
||||
// that no one else needs to worry about it again.
|
||||
import("./index.js")
|
||||
.catch(e => console.error("Error importing `index.js`:", e));
|
11
tokenizers/examples/unstable_wasm/www/index.html
Normal file
11
tokenizers/examples/unstable_wasm/www/index.html
Normal file
@ -0,0 +1,11 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Hello wasm-pack!</title>
|
||||
</head>
|
||||
<body>
|
||||
<noscript>This page contains webassembly and javascript content, please enable javascript in your browser.</noscript>
|
||||
<script src="./bootstrap.js"></script>
|
||||
</body>
|
||||
</html>
|
4
tokenizers/examples/unstable_wasm/www/index.js
Normal file
4
tokenizers/examples/unstable_wasm/www/index.js
Normal file
@ -0,0 +1,4 @@
|
||||
import * as wasm from "unstable_wasm";
|
||||
|
||||
console.log(wasm.tokenize("ab"));
|
||||
console.log(wasm.tokenize("abc"));
|
12130
tokenizers/examples/unstable_wasm/www/package-lock.json
generated
Normal file
12130
tokenizers/examples/unstable_wasm/www/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
33
tokenizers/examples/unstable_wasm/www/package.json
Normal file
33
tokenizers/examples/unstable_wasm/www/package.json
Normal file
@ -0,0 +1,33 @@
|
||||
{
|
||||
"name": "create-wasm-app",
|
||||
"version": "0.1.0",
|
||||
"description": "create an app to consume rust-generated wasm packages",
|
||||
"main": "index.js",
|
||||
"bin": {
|
||||
"create-wasm-app": ".bin/create-wasm-app.js"
|
||||
},
|
||||
"scripts": {
|
||||
"build": "webpack --config webpack.config.js",
|
||||
"start": "NODE_OPTIONS=--openssl-legacy-provider webpack-dev-server"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "git+https://github.com/rustwasm/create-wasm-app.git"
|
||||
},
|
||||
"keywords": ["webassembly", "wasm", "rust", "webpack"],
|
||||
"author": "Ashley Williams <ashley666ashley@gmail.com>",
|
||||
"license": "(MIT OR Apache-2.0)",
|
||||
"bugs": {
|
||||
"url": "https://github.com/rustwasm/create-wasm-app/issues"
|
||||
},
|
||||
"homepage": "https://github.com/rustwasm/create-wasm-app#readme",
|
||||
"devDependencies": {
|
||||
"copy-webpack-plugin": "^5.0.0",
|
||||
"webpack": "^4.29.3",
|
||||
"webpack-cli": "^3.1.0",
|
||||
"webpack-dev-server": "^3.1.5"
|
||||
},
|
||||
"dependencies": {
|
||||
"unstable_wasm": "file:../pkg"
|
||||
}
|
||||
}
|
14
tokenizers/examples/unstable_wasm/www/webpack.config.js
Normal file
14
tokenizers/examples/unstable_wasm/www/webpack.config.js
Normal file
@ -0,0 +1,14 @@
|
||||
const CopyWebpackPlugin = require("copy-webpack-plugin");
|
||||
const path = require('path');
|
||||
|
||||
module.exports = {
|
||||
entry: "./bootstrap.js",
|
||||
output: {
|
||||
path: path.resolve(__dirname, "dist"),
|
||||
filename: "bootstrap.js",
|
||||
},
|
||||
mode: "development",
|
||||
plugins: [
|
||||
new CopyWebpackPlugin(['index.html'])
|
||||
],
|
||||
};
|
@ -209,7 +209,10 @@ impl UnigramTrainer {
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(feature = "esaxx_fast")]
|
||||
let suffix = esaxx_rs::suffix(&flat_string).unwrap();
|
||||
#[cfg(not(feature = "esaxx_fast"))]
|
||||
let suffix = esaxx_rs::suffix_rs(&flat_string).unwrap();
|
||||
|
||||
// Basic chars need to be in sentence pieces.
|
||||
let mut seed_sentencepieces: Vec<SentencePiece> = vec![];
|
||||
|
@ -1,5 +1,5 @@
|
||||
use crate::tokenizer::{NormalizedString, Normalizer, Result};
|
||||
use onig::Regex;
|
||||
use crate::utils::SysRegex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Represents the different patterns that `Replace` can use
|
||||
@ -46,7 +46,7 @@ pub struct Replace {
|
||||
pattern: ReplacePattern,
|
||||
content: String,
|
||||
#[serde(skip)]
|
||||
regex: Regex,
|
||||
regex: SysRegex,
|
||||
}
|
||||
|
||||
impl Clone for Replace {
|
||||
@ -65,8 +65,8 @@ impl Replace {
|
||||
pub fn new<I: Into<ReplacePattern>, C: Into<String>>(pattern: I, content: C) -> Result<Self> {
|
||||
let pattern: ReplacePattern = pattern.into();
|
||||
let regex = match &pattern {
|
||||
ReplacePattern::String(s) => Regex::new(&regex::escape(s))?,
|
||||
ReplacePattern::Regex(r) => Regex::new(r)?,
|
||||
ReplacePattern::String(s) => SysRegex::new(&regex::escape(s))?,
|
||||
ReplacePattern::Regex(r) => SysRegex::new(r)?,
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use onig::Regex;
|
||||
use crate::utils::SysRegex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::tokenizer::{
|
||||
@ -33,9 +33,10 @@ fn bytes_char() -> HashMap<u8, char> {
|
||||
}
|
||||
|
||||
lazy_static! {
|
||||
static ref RE: Regex =
|
||||
Regex::new(r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+")
|
||||
.unwrap();
|
||||
static ref RE: SysRegex = SysRegex::new(
|
||||
r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
|
||||
)
|
||||
.unwrap();
|
||||
static ref BYTES_CHAR: HashMap<u8, char> = bytes_char();
|
||||
static ref CHAR_BYTES: HashMap<char, u8> =
|
||||
bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
|
||||
@ -111,7 +112,7 @@ impl ByteLevel {
|
||||
// TODO: Give the ability to modify this regex
|
||||
impl PreTokenizer for ByteLevel {
|
||||
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
|
||||
let re_ref: &Regex = &RE;
|
||||
let re_ref: &SysRegex = &RE;
|
||||
pretokenized.split(|_, mut normalized| {
|
||||
if self.add_prefix_space && !normalized.get().starts_with(' ') {
|
||||
normalized.prepend(" ");
|
||||
|
@ -1,4 +1,4 @@
|
||||
use onig::Regex;
|
||||
use crate::utils::SysRegex;
|
||||
use serde::{Deserialize, Deserializer, Serialize};
|
||||
|
||||
use crate::tokenizer::{
|
||||
@ -29,7 +29,7 @@ impl From<&str> for SplitPattern {
|
||||
pub struct Split {
|
||||
pattern: SplitPattern,
|
||||
#[serde(skip)]
|
||||
regex: Regex,
|
||||
regex: SysRegex,
|
||||
behavior: SplitDelimiterBehavior,
|
||||
invert: bool,
|
||||
}
|
||||
@ -80,8 +80,8 @@ impl Split {
|
||||
) -> Result<Self> {
|
||||
let pattern: SplitPattern = pattern.into();
|
||||
let regex = match &pattern {
|
||||
SplitPattern::String(s) => Regex::new(&regex::escape(s))?,
|
||||
SplitPattern::Regex(r) => Regex::new(r)?,
|
||||
SplitPattern::String(s) => SysRegex::new(&regex::escape(s))?,
|
||||
SplitPattern::Regex(r) => SysRegex::new(r)?,
|
||||
};
|
||||
|
||||
Ok(Self {
|
||||
|
@ -1,3 +1,4 @@
|
||||
use crate::utils::SysRegex;
|
||||
use crate::{Offsets, Result};
|
||||
use regex::Regex;
|
||||
|
||||
@ -59,7 +60,7 @@ impl Pattern for &Regex {
|
||||
}
|
||||
}
|
||||
|
||||
impl Pattern for &onig::Regex {
|
||||
impl Pattern for &SysRegex {
|
||||
fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
|
||||
if inside.is_empty() {
|
||||
return Ok(vec![((0, 0), false)]);
|
||||
@ -205,8 +206,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn onig_regex() {
|
||||
let is_whitespace = onig::Regex::new(r"\s+").unwrap();
|
||||
fn sys_regex() {
|
||||
let is_whitespace = SysRegex::new(r"\s+").unwrap();
|
||||
do_test!("a   b", &is_whitespace => vec![((0, 1), false), ((1, 4), true), ((4, 5), false)]);
|
||||
do_test!("   a   b   ", &is_whitespace =>
|
||||
vec![((0, 3), true), ((3, 4), false), ((4, 7), true), ((7, 8), false), ((8, 11), true)]
|
||||
|
33
tokenizers/src/utils/fancy.rs
Normal file
33
tokenizers/src/utils/fancy.rs
Normal file
@ -0,0 +1,33 @@
|
||||
use fancy_regex::Regex;
|
||||
use std::error::Error;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct SysRegex {
|
||||
regex: Regex,
|
||||
}
|
||||
|
||||
impl SysRegex {
|
||||
pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> Matches<'r, 't> {
|
||||
Matches(self.regex.find_iter(inside))
|
||||
}
|
||||
|
||||
pub fn new(regex_str: &str) -> Result<Self, Box<dyn Error + Send + Sync + 'static>> {
|
||||
Ok(Self {
|
||||
regex: Regex::new(regex_str)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>);
|
||||
|
||||
impl<'r, 't> Iterator for Matches<'r, 't> {
|
||||
type Item = (usize, usize);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.0.next() {
|
||||
Some(Ok(mat)) => Some((mat.start(), mat.end())),
|
||||
// stop if an error is encountered
|
||||
None | Some(Err(_)) => None,
|
||||
}
|
||||
}
|
||||
}
|
@ -1,6 +1,16 @@
|
||||
pub(crate) mod cache;
|
||||
#[cfg(feature = "http")]
|
||||
pub(crate) mod from_pretrained;
|
||||
|
||||
#[cfg(feature = "unstable_wasm")]
|
||||
mod fancy;
|
||||
#[cfg(feature = "unstable_wasm")]
|
||||
pub use fancy::SysRegex;
|
||||
#[cfg(not(feature = "unstable_wasm"))]
|
||||
mod onig;
|
||||
#[cfg(not(feature = "unstable_wasm"))]
|
||||
pub use crate::utils::onig::SysRegex;
|
||||
|
||||
pub mod iter;
|
||||
pub mod padding;
|
||||
pub mod parallelism;
|
||||
|
45
tokenizers/src/utils/onig.rs
Normal file
45
tokenizers/src/utils/onig.rs
Normal file
@ -0,0 +1,45 @@
|
||||
use crate::tokenizer::pattern::Pattern;
|
||||
use crate::{Offsets, Result};
|
||||
use onig::Regex;
|
||||
use std::error::Error;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct SysRegex {
|
||||
regex: Regex,
|
||||
}
|
||||
|
||||
impl SysRegex {
|
||||
pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> onig::FindMatches<'r, 't> {
|
||||
self.regex.find_iter(inside)
|
||||
}
|
||||
|
||||
pub fn new(
|
||||
regex_str: &str,
|
||||
) -> std::result::Result<Self, Box<dyn Error + Send + Sync + 'static>> {
|
||||
Ok(Self {
|
||||
regex: Regex::new(regex_str)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Pattern for &Regex {
|
||||
fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
|
||||
if inside.is_empty() {
|
||||
return Ok(vec![((0, 0), false)]);
|
||||
}
|
||||
|
||||
let mut prev = 0;
|
||||
let mut splits = Vec::with_capacity(inside.len());
|
||||
for (start, end) in self.find_iter(inside) {
|
||||
if prev != start {
|
||||
splits.push(((prev, start), false));
|
||||
}
|
||||
splits.push(((start, end), true));
|
||||
prev = end;
|
||||
}
|
||||
if prev != inside.len() {
|
||||
splits.push(((prev, inside.len()), false))
|
||||
}
|
||||
Ok(splits)
|
||||
}
|
||||
}
|
@ -12,11 +12,12 @@ use tokenizers::normalizers::unicode::{NFC, NFKC};
|
||||
use tokenizers::normalizers::NormalizerWrapper;
|
||||
use tokenizers::pre_tokenizers::bert::BertPreTokenizer;
|
||||
use tokenizers::pre_tokenizers::delimiter::CharDelimiterSplit;
|
||||
use tokenizers::pre_tokenizers::split::{Split, SplitPattern};
|
||||
use tokenizers::pre_tokenizers::whitespace::Whitespace;
|
||||
use tokenizers::pre_tokenizers::PreTokenizerWrapper;
|
||||
use tokenizers::processors::bert::BertProcessing;
|
||||
use tokenizers::processors::PostProcessorWrapper;
|
||||
use tokenizers::{Tokenizer, TokenizerImpl};
|
||||
use tokenizers::{SplitDelimiterBehavior, Tokenizer, TokenizerImpl};
|
||||
|
||||
#[test]
|
||||
fn bpe_serde() {
|
||||
@ -47,6 +48,7 @@ fn normalizers() {
|
||||
// Test unit struct
|
||||
let nfc = NFC;
|
||||
let nfc_ser = serde_json::to_string(&nfc).unwrap();
|
||||
assert_eq!(nfc_ser, r#"{"type":"NFC"}"#);
|
||||
// empty struct can deserialize from self
|
||||
serde_json::from_str::<NFC>(&nfc_ser).unwrap();
|
||||
let err: Result<NFKC, _> = serde_json::from_str(&nfc_ser);
|
||||
@ -63,6 +65,10 @@ fn normalizers() {
|
||||
// Test non-empty roundtrip
|
||||
let bert = BertNormalizer::default();
|
||||
let bert_ser = serde_json::to_string(&bert).unwrap();
|
||||
assert_eq!(
|
||||
bert_ser,
|
||||
r#"{"type":"BertNormalizer","clean_text":true,"handle_chinese_chars":true,"strip_accents":null,"lowercase":true}"#
|
||||
);
|
||||
// make sure we can deserialize to self
|
||||
serde_json::from_str::<BertNormalizer>(&bert_ser).unwrap();
|
||||
// wrapper can deserialize from inner serialization
|
||||
@ -80,6 +86,10 @@ fn normalizers() {
|
||||
fn processors() {
|
||||
let bert = BertProcessing::new(("SEP".into(), 0), ("CLS".into(), 0));
|
||||
let bert_ser = serde_json::to_string(&bert).unwrap();
|
||||
assert_eq!(
|
||||
bert_ser,
|
||||
r#"{"type":"BertProcessing","sep":["SEP",0],"cls":["CLS",0]}"#
|
||||
);
|
||||
serde_json::from_str::<BertProcessing>(&bert_ser).unwrap();
|
||||
let bert_wrapped: PostProcessorWrapper = serde_json::from_str(&bert_ser).unwrap();
|
||||
match &bert_wrapped {
|
||||
@ -95,6 +105,7 @@ fn pretoks() {
|
||||
// Test unit struct
|
||||
let bert = BertPreTokenizer;
|
||||
let bert_ser = serde_json::to_string(&bert).unwrap();
|
||||
assert_eq!(bert_ser, r#"{"type":"BertPreTokenizer"}"#);
|
||||
// empty struct can deserialize from self
|
||||
serde_json::from_str::<BertPreTokenizer>(&bert_ser).unwrap();
|
||||
let err: Result<Whitespace, _> = serde_json::from_str(&bert_ser);
|
||||
@ -114,6 +125,7 @@ fn pretoks() {
|
||||
// Test non-empty roundtrip
|
||||
let ch = CharDelimiterSplit::new(' ');
|
||||
let ch_ser = serde_json::to_string(&ch).unwrap();
|
||||
assert_eq!(ch_ser, r#"{"type":"CharDelimiterSplit","delimiter":" "}"#);
|
||||
// make sure we can deserialize to self
|
||||
serde_json::from_str::<CharDelimiterSplit>(&ch_ser).unwrap();
|
||||
// wrapper can deserialize from inner serialization
|
||||
@ -128,18 +140,41 @@ fn pretoks() {
|
||||
|
||||
let wsp = Whitespace::default();
|
||||
let wsp_ser = serde_json::to_string(&wsp).unwrap();
|
||||
assert_eq!(wsp_ser, r#"{"type":"Whitespace"}"#);
|
||||
serde_json::from_str::<Whitespace>(&wsp_ser).unwrap();
|
||||
let err: Result<BertPreTokenizer, _> = serde_json::from_str(&wsp_ser);
|
||||
assert!(
|
||||
err.is_err(),
|
||||
"BertPreTokenizer shouldn't be deserializable from Whitespace"
|
||||
);
|
||||
|
||||
let pattern: SplitPattern = "[SEP]".into();
|
||||
let pretok = Split::new(pattern, SplitDelimiterBehavior::Isolated, false).unwrap();
|
||||
let pretok_str = serde_json::to_string(&pretok).unwrap();
|
||||
assert_eq!(
|
||||
pretok_str,
|
||||
r#"{"type":"Split","pattern":{"String":"[SEP]"},"behavior":"Isolated","invert":false}"#
|
||||
);
|
||||
assert_eq!(serde_json::from_str::<Split>(&pretok_str).unwrap(), pretok);
|
||||
|
||||
let pattern = SplitPattern::Regex("[SEP]".to_string());
|
||||
let pretok = Split::new(pattern, SplitDelimiterBehavior::Isolated, false).unwrap();
|
||||
let pretok_str = serde_json::to_string(&pretok).unwrap();
|
||||
assert_eq!(
|
||||
pretok_str,
|
||||
r#"{"type":"Split","pattern":{"Regex":"[SEP]"},"behavior":"Isolated","invert":false}"#
|
||||
);
|
||||
assert_eq!(serde_json::from_str::<Split>(&pretok_str).unwrap(), pretok);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn decoders() {
|
||||
let byte_level = ByteLevel::default();
|
||||
let byte_level_ser = serde_json::to_string(&byte_level).unwrap();
|
||||
assert_eq!(
|
||||
byte_level_ser,
|
||||
r#"{"type":"ByteLevel","add_prefix_space":true,"trim_offsets":true,"use_regex":true}"#
|
||||
);
|
||||
serde_json::from_str::<ByteLevel>(&byte_level_ser).unwrap();
|
||||
let byte_level_wrapper: DecoderWrapper = serde_json::from_str(&byte_level_ser).unwrap();
|
||||
match &byte_level_wrapper {
|
||||
|
Reference in New Issue
Block a user