Ensure serialization works in all expected ways.

Sebastian Puetz
2020-08-01 13:34:18 +02:00
committed by Anthony MOI
parent aaf8e932b1
commit 16f75d9efc
39 changed files with 1303 additions and 615 deletions
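
The pattern this commit applies, repeated for decoders, models, normalizers, pre-tokenizers and post-processors: each Py* binding type trades its Arc<dyn Trait> field (and the typetag-based serialization it required) for a concrete wrapper enum, so Serialize/Deserialize can simply be derived and the Python-side JSON stays byte-identical to the Rust-side JSON. A minimal, self-contained sketch of that shape — not the actual crate code; it assumes serde with the derive and rc features plus serde_json, and the variant payload is a simplified stand-in:

use serde::{Deserialize, Serialize};
use std::sync::Arc;

// Stand-in for a Rust-side wrapper such as tk::decoders::DecoderWrapper:
// internally tagged, one variant per concrete implementation.
#[derive(Clone, Serialize, Deserialize)]
#[serde(tag = "type")]
enum DecoderWrapper {
    Metaspace { replacement: char },
}

// Python-side wrapper: untagged, so it serializes exactly like whatever it
// wraps. (The real PyDecoderWrapper adds a Custom variant for Python-defined
// decoders, whose serialization always fails.)
#[derive(Clone, Serialize, Deserialize)]
#[serde(untagged)]
enum PyDecoderWrapper {
    Wrapped(Arc<DecoderWrapper>),
}

// The pyclass payload: serde(flatten) makes the struct JSON-transparent.
#[derive(Clone, Serialize, Deserialize)]
struct PyDecoder {
    #[serde(flatten)]
    decoder: PyDecoderWrapper,
}

fn main() {
    let rs = DecoderWrapper::Metaspace { replacement: '▁' };
    let py = PyDecoder {
        decoder: PyDecoderWrapper::Wrapped(Arc::new(rs.clone())),
    };
    // Both sides produce the same JSON; the serialize unit tests added in
    // this commit assert this equality for every component.
    assert_eq!(
        serde_json::to_string(&py).unwrap(),
        serde_json::to_string(&rs).unwrap()
    );
}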

View File

@@ -22,7 +22,7 @@ version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"hermit-abi 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -67,13 +67,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"atty 0.2.14 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "cloudabi"
version = "0.0.3"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -87,7 +87,7 @@ dependencies = [
"clicolors-control 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)",
"encode_unicode 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"termios 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-width 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -170,7 +170,7 @@ version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"wasi 0.9.0+wasi-snapshot-preview1 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -189,7 +189,7 @@ name = "hermit-abi"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -224,6 +224,11 @@ dependencies = [
"unindent 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "instant"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "inventory"
version = "0.1.6"
@@ -272,12 +277,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "libc"
version = "0.2.68"
version = "0.2.74"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "lock_api"
version = "0.3.3"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -301,21 +306,13 @@ dependencies = [
"autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num-traits"
version = "0.2.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"autocfg 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "num_cpus"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"hermit-abi 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -330,7 +327,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"onig_sys 69.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -345,23 +342,25 @@ dependencies = [
[[package]]
name = "parking_lot"
version = "0.10.0"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"lock_api 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
"parking_lot_core 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
"instant 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"lock_api 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"parking_lot_core 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "parking_lot_core"
version = "0.7.0"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"cloudabi 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"instant 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)",
"smallvec 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"smallvec 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -410,26 +409,22 @@ dependencies = [
[[package]]
name = "pyo3"
version = "0.9.2"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"ctor 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
"indoc 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"inventory 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"num-traits 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)",
"parking_lot 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"parking_lot 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)",
"paste 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"pyo3cls 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)",
"regex 1.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.106 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.51 (registry+https://github.com/rust-lang/crates.io-index)",
"pyo3cls 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)",
"unindent 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)",
"version_check 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "pyo3-derive-backend"
version = "0.9.2"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"proc-macro2 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -439,10 +434,10 @@ dependencies = [
[[package]]
name = "pyo3cls"
version = "0.9.2"
version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"pyo3-derive-backend 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)",
"pyo3-derive-backend 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -461,7 +456,7 @@ version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"getrandom 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_chacha 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_hc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -585,7 +580,7 @@ dependencies = [
[[package]]
name = "smallvec"
version = "1.3.0"
version = "1.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@@ -608,7 +603,7 @@ name = "termios"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -643,7 +638,6 @@ dependencies = [
"regex-syntax 0.6.17 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.106 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.51 (registry+https://github.com/rust-lang/crates.io-index)",
"typetag 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode-normalization-alignments 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
"unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
@@ -652,8 +646,8 @@ dependencies = [
name = "tokenizers-python"
version = "0.8.1"
dependencies = [
"libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)",
"pyo3 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"pyo3 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)",
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.106 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.51 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -688,7 +682,7 @@ name = "unicode-normalization-alignments"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"smallvec 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"smallvec 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -716,11 +710,6 @@ name = "vec_map"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "version_check"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
@@ -755,7 +744,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum clicolors-control 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "90082ee5dcdd64dc4e9e0d37fbf3ee325419e39c0092191e0393df65518f741e"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum cloudabi 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4344512281c643ae7638bbabc3af17a11307803ec8f0fcad9fae512a8bf36467"
"checksum console 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6728a28023f207181b193262711102bfbaf47cc9d13bc71d0736607ef8efe88c"
"checksum crossbeam-deque 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "9f02af974daeee82218205558e51ec8768b48cf524bd01d550abe5573a608285"
"checksum crossbeam-epoch 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "058ed274caafc1f60c4997b5fc07bf7dc7cca454af7c6e81edffe5f33f70dace"
@@ -771,33 +760,33 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum indicatif 0.14.0 (registry+https://github.com/rust-lang/crates.io-index)" = "49a68371cf417889c9d7f98235b7102ea7c54fc59bcbd22f3dea785be9d27e40"
"checksum indoc 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "79255cf29f5711995ddf9ec261b4057b1deb34e66c90656c201e41376872c544"
"checksum indoc-impl 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "54554010aa3d17754e484005ea0022f1c93839aabc627c2c55f3d7b47206134c"
"checksum instant 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5b141fdc7836c525d4d594027d318c84161ca17aaf8113ab1f81ab93ae897485"
"checksum inventory 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "82d3f4b90287725c97b17478c60dda0c6324e7c84ee1ed72fb9179d0fdf13956"
"checksum inventory-impl 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9092a4fefc9d503e9287ef137f03180a6e7d1b04c419563171ee14947c5e80ec"
"checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
"checksum itertools 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "284f18f85651fe11e8a991b2adb42cb078325c996ed026d994719efcfca1d54b"
"checksum itoa 0.4.5 (registry+https://github.com/rust-lang/crates.io-index)" = "b8b7a7c0c47db5545ed3fef7468ee7bb5b74691498139e4b3f6a20685dc6dd8e"
"checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
"checksum libc 0.2.68 (registry+https://github.com/rust-lang/crates.io-index)" = "dea0c0405123bba743ee3f91f49b1c7cfb684eef0da0a50110f758ccf24cdff0"
"checksum lock_api 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "79b2de95ecb4691949fea4716ca53cdbcfccb2c612e19644a8bad05edcf9f47b"
"checksum libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)" = "a2f02823cf78b754822df5f7f268fb59822e7296276d3e069d8e8cb26a14bd10"
"checksum lock_api 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "28247cc5a5be2f05fbcd76dd0cf2c7d3b5400cb978a28042abcd4fa0b3f8261c"
"checksum maybe-uninit 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
"checksum memchr 2.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3728d817d99e5ac407411fa471ff9800a778d88a24685968b36824eaf4bee400"
"checksum memoffset 0.5.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b4fc2c02a7e374099d4ee95a193111f72d2110197fe200272371758f6c3643d8"
"checksum num-traits 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "c62be47e61d1842b9170f0fdeec8eba98e60e90e5446449a0545e5152acd7096"
"checksum num_cpus 1.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "46203554f085ff89c235cd12f7075f3233af9b11ed7c9e16dfe2560d03313ce6"
"checksum number_prefix 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
"checksum onig 6.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bd91ccd8a02fce2f7e8a86655aec67bc6c171e6f8e704118a0e8c4b866a05a8a"
"checksum onig_sys 69.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3814583fad89f3c60ae0701d80e87e1fd3028741723deda72d0d4a0ecf0cb0db"
"checksum parking_lot 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "92e98c49ab0b7ce5b222f2cc9193fc4efe11c6d0bd4f648e374684a6857b1cfc"
"checksum parking_lot_core 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7582838484df45743c8434fbff785e8edf260c28748353d44bc0da32e0ceabf1"
"checksum parking_lot 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a4893845fa2ca272e647da5d0e46660a314ead9c2fdd9a883aabc32e481a8733"
"checksum parking_lot_core 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c361aa727dd08437f2f1447be8b59a33b0edd15e0fcee698f935613d9efbca9b"
"checksum paste 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "ab4fb1930692d1b6a9cfabdde3d06ea0a7d186518e2f4d67660d8970e2fa647a"
"checksum paste-impl 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "a62486e111e571b1e93b710b61e8f493c0013be39629b714cb166bdb06aa5a8a"
"checksum pkg-config 0.3.17 (registry+https://github.com/rust-lang/crates.io-index)" = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677"
"checksum ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "74490b50b9fbe561ac330df47c08f3f33073d2d00c150f719147d7c54522fa1b"
"checksum proc-macro-hack 0.5.15 (registry+https://github.com/rust-lang/crates.io-index)" = "0d659fe7c6d27f25e9d80a1a094c223f5246f6a6596453e09d7229bf42750b63"
"checksum proc-macro2 1.0.10 (registry+https://github.com/rust-lang/crates.io-index)" = "df246d292ff63439fea9bc8c0a270bed0e390d5ebd4db4ba15aba81111b5abe3"
"checksum pyo3 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7028df4086f1e488a6192932e86de604077ef6b06eac2b0f159a3082c7450c58"
"checksum pyo3-derive-backend 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4e3c7aaceb685d2560b7c3fc46c152464c181de2baf44e57119ce43d712d1b64"
"checksum pyo3cls 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "053d66146897d823e8d228758fb0aefac18e8a3024585a1640dbbe885c1b07a1"
"checksum pyo3 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)" = "9ca8710ffa8211c9a62a8a3863c4267c710dc42a82a7fd29c97de465d7ea6b7d"
"checksum pyo3-derive-backend 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)" = "58ad070bf6967b0d29ea74931ffcf9c6bbe8402a726e9afbeafadc0a287cc2b3"
"checksum pyo3cls 0.11.1 (registry+https://github.com/rust-lang/crates.io-index)" = "c3fa17e1ea569d0bf3b7c00f2a9eea831ca05e55dd76f1794c541abba1c64baa"
"checksum quote 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2bdc6c187c65bca4260c9011c9e3132efe4909da44726bad24cf7572ae338d7f"
"checksum rand 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
"checksum rand_chacha 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
@@ -814,7 +803,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum serde 1.0.106 (registry+https://github.com/rust-lang/crates.io-index)" = "36df6ac6412072f67cf767ebbde4133a5b2e88e76dc6187fa7104cd16f783399"
"checksum serde_derive 1.0.106 (registry+https://github.com/rust-lang/crates.io-index)" = "9e549e3abf4fb8621bd1609f11dfc9f5e50320802273b12f3811a67e6716ea6c"
"checksum serde_json 1.0.51 (registry+https://github.com/rust-lang/crates.io-index)" = "da07b57ee2623368351e9a0488bb0b261322a15a6e0ae53e243cbdc0f4208da9"
"checksum smallvec 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "05720e22615919e4734f6a99ceae50d00226c3c5aca406e102ebc33298214e0a"
"checksum smallvec 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3757cb9d89161a2f24e1cf78efa0c1fcff485d18e3f55e0aa3480824ddaa0f3f"
"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
"checksum syn 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)" = "0df0eb663f387145cab623dea85b09c2c5b4b0aef44e945d928e682fce71bb03"
"checksum termios 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f0fcee7b24a25675de40d5bb4de6e41b0df07bc9856295e7e2b3a3600c400c2"
@@ -828,7 +817,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
"checksum unindent 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "63f18aa3b0e35fed5a0048f029558b1518095ffe2a0a31fb87c93dece93a4993"
"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a"
"checksum version_check 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)" = "078775d0255232fb988e6fccf26ddc9d1ac274299aaedcedce21c6f72cc533ce"
"checksum wasi 0.9.0+wasi-snapshot-preview1 (registry+https://github.com/rust-lang/crates.io-index)" = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
"checksum winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6"
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"

View File

@@ -16,13 +16,15 @@ serde_json = "1.0"
libc = "0.2"
[dependencies.pyo3]
version = "0.9.2"
features = ["extension-module"]
version = "0.11"
[dependencies.tokenizers]
version = "*"
path = "../../tokenizers"
[features]
default = ["pyo3/extension-module"]
[target.x86_64-apple-darwin]
rustflags = [
"-C", "link-arg=-undefined",

View File

@@ -11,3 +11,4 @@ check-style:
# Launch the test suite
test:
python -m pytest -s -v tests
cargo test --no-default-features

View File

@@ -1 +1 @@
nightly-2020-05-14
stable

View File

@@ -9,55 +9,57 @@ use tk::decoders::bpe::BPEDecoder;
use tk::decoders::byte_level::ByteLevel;
use tk::decoders::metaspace::Metaspace;
use tk::decoders::wordpiece::WordPiece;
use tk::decoders::DecoderWrapper;
use tk::Decoder;
use tokenizers as tk;
use super::error::{PyError, ToPyResult};
#[pyclass(dict, module = "tokenizers.decoders", name=Decoder)]
#[derive(Clone)]
#[derive(Clone, Deserialize, Serialize)]
pub struct PyDecoder {
pub decoder: Arc<dyn Decoder>,
#[serde(flatten)]
pub(crate) decoder: PyDecoderWrapper,
}
impl PyDecoder {
pub fn new(decoder: Arc<dyn Decoder>) -> Self {
pub(crate) fn new(decoder: PyDecoderWrapper) -> Self {
PyDecoder { decoder }
}
pub(crate) fn get_as_subtype(&self) -> PyResult<PyObject> {
let base = self.clone();
let gil = Python::acquire_gil();
let py = gil.python();
match &self.decoder {
PyDecoderWrapper::Custom(_) => Py::new(py, base).map(Into::into),
PyDecoderWrapper::Wrapped(inner) => match inner.as_ref() {
DecoderWrapper::Metaspace(_) => {
Py::new(py, (PyMetaspaceDec {}, base)).map(Into::into)
}
DecoderWrapper::WordPiece(_) => {
Py::new(py, (PyWordPieceDec {}, base)).map(Into::into)
}
DecoderWrapper::ByteLevel(_) => {
Py::new(py, (PyByteLevelDec {}, base)).map(Into::into)
}
DecoderWrapper::BPE(_) => Py::new(py, (PyBPEDecoder {}, base)).map(Into::into),
},
}
}
}
#[typetag::serde]
impl Decoder for PyDecoder {
fn decode(&self, tokens: Vec<String>) -> tk::Result<String> {
self.decoder.decode(tokens)
}
}
impl Serialize for PyDecoder {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.decoder.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for PyDecoder {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Ok(PyDecoder {
decoder: Arc::deserialize(deserializer)?,
})
}
}
#[pymethods]
impl PyDecoder {
#[staticmethod]
fn custom(decoder: PyObject) -> PyResult<Self> {
let decoder = CustomDecoder::new(decoder).map(Arc::new)?;
let decoder = PyDecoderWrapper::Custom(CustomDecoder::new(decoder).map(Arc::new)?);
Ok(PyDecoder::new(decoder))
}
@@ -97,10 +99,7 @@ pub struct PyByteLevelDec {}
impl PyByteLevelDec {
#[new]
fn new() -> PyResult<(Self, PyDecoder)> {
Ok((
PyByteLevelDec {},
PyDecoder::new(Arc::new(ByteLevel::default())),
))
Ok((PyByteLevelDec {}, ByteLevel::default().into()))
}
}
@@ -123,10 +122,7 @@ impl PyWordPieceDec {
}
}
Ok((
PyWordPieceDec {},
PyDecoder::new(Arc::new(WordPiece::new(prefix, cleanup))),
))
Ok((PyWordPieceDec {}, WordPiece::new(prefix, cleanup).into()))
}
}
@@ -158,7 +154,7 @@ impl PyMetaspaceDec {
Ok((
PyMetaspaceDec {},
PyDecoder::new(Arc::new(Metaspace::new(replacement, add_prefix_space))),
Metaspace::new(replacement, add_prefix_space).into(),
))
}
}
@@ -182,24 +178,20 @@ impl PyBPEDecoder {
}
}
Ok((
PyBPEDecoder {},
PyDecoder::new(Arc::new(BPEDecoder::new(suffix))),
))
Ok((PyBPEDecoder {}, BPEDecoder::new(suffix).into()))
}
}
struct CustomDecoder {
pub(crate) struct CustomDecoder {
class: PyObject,
}
impl CustomDecoder {
pub fn new(class: PyObject) -> PyResult<Self> {
pub(crate) fn new(class: PyObject) -> PyResult<Self> {
Ok(CustomDecoder { class })
}
}
#[typetag::serde]
impl Decoder for CustomDecoder {
fn decode(&self, tokens: Vec<String>) -> tk::Result<String> {
let gil = Python::acquire_gil();
@@ -240,3 +232,83 @@ impl<'de> Deserialize<'de> for CustomDecoder {
Err(D::Error::custom("PyDecoder cannot be deserialized"))
}
}
#[derive(Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub(crate) enum PyDecoderWrapper {
Custom(Arc<CustomDecoder>),
Wrapped(Arc<DecoderWrapper>),
}
impl<I> From<I> for PyDecoderWrapper
where
I: Into<DecoderWrapper>,
{
fn from(norm: I) -> Self {
PyDecoderWrapper::Wrapped(Arc::new(norm.into()))
}
}
impl<I> From<I> for PyDecoder
where
I: Into<DecoderWrapper>,
{
fn from(dec: I) -> Self {
PyDecoder {
decoder: dec.into().into(),
}
}
}
impl Decoder for PyDecoderWrapper {
fn decode(&self, tokens: Vec<String>) -> tk::Result<String> {
match self {
PyDecoderWrapper::Wrapped(inner) => inner.decode(tokens),
PyDecoderWrapper::Custom(inner) => inner.decode(tokens),
}
}
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use pyo3::{AsPyRef, Py, PyObject, Python};
use tk::decoders::metaspace::Metaspace;
use tk::decoders::DecoderWrapper;
use crate::decoders::{CustomDecoder, PyDecoder, PyDecoderWrapper};
#[test]
fn get_subtype() {
let py_dec = PyDecoder::new(Metaspace::default().into());
let py_meta = py_dec.get_as_subtype().unwrap();
let gil = Python::acquire_gil();
assert_eq!(
"tokenizers.decoders.Metaspace",
py_meta.as_ref(gil.python()).get_type().name()
);
}
#[test]
fn serialize() {
let py_wrapped: PyDecoderWrapper = Metaspace::default().into();
let py_ser = serde_json::to_string(&py_wrapped).unwrap();
let rs_wrapped = DecoderWrapper::Metaspace(Metaspace::default());
let rs_ser = serde_json::to_string(&rs_wrapped).unwrap();
assert_eq!(py_ser, rs_ser);
let py_dec: PyDecoder = serde_json::from_str(&rs_ser).unwrap();
match py_dec.decoder {
PyDecoderWrapper::Wrapped(msp) => match msp.as_ref() {
DecoderWrapper::Metaspace(_) => {}
_ => panic!("Expected Whitespace"),
},
_ => panic!("Expected wrapped, not custom."),
}
let gil = Python::acquire_gil();
let py_msp = PyDecoder::new(Metaspace::default().into());
let obj: PyObject = Py::new(gil.python(), py_msp).unwrap().into();
let py_seq = PyDecoderWrapper::Custom(Arc::new(CustomDecoder::new(obj).unwrap()));
assert!(serde_json::to_string(&py_seq).is_err());
}
}
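
The bare .into() calls in the constructors above come from the two blanket conversions added at the bottom of this file. A reduced sketch of that layering, with stand-in types in place of the real tokenizers ones:

use std::sync::Arc;

struct Metaspace; // stand-in for tk::decoders::metaspace::Metaspace

enum DecoderWrapper {
    Metaspace(Metaspace),
}

impl From<Metaspace> for DecoderWrapper {
    fn from(d: Metaspace) -> Self {
        DecoderWrapper::Metaspace(d)
    }
}

enum PyDecoderWrapper {
    Wrapped(Arc<DecoderWrapper>),
}

struct PyDecoder {
    decoder: PyDecoderWrapper,
}

// Anything convertible into the Rust-side wrapper becomes a Python-side
// wrapper (one Arc allocation later)...
impl<I: Into<DecoderWrapper>> From<I> for PyDecoderWrapper {
    fn from(d: I) -> Self {
        PyDecoderWrapper::Wrapped(Arc::new(d.into()))
    }
}

// ...and likewise a full PyDecoder, which is why the constructors can write
// `Metaspace::new(..).into()` instead of `PyDecoder::new(Arc::new(..))`.
impl<I: Into<DecoderWrapper>> From<I> for PyDecoder {
    fn from(d: I) -> Self {
        PyDecoder {
            decoder: PyDecoderWrapper::Wrapped(Arc::new(d.into())),
        }
    }
}

fn main() {
    let _dec: PyDecoder = Metaspace.into();
}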

View File

@@ -6,7 +6,7 @@ use std::sync::Arc;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde::{Deserialize, Serialize};
use tk::models::bpe::BPE;
use tk::models::wordlevel::WordLevel;
use tk::models::wordpiece::WordPiece;
@@ -19,8 +19,9 @@ use tk::models::ModelWrapper;
/// A Model represents some tokenization algorithm like BPE or Word
/// This class cannot be constructed directly. Please use one of the concrete models.
#[pyclass(module = "tokenizers.models", name=Model)]
#[derive(Clone)]
#[derive(Clone, Serialize, Deserialize)]
pub struct PyModel {
#[serde(flatten)]
pub model: Arc<ModelWrapper>,
}
@@ -28,29 +29,19 @@ impl PyModel {
pub(crate) fn new(model: Arc<ModelWrapper>) -> Self {
PyModel { model }
}
}
impl Serialize for PyModel {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.model.serialize(serializer)
pub(crate) fn get_as_subtype(&self) -> PyResult<PyObject> {
let base = self.clone();
let gil = Python::acquire_gil();
let py = gil.python();
match self.model.as_ref() {
ModelWrapper::BPE(_) => Py::new(py, (PyBPE {}, base)).map(Into::into),
ModelWrapper::WordPiece(_) => Py::new(py, (PyWordPiece {}, base)).map(Into::into),
ModelWrapper::WordLevel(_) => Py::new(py, (PyWordLevel {}, base)).map(Into::into),
}
}
}
impl<'de> Deserialize<'de> for PyModel {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Ok(PyModel {
model: Arc::deserialize(deserializer)?,
})
}
}
#[typetag::serde]
impl Model for PyModel {
fn tokenize(&self, tokens: &str) -> tk::Result<Vec<Token>> {
self.model.tokenize(tokens)
@@ -257,7 +248,55 @@ impl PyWordLevel {
Ok(model) => Ok((PyWordLevel {}, PyModel::new(Arc::new(model.into())))),
}
} else {
Ok((PyWordLevel {}, PyModel::new(Arc::new(WordLevel::default().into()))))
Ok((
PyWordLevel {},
PyModel::new(Arc::new(WordLevel::default().into())),
))
}
}
}
#[cfg(test)]
mod test {
use crate::models::PyModel;
use pyo3::{AsPyRef, Python};
use std::sync::Arc;
use tk::models::bpe::BPE;
use tk::models::ModelWrapper;
#[test]
fn get_subtype() {
let py_model = PyModel::new(Arc::new(BPE::default().into()));
let py_bpe = py_model.get_as_subtype().unwrap();
let gil = Python::acquire_gil();
assert_eq!(
"tokenizers.models.BPE",
py_bpe.as_ref(gil.python()).get_type().name()
);
}
#[test]
fn serialize() {
let rs_bpe = BPE::default();
let rs_bpe_ser = serde_json::to_string(&rs_bpe).unwrap();
let rs_wrapper: ModelWrapper = rs_bpe.clone().into();
let rs_wrapper_ser = serde_json::to_string(&rs_wrapper).unwrap();
let py_model = PyModel::new(Arc::new(rs_wrapper.clone()));
let py_ser = serde_json::to_string(&py_model).unwrap();
assert_eq!(py_ser, rs_bpe_ser);
assert_eq!(py_ser, rs_wrapper_ser);
let py_model: PyModel = serde_json::from_str(&rs_bpe_ser).unwrap();
match py_model.model.as_ref() {
ModelWrapper::BPE(_) => (),
_ => panic!("Expected Bert postprocessor."),
}
let py_model: PyModel = serde_json::from_str(&rs_wrapper_ser).unwrap();
match py_model.model.as_ref() {
ModelWrapper::BPE(_) => (),
_ => panic!("Expected Bert postprocessor."),
}
}
}

View File

@@ -4,51 +4,57 @@ use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::normalizers::bert::BertNormalizer;
use tk::normalizers::strip::Strip;
use tk::normalizers::unicode::{NFC, NFD, NFKC, NFKD};
use tk::normalizers::utils::{Lowercase, Sequence};
use serde::ser::SerializeStruct;
use serde::{Deserialize, Serialize, Serializer};
use tk::normalizers::{BertNormalizer, Lowercase, NormalizerWrapper, Strip, NFC, NFD, NFKC, NFKD};
use tk::{NormalizedString, Normalizer};
use tokenizers as tk;
#[pyclass(dict, module = "tokenizers.normalizers", name=Normalizer)]
#[derive(Clone)]
#[derive(Clone, Serialize, Deserialize)]
pub struct PyNormalizer {
pub normalizer: Arc<dyn Normalizer>,
#[serde(flatten)]
pub(crate) normalizer: PyNormalizerWrapper,
}
impl PyNormalizer {
pub fn new(normalizer: Arc<dyn Normalizer>) -> Self {
pub(crate) fn new(normalizer: PyNormalizerWrapper) -> Self {
PyNormalizer { normalizer }
}
pub(crate) fn get_as_subtype(&self) -> PyResult<PyObject> {
let base = self.clone();
let gil = Python::acquire_gil();
let py = gil.python();
match self.normalizer {
PyNormalizerWrapper::Sequence(_) => Py::new(py, (PySequence {}, base)).map(Into::into),
PyNormalizerWrapper::Wrapped(ref inner) => match inner.as_ref() {
NormalizerWrapper::Sequence(_) => {
Py::new(py, (PySequence {}, base)).map(Into::into)
}
NormalizerWrapper::BertNormalizer(_) => {
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
}
NormalizerWrapper::StripNormalizer(_) => {
Py::new(py, (PyBertNormalizer {}, base)).map(Into::into)
}
NormalizerWrapper::NFC(_) => Py::new(py, (PyNFC {}, base)).map(Into::into),
NormalizerWrapper::NFD(_) => Py::new(py, (PyNFD {}, base)).map(Into::into),
NormalizerWrapper::NFKC(_) => Py::new(py, (PyNFKC {}, base)).map(Into::into),
NormalizerWrapper::NFKD(_) => Py::new(py, (PyNFKD {}, base)).map(Into::into),
NormalizerWrapper::Lowercase(_) => {
Py::new(py, (PyLowercase {}, base)).map(Into::into)
}
},
}
}
}
#[typetag::serde]
impl Normalizer for PyNormalizer {
fn normalize(&self, normalized: &mut NormalizedString) -> tk::Result<()> {
self.normalizer.normalize(normalized)
}
}
impl Serialize for PyNormalizer {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.normalizer.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for PyNormalizer {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Ok(PyNormalizer::new(Arc::deserialize(deserializer)?))
}
}
#[pymethods]
impl PyNormalizer {
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
@@ -103,7 +109,7 @@ impl PyBertNormalizer {
}
let normalizer =
BertNormalizer::new(clean_text, handle_chinese_chars, strip_accents, lowercase);
Ok((PyBertNormalizer {}, PyNormalizer::new(Arc::new(normalizer))))
Ok((PyBertNormalizer {}, normalizer.into()))
}
}
@@ -113,7 +119,7 @@ pub struct PyNFD {}
impl PyNFD {
#[new]
fn new() -> PyResult<(Self, PyNormalizer)> {
Ok((PyNFD {}, PyNormalizer::new(Arc::new(NFD))))
Ok((PyNFD {}, PyNormalizer::new(NFD.into())))
}
}
@@ -123,7 +129,7 @@ pub struct PyNFKD {}
impl PyNFKD {
#[new]
fn new() -> PyResult<(Self, PyNormalizer)> {
Ok((PyNFKD {}, PyNormalizer::new(Arc::new(NFKD))))
Ok((PyNFKD {}, NFKD.into()))
}
}
@@ -133,7 +139,7 @@ pub struct PyNFC {}
impl PyNFC {
#[new]
fn new() -> PyResult<(Self, PyNormalizer)> {
Ok((PyNFC {}, PyNormalizer::new(Arc::new(NFC))))
Ok((PyNFC {}, NFC.into()))
}
}
@@ -143,7 +149,7 @@ pub struct PyNFKC {}
impl PyNFKC {
#[new]
fn new() -> PyResult<(Self, PyNormalizer)> {
Ok((PyNFKC {}, PyNormalizer::new(Arc::new(NFKC))))
Ok((PyNFKC {}, NFKC.into()))
}
}
@@ -153,19 +159,19 @@ pub struct PySequence {}
impl PySequence {
#[new]
fn new(normalizers: &PyList) -> PyResult<(Self, PyNormalizer)> {
let normalizers = normalizers
.iter()
.map(|n| {
let normalizer: PyRef<PyNormalizer> = n.extract()?;
let normalizer = PyNormalizer::new(normalizer.normalizer.clone());
let boxed = Box::new(normalizer);
Ok(boxed as Box<dyn Normalizer>)
})
.collect::<PyResult<_>>()?;
let mut sequence = Vec::with_capacity(normalizers.len());
for n in normalizers.iter() {
let normalizer: PyRef<PyNormalizer> = n.extract()?;
match &normalizer.normalizer {
PyNormalizerWrapper::Sequence(inner) => {
sequence.extend(inner.iter().map(|i| i.clone()))
}
PyNormalizerWrapper::Wrapped(inner) => sequence.push(inner.clone()),
}
}
Ok((
PySequence {},
PyNormalizer::new(Arc::new(Sequence::new(normalizers))),
PyNormalizer::new(PyNormalizerWrapper::Sequence(sequence)),
))
}
@@ -180,7 +186,7 @@ pub struct PyLowercase {}
impl PyLowercase {
#[new]
fn new() -> PyResult<(Self, PyNormalizer)> {
Ok((PyLowercase {}, PyNormalizer::new(Arc::new(Lowercase))))
Ok((PyLowercase {}, Lowercase.into()))
}
}
@@ -203,9 +209,114 @@ impl PyStrip {
}
}
Ok((
PyStrip {},
PyNormalizer::new(Arc::new(Strip::new(left, right))),
))
Ok((PyStrip {}, Strip::new(left, right).into()))
}
}
#[derive(Clone, Deserialize)]
#[serde(untagged)]
pub(crate) enum PyNormalizerWrapper {
Sequence(Vec<Arc<NormalizerWrapper>>),
Wrapped(Arc<NormalizerWrapper>),
}
impl Serialize for PyNormalizerWrapper {
fn serialize<S>(&self, serializer: S) -> Result<<S as Serializer>::Ok, <S as Serializer>::Error>
where
S: Serializer,
{
match self {
PyNormalizerWrapper::Sequence(seq) => {
let mut ser = serializer.serialize_struct("Sequence", 2)?;
ser.serialize_field("type", "Sequence")?;
ser.serialize_field("normalizers", seq)?;
ser.end()
}
PyNormalizerWrapper::Wrapped(inner) => inner.serialize(serializer),
}
}
}
impl<I> From<I> for PyNormalizerWrapper
where
I: Into<NormalizerWrapper>,
{
fn from(norm: I) -> Self {
PyNormalizerWrapper::Wrapped(Arc::new(norm.into()))
}
}
impl<I> From<I> for PyNormalizer
where
I: Into<NormalizerWrapper>,
{
fn from(norm: I) -> Self {
PyNormalizer {
normalizer: norm.into().into(),
}
}
}
impl Normalizer for PyNormalizerWrapper {
fn normalize(&self, normalized: &mut NormalizedString) -> tk::Result<()> {
match self {
PyNormalizerWrapper::Wrapped(inner) => inner.normalize(normalized),
PyNormalizerWrapper::Sequence(inner) => {
inner.iter().map(|n| n.normalize(normalized)).collect()
}
}
}
}
#[cfg(test)]
mod test {
use pyo3::{AsPyRef, Python};
use tk::normalizers::unicode::{NFC, NFKC};
use tk::normalizers::utils::Sequence;
use tk::normalizers::NormalizerWrapper;
use crate::normalizers::{PyNormalizer, PyNormalizerWrapper};
#[test]
fn get_subtype() {
let py_norm = PyNormalizer::new(NFC.into());
let py_nfc = py_norm.get_as_subtype().unwrap();
let gil = Python::acquire_gil();
assert_eq!(
"tokenizers.normalizers.NFC",
py_nfc.as_ref(gil.python()).get_type().name()
);
}
#[test]
fn serialize() {
let py_wrapped: PyNormalizerWrapper = NFKC.into();
let py_ser = serde_json::to_string(&py_wrapped).unwrap();
let rs_wrapped = NormalizerWrapper::NFKC(NFKC);
let rs_ser = serde_json::to_string(&rs_wrapped).unwrap();
assert_eq!(py_ser, rs_ser);
let py_norm: PyNormalizer = serde_json::from_str(&rs_ser).unwrap();
match py_norm.normalizer {
PyNormalizerWrapper::Wrapped(nfc) => match nfc.as_ref() {
NormalizerWrapper::NFKC(_) => {}
_ => panic!("Expected NFKC"),
},
_ => panic!("Expected wrapped, not sequence."),
}
let py_seq: PyNormalizerWrapper = Sequence::new(vec![NFC.into(), NFKC.into()]).into();
let py_wrapper_ser = serde_json::to_string(&py_seq).unwrap();
let rs_wrapped =
NormalizerWrapper::Sequence(Sequence::new(vec![NFC.into(), NFKC.into()]).into());
let rs_ser = serde_json::to_string(&rs_wrapped).unwrap();
assert_eq!(py_wrapper_ser, rs_ser);
let py_seq = PyNormalizer::new(py_seq);
let py_ser = serde_json::to_string(&py_seq).unwrap();
assert_eq!(py_wrapper_ser, py_ser);
let rs_seq = Sequence::new(vec![NFC.into(), NFKC.into()]);
let rs_ser = serde_json::to_string(&rs_seq).unwrap();
assert_eq!(py_wrapper_ser, rs_ser);
}
}
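
PyNormalizerWrapper is the one wrapper with an extra variant, Sequence(Vec<Arc<NormalizerWrapper>>), so PySequence can share its members; the hand-written Serialize above exists to emit the same {"type": "Sequence", "normalizers": [...]} shape the Rust-side Sequence produces. A stripped-down sketch showing the two shapes coincide — assumes serde's derive and rc features, with simplified payloads:

use serde::ser::SerializeStruct;
use serde::{Serialize, Serializer};
use std::sync::Arc;

// Stand-in for tk::normalizers::NormalizerWrapper.
#[derive(Serialize)]
#[serde(tag = "type")]
enum NormalizerWrapper {
    NFC,
    Sequence { normalizers: Vec<NormalizerWrapper> },
}

enum PyNormalizerWrapper {
    Sequence(Vec<Arc<NormalizerWrapper>>),
    Wrapped(Arc<NormalizerWrapper>),
}

impl Serialize for PyNormalizerWrapper {
    fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
        match self {
            // Hand-rolled struct serialization reproduces the tagged shape.
            PyNormalizerWrapper::Sequence(seq) => {
                let mut st = serializer.serialize_struct("Sequence", 2)?;
                st.serialize_field("type", "Sequence")?;
                st.serialize_field("normalizers", seq)?;
                st.end()
            }
            PyNormalizerWrapper::Wrapped(inner) => inner.serialize(serializer),
        }
    }
}

fn main() {
    let py = PyNormalizerWrapper::Sequence(vec![Arc::new(NormalizerWrapper::NFC)]);
    let rs = NormalizerWrapper::Sequence {
        normalizers: vec![NormalizerWrapper::NFC],
    };
    assert_eq!(
        serde_json::to_string(&py).unwrap(),
        serde_json::to_string(&rs).unwrap()
    );
}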

View File

@@ -10,6 +10,7 @@ use tk::pre_tokenizers::byte_level::ByteLevel;
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
use tk::pre_tokenizers::metaspace::Metaspace;
use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
use tk::pre_tokenizers::PreTokenizerWrapper;
use tk::tokenizer::Offsets;
use tk::{PreTokenizedString, PreTokenizer};
use tokenizers as tk;
@@ -17,36 +18,48 @@ use tokenizers as tk;
use super::error::ToPyResult;
#[pyclass(dict, module = "tokenizers.pre_tokenizers", name=PreTokenizer)]
#[derive(Clone)]
#[derive(Clone, Serialize, Deserialize)]
pub struct PyPreTokenizer {
pub pretok: Arc<dyn PreTokenizer>,
#[serde(flatten)]
pub(crate) pretok: PyPreTokenizerWrapper,
}
impl PyPreTokenizer {
pub fn new(pretok: Arc<dyn PreTokenizer>) -> Self {
#[allow(dead_code)]
pub(crate) fn new(pretok: PyPreTokenizerWrapper) -> Self {
PyPreTokenizer { pretok }
}
}
impl Serialize for PyPreTokenizer {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.pretok.serialize(serializer)
pub(crate) fn get_as_subtype(&self) -> PyResult<PyObject> {
let base = self.clone();
let gil = Python::acquire_gil();
let py = gil.python();
match &self.pretok {
PyPreTokenizerWrapper::Custom(_) => Py::new(py, base).map(Into::into),
PyPreTokenizerWrapper::Wrapped(inner) => match inner.as_ref() {
PreTokenizerWrapper::Whitespace(_) => {
Py::new(py, (PyWhitespace {}, base)).map(Into::into)
}
PreTokenizerWrapper::Metaspace(_) => {
Py::new(py, (PyMetaspace {}, base)).map(Into::into)
}
PreTokenizerWrapper::Delimiter(_) => {
Py::new(py, (PyCharDelimiterSplit {}, base)).map(Into::into)
}
PreTokenizerWrapper::WhitespaceSplit(_) => {
Py::new(py, (PyWhitespaceSplit {}, base)).map(Into::into)
}
PreTokenizerWrapper::ByteLevel(_) => {
Py::new(py, (PyByteLevel {}, base)).map(Into::into)
}
PreTokenizerWrapper::BertPreTokenizer(_) => {
Py::new(py, (PyBertPreTokenizer {}, base)).map(Into::into)
}
},
}
}
}
impl<'de> Deserialize<'de> for PyPreTokenizer {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Arc::deserialize(deserializer).map(PyPreTokenizer::new)
}
}
#[typetag::serde]
impl PreTokenizer for PyPreTokenizer {
fn pre_tokenize(&self, normalized: &mut PreTokenizedString) -> tk::Result<()> {
self.pretok.pre_tokenize(normalized)
@@ -64,7 +77,7 @@ impl PyPreTokenizer {
// }
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
let data = serde_json::to_string(&self.pretok.as_ref()).map_err(|e| {
let data = serde_json::to_string(&self.pretok).map_err(|e| {
exceptions::Exception::py_err(format!(
"Error while attempting to pickle PreTokenizer: {}",
e.to_string()
@@ -123,7 +136,7 @@ impl PyByteLevel {
}
}
Ok((PyByteLevel {}, PyPreTokenizer::new(Arc::new(byte_level))))
Ok((PyByteLevel {}, byte_level.into()))
}
#[staticmethod]
@@ -141,10 +154,7 @@ pub struct PyWhitespace {}
impl PyWhitespace {
#[new]
fn new() -> PyResult<(Self, PyPreTokenizer)> {
Ok((
PyWhitespace {},
PyPreTokenizer::new(Arc::new(Whitespace::default())),
))
Ok((PyWhitespace {}, Whitespace::default().into()))
}
}
@@ -154,10 +164,7 @@ pub struct PyWhitespaceSplit {}
impl PyWhitespaceSplit {
#[new]
fn new() -> PyResult<(Self, PyPreTokenizer)> {
Ok((
PyWhitespaceSplit {},
PyPreTokenizer::new(Arc::new(WhitespaceSplit)),
))
Ok((PyWhitespaceSplit {}, WhitespaceSplit.into()))
}
}
@@ -175,7 +182,7 @@ impl PyCharDelimiterSplit {
))?;
Ok((
PyCharDelimiterSplit {},
PyPreTokenizer::new(Arc::new(CharDelimiterSplit::new(chr_delimiter))),
CharDelimiterSplit::new(chr_delimiter).into(),
))
}
@@ -190,10 +197,7 @@ pub struct PyBertPreTokenizer {}
impl PyBertPreTokenizer {
#[new]
fn new() -> PyResult<(Self, PyPreTokenizer)> {
Ok((
PyBertPreTokenizer {},
PyPreTokenizer::new(Arc::new(BertPreTokenizer)),
))
Ok((PyBertPreTokenizer {}, BertPreTokenizer.into()))
}
}
@@ -225,24 +229,26 @@ impl PyMetaspace {
Ok((
PyMetaspace {},
PyPreTokenizer::new(Arc::new(Metaspace::new(replacement, add_prefix_space))),
Metaspace::new(replacement, add_prefix_space).into(),
))
}
}
// struct CustomPreTokenizer {
// class: PyObject,
// }
//
// impl CustomPreTokenizer {
// pub fn new(class: PyObject) -> PyResult<Self> {
// Ok(CustomPreTokenizer { class })
// }
// }
//
// #[typetag::serde]
// This is not accessible from Python since the custom method is disabled.
#[allow(dead_code)]
pub(crate) struct CustomPreTokenizer {
class: PyObject,
}
impl CustomPreTokenizer {
#[allow(dead_code)]
pub fn new(class: PyObject) -> PyResult<Self> {
Ok(CustomPreTokenizer { class })
}
}
// impl tk::tokenizer::PreTokenizer for CustomPreTokenizer {
// fn pre_tokenize(&self, sentence: &mut NormalizedString) -> tk::Result<Vec<(String, Offsets)>> {
// fn pre_tokenize(&self, sentence: &mut PreTokenizedString) -> tk::Result<()> {
// let gil = Python::acquire_gil();
// let py = gil.python();
//
@@ -269,22 +275,104 @@ impl PyMetaspace {
// }
// }
//
// impl Serialize for CustomPreTokenizer {
// fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
// where
// S: Serializer,
// {
// Err(serde::ser::Error::custom(
// "Custom PyPreTokenizer cannot be serialized",
// ))
// }
// }
//
// impl<'de> Deserialize<'de> for CustomPreTokenizer {
// fn deserialize<D>(_deserializer: D) -> Result<Self, D::Error>
// where
// D: Deserializer<'de>,
// {
// Err(D::Error::custom("PyDecoder cannot be deserialized"))
// }
// }
impl Serialize for CustomPreTokenizer {
fn serialize<S>(&self, _serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
Err(serde::ser::Error::custom(
"Custom PyPreTokenizer cannot be serialized",
))
}
}
impl<'de> Deserialize<'de> for CustomPreTokenizer {
fn deserialize<D>(_deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Err(serde::de::Error::custom(
"Custom PyPreTokenizer cannot be deserialized",
))
}
}
#[derive(Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub(crate) enum PyPreTokenizerWrapper {
Custom(Arc<CustomPreTokenizer>),
Wrapped(Arc<PreTokenizerWrapper>),
}
impl<I> From<I> for PyPreTokenizerWrapper
where
I: Into<PreTokenizerWrapper>,
{
fn from(norm: I) -> Self {
PyPreTokenizerWrapper::Wrapped(Arc::new(norm.into()))
}
}
impl<I> From<I> for PyPreTokenizer
where
I: Into<PreTokenizerWrapper>,
{
fn from(pretok: I) -> Self {
PyPreTokenizer {
pretok: pretok.into().into(),
}
}
}
impl PreTokenizer for PyPreTokenizerWrapper {
fn pre_tokenize(&self, normalized: &mut PreTokenizedString) -> tk::Result<()> {
match self {
PyPreTokenizerWrapper::Wrapped(inner) => inner.pre_tokenize(normalized),
PyPreTokenizerWrapper::Custom(_) => {
unreachable!("Custom pretokenizers are currently disabled, how did you get here?")
}
}
}
}
#[cfg(test)]
mod test {
use pyo3::{AsPyRef, Py, PyObject, Python};
use tk::pre_tokenizers::whitespace::Whitespace;
use tk::pre_tokenizers::PreTokenizerWrapper;
use crate::pre_tokenizers::{CustomPreTokenizer, PyPreTokenizer, PyPreTokenizerWrapper};
use std::sync::Arc;
#[test]
fn get_subtype() {
let py_norm = PyPreTokenizer::new(Whitespace::default().into());
let py_wsp = py_norm.get_as_subtype().unwrap();
let gil = Python::acquire_gil();
assert_eq!(
"tokenizers.pre_tokenizers.Whitespace",
py_wsp.as_ref(gil.python()).get_type().name()
);
}
#[test]
fn serialize() {
let py_wrapped: PyPreTokenizerWrapper = Whitespace::default().into();
let py_ser = serde_json::to_string(&py_wrapped).unwrap();
let rs_wrapped = PreTokenizerWrapper::Whitespace(Whitespace::default());
let rs_ser = serde_json::to_string(&rs_wrapped).unwrap();
assert_eq!(py_ser, rs_ser);
let py_pretok: PyPreTokenizer = serde_json::from_str(&rs_ser).unwrap();
match py_pretok.pretok {
PyPreTokenizerWrapper::Wrapped(wsp) => match wsp.as_ref() {
PreTokenizerWrapper::Whitespace(_) => {}
_ => panic!("Expected Whitespace"),
},
_ => panic!("Expected wrapped, not custom."),
}
let gil = Python::acquire_gil();
let py_wsp = PyPreTokenizer::new(Whitespace::default().into());
let obj: PyObject = Py::new(gil.python(), py_wsp).unwrap().into();
let py_seq: PyPreTokenizerWrapper =
PyPreTokenizerWrapper::Custom(Arc::new(CustomPreTokenizer::new(obj).unwrap()));
assert!(serde_json::to_string(&py_seq).is_err());
}
}
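
get_as_subtype leans on pyo3's tuple initializer: Py::new(py, (Sub {}, base)) builds an instance of the subclass with the base payload already filled in, which is how the tokenizer getters can hand Python a Whitespace instead of a bare PreTokenizer. A reduced sketch against the pyo3 0.11 API pinned by this commit; Base and Sub are hypothetical class names:

use pyo3::prelude::*;

#[pyclass(name=Base)]
#[derive(Clone)]
struct PyBase {
    label: String,
}

#[pyclass(extends=PyBase, name=Sub)]
struct PySub {}

impl PyBase {
    fn get_as_subtype(&self) -> PyResult<PyObject> {
        let base = self.clone();
        let gil = Python::acquire_gil();
        let py = gil.python();
        // (PySub {}, base) converts into a PyClassInitializer<PySub>, so the
        // resulting Python object's type is Sub, not Base.
        Py::new(py, (PySub {}, base)).map(Into::into)
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use pyo3::AsPyRef;

    #[test]
    fn subtype() {
        let obj = PyBase { label: "x".into() }.get_as_subtype().unwrap();
        let gil = Python::acquire_gil();
        assert!(obj.as_ref(gil.python()).get_type().name().ends_with("Sub"));
    }
}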

View File

@@ -4,26 +4,44 @@ use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde::{Deserialize, Serialize};
use tk::processors::bert::BertProcessing;
use tk::processors::byte_level::ByteLevel;
use tk::processors::roberta::RobertaProcessing;
use tk::processors::PostProcessorWrapper;
use tk::{Encoding, PostProcessor};
use tokenizers as tk;
#[pyclass(dict, module = "tokenizers.processors", name=PostProcessor)]
#[derive(Clone)]
#[derive(Clone, Deserialize, Serialize)]
pub struct PyPostProcessor {
pub processor: Arc<dyn PostProcessor>,
#[serde(flatten)]
pub processor: Arc<PostProcessorWrapper>,
}
impl PyPostProcessor {
pub fn new(processor: Arc<dyn PostProcessor>) -> Self {
pub fn new(processor: Arc<PostProcessorWrapper>) -> Self {
PyPostProcessor { processor }
}
pub(crate) fn get_as_subtype(&self) -> PyResult<PyObject> {
let base = self.clone();
let gil = Python::acquire_gil();
let py = gil.python();
match self.processor.as_ref() {
PostProcessorWrapper::ByteLevel(_) => {
Py::new(py, (PyByteLevel {}, base)).map(Into::into)
}
PostProcessorWrapper::Bert(_) => {
Py::new(py, (PyBertProcessing {}, base)).map(Into::into)
}
PostProcessorWrapper::Roberta(_) => {
Py::new(py, (PyRobertaProcessing {}, base)).map(Into::into)
}
}
}
}
#[typetag::serde]
impl PostProcessor for PyPostProcessor {
fn added_tokens(&self, is_pair: bool) -> usize {
self.processor.added_tokens(is_pair)
@@ -40,24 +58,6 @@ impl PostProcessor for PyPostProcessor {
}
}
impl Serialize for PyPostProcessor {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
self.processor.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for PyPostProcessor {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
Ok(PyPostProcessor::new(Arc::deserialize(deserializer)?))
}
}
#[pymethods]
impl PyPostProcessor {
fn __getstate__(&self, py: Python) -> PyResult<PyObject> {
@@ -98,7 +98,7 @@ impl PyBertProcessing {
fn new(sep: (String, u32), cls: (String, u32)) -> PyResult<(Self, PyPostProcessor)> {
Ok((
PyBertProcessing {},
PyPostProcessor::new(Arc::new(BertProcessing::new(sep, cls))),
PyPostProcessor::new(Arc::new(BertProcessing::new(sep, cls).into())),
))
}
@@ -122,7 +122,10 @@ impl PyRobertaProcessing {
let proc = RobertaProcessing::new(sep, cls)
.trim_offsets(trim_offsets)
.add_prefix_space(add_prefix_space);
Ok((PyRobertaProcessing {}, PyPostProcessor::new(Arc::new(proc))))
Ok((
PyRobertaProcessing {},
PyPostProcessor::new(Arc::new(proc.into())),
))
}
fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
@@ -148,6 +151,58 @@ impl PyByteLevel {
}
}
}
Ok((PyByteLevel {}, PyPostProcessor::new(Arc::new(byte_level))))
Ok((
PyByteLevel {},
PyPostProcessor::new(Arc::new(byte_level.into())),
))
}
}
#[cfg(test)]
mod test {
use std::sync::Arc;
use pyo3::{AsPyRef, Python};
use tk::processors::bert::BertProcessing;
use tk::processors::PostProcessorWrapper;
use crate::processors::PyPostProcessor;
#[test]
fn get_subtype() {
let py_proc = PyPostProcessor::new(Arc::new(
BertProcessing::new(("SEP".into(), 0), ("CLS".into(), 1)).into(),
));
let py_bert = py_proc.get_as_subtype().unwrap();
let gil = Python::acquire_gil();
assert_eq!(
"tokenizers.processors.BertProcessing",
py_bert.as_ref(gil.python()).get_type().name()
);
}
#[test]
fn serialize() {
let rs_processing = BertProcessing::new(("SEP".into(), 0), ("CLS".into(), 1));
let rs_wrapper: PostProcessorWrapper = rs_processing.clone().into();
let rs_processing_ser = serde_json::to_string(&rs_processing).unwrap();
let rs_wrapper_ser = serde_json::to_string(&rs_wrapper).unwrap();
let py_processing = PyPostProcessor::new(Arc::new(rs_wrapper.clone()));
let py_ser = serde_json::to_string(&py_processing).unwrap();
assert_eq!(py_ser, rs_processing_ser);
assert_eq!(py_ser, rs_wrapper_ser);
let py_processing: PyPostProcessor = serde_json::from_str(&rs_processing_ser).unwrap();
match py_processing.processor.as_ref() {
PostProcessorWrapper::Bert(_) => (),
_ => panic!("Expected Bert postprocessor."),
}
let py_processing: PyPostProcessor = serde_json::from_str(&rs_wrapper_ser).unwrap();
match py_processing.processor.as_ref() {
PostProcessorWrapper::Bert(_) => (),
_ => panic!("Expected Bert postprocessor."),
}
}
}
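
PyPostProcessor (like PyModel) is the simplest case: there is no Python-defined variant, so it wraps Arc<PostProcessorWrapper> directly and serde(flatten) alone makes it JSON-transparent, which is why the test above can deserialize it from either the concrete processor's JSON or the wrapper's. A matching sketch, assuming serde's derive and rc features, with a simplified Bert payload:

use serde::{Deserialize, Serialize};
use std::sync::Arc;

// Stand-in for tk::processors::PostProcessorWrapper.
#[derive(Serialize, Deserialize)]
#[serde(tag = "type")]
enum PostProcessorWrapper {
    Bert { sep: (String, u32), cls: (String, u32) },
}

#[derive(Serialize, Deserialize)]
struct PyPostProcessor {
    #[serde(flatten)]
    processor: Arc<PostProcessorWrapper>,
}

fn main() {
    let rs = PostProcessorWrapper::Bert {
        sep: ("[SEP]".into(), 102),
        cls: ("[CLS]".into(), 101),
    };
    let rs_ser = serde_json::to_string(&rs).unwrap();
    // The Python-side struct accepts the Rust wrapper's JSON as-is...
    let py: PyPostProcessor = serde_json::from_str(&rs_ser).unwrap();
    // ...and serializes back to the identical string.
    assert_eq!(serde_json::to_string(&py).unwrap(), rs_ser);
}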

View File

@@ -7,8 +7,8 @@ use pyo3::types::*;
use pyo3::PyObjectProtocol;
use tk::models::bpe::BPE;
use tk::tokenizer::{
PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl, TruncationParams,
TruncationStrategy,
PaddingDirection, PaddingParams, PaddingStrategy, PostProcessor, TokenizerImpl,
TruncationParams, TruncationStrategy,
};
use tokenizers as tk;
@@ -695,8 +695,8 @@ impl PyTokenizer {
}
#[getter]
fn get_model(&self) -> PyModel {
self.tokenizer.get_model().clone()
fn get_model(&self) -> PyResult<PyObject> {
self.tokenizer.get_model().get_as_subtype()
}
#[setter]
@@ -705,8 +705,12 @@ impl PyTokenizer {
}
#[getter]
fn get_normalizer(&self) -> Option<PyNormalizer> {
self.tokenizer.get_normalizer().cloned()
fn get_normalizer(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_normalizer() {
n.get_as_subtype()
} else {
Ok(Python::acquire_gil().python().None())
}
}
#[setter]
@@ -715,8 +719,12 @@ impl PyTokenizer {
}
#[getter]
fn get_pre_tokenizer(&self) -> Option<PyPreTokenizer> {
self.tokenizer.get_pre_tokenizer().cloned()
fn get_pre_tokenizer(&self) -> PyResult<PyObject> {
if let Some(pt) = self.tokenizer.get_pre_tokenizer() {
pt.get_as_subtype()
} else {
Ok(Python::acquire_gil().python().None())
}
}
#[setter]
@@ -725,8 +733,12 @@ impl PyTokenizer {
}
#[getter]
fn get_post_processor(&self) -> Option<PyPostProcessor> {
self.tokenizer.get_post_processor().cloned()
fn get_post_processor(&self) -> PyResult<PyObject> {
if let Some(n) = self.tokenizer.get_post_processor() {
n.get_as_subtype()
} else {
Ok(Python::acquire_gil().python().None())
}
}
#[setter]
@@ -735,8 +747,12 @@ impl PyTokenizer {
}
#[getter]
fn get_decoder(&self) -> Option<PyDecoder> {
self.tokenizer.get_decoder().cloned()
fn get_decoder(&self) -> PyResult<PyObject> {
if let Some(dec) = self.tokenizer.get_decoder() {
dec.get_as_subtype()
} else {
Ok(Python::acquire_gil().python().None())
}
}
#[setter]