diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md index 13cea2d5..20c4fc4f 100644 --- a/bindings/python/CHANGELOG.md +++ b/bindings/python/CHANGELOG.md @@ -4,12 +4,16 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.11.0] ### Added - [#657]: Add SplitDelimiterBehavior customization to Punctuation constructor ### Changed +- [#718]: Fix `WordLevel` tokenizer determinism during training +- [#762]: Add a way to specify the unknown token in `SentencePieceUnigramTokenizer` +- [#770]: Improved documentation for `UnigramTrainer` +- [#780]: Add `Tokenizer.from_pretrained` to load tokenizers from the Hugging Face Hub - [#793]: Saving a pretty JSON file by default when saving a tokenizer ## [0.10.3] @@ -330,6 +334,10 @@ delimiter (Works like `.split(delimiter)`) [#793]: https://github.com/huggingface/tokenizers/pull/793 +[#780]: https://github.com/huggingface/tokenizers/pull/780 +[#770]: https://github.com/huggingface/tokenizers/pull/770 +[#762]: https://github.com/huggingface/tokenizers/pull/762 +[#718]: https://github.com/huggingface/tokenizers/pull/718 [#714]: https://github.com/huggingface/tokenizers/pull/714 [#707]: https://github.com/huggingface/tokenizers/pull/707 [#693]: https://github.com/huggingface/tokenizers/pull/693 diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 93885742..4f7aee56 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -63,9 +63,9 @@ checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" [[package]] name = "bitflags" -version = "1.2.1" +version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitvec" @@ -102,9 +102,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.0.1" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" +checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" [[package]] name = "bzip2" @@ -152,9 +152,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.69" +version = "1.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e70cc2f62c6ce1868963827bd677764c62d07c3d9a3e1fb1177ee1a9ab199eb2" +checksum = "d26a6ce4b6a484fa3edb70f7efa6fc430fd2b87285fe8b84304fd0936faa0dc0" [[package]] name = "cfg-if" @@ -216,9 +216,9 @@ checksum = "ea221b5284a47e40033bf9b66f35f984ec0ea2931eb03505246cd27a963f981b" [[package]] name = "cpufeatures" -version = "0.1.5" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66c99696f6c9dd7f35d486b9d04d7e6e202aa3e8c40d553f2fdf5e7e0c6a71ef" +checksum = "95059428f66df56b63431fdb4e1947ed2190586af5c5a8a8b71122bdf5a7f469" dependencies = [ "libc", ] @@ -244,9 +244,9 @@ dependencies = [ [[package]] name = "crossbeam-deque" -version = "0.8.0" +version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" +checksum = "6455c0ca19f0d2fbf751b908d5c55c1f5cbc65e03c4225427254b46890bdde1e" dependencies = [ "cfg-if 1.0.0", "crossbeam-epoch", @@ -278,9 +278,9 @@ dependencies = [ [[package]] name = "ctor" -version = "0.1.20" +version = "0.1.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e98e2ad1a782e33928b96fc3948e7c355e5af34ba4de7670fe8bac2a3b2006d" +checksum = "ccc0a48a9b826acdf4028595adc9db92caea352f7af011a3034acd172a52a0aa" dependencies = [ "quote", "syn", @@ -411,9 +411,9 @@ dependencies = [ [[package]] name = "esaxx-rs" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "138c07ec623d898d1cfa74d7f3608be24070e1989e5e6c7e8d9145eba61b1dc7" +checksum = "6f4617b351b734e97a3dd32022a721471349aa3038d4132beee8568cdfa7e716" dependencies = [ "cc", ] @@ -432,9 +432,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.20" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd3aec53de10fe96d7d8c565eb17f2c687bb5518a2ec453b5b1252964526abe0" +checksum = "80edafed416a46fb378521624fab1cfa2eb514784fd8921adbe8a8d8321da811" dependencies = [ "cfg-if 1.0.0", "crc32fast", @@ -491,42 +491,42 @@ checksum = "fed34cd105917e91daa4da6b3728c47b068749d6a62c59811f06ed2ac71d9da7" [[package]] name = "futures-channel" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74ed2411805f6e4e3d9bc904c95d5d423b89b3b25dc0250aa74729de20629ff9" +checksum = "5da6ba8c3bb3c165d3c7319fc1cc8304facf1fb8db99c5de877183c08a273888" dependencies = [ "futures-core", ] [[package]] name = "futures-core" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af51b1b4a7fdff033703db39de8802c673eb91855f2e0d47dcf3bf2c0ef01f99" +checksum = "88d1c26957f23603395cd326b0ffe64124b818f4449552f960d815cfba83a53d" [[package]] name = "futures-io" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b0e06c393068f3a6ef246c75cdca793d6a46347e75286933e5e75fd2fd11582" +checksum = "522de2a0fe3e380f1bc577ba0474108faf3f6b18321dbf60b3b9c39a75073377" [[package]] name = "futures-sink" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0f30aaa67363d119812743aa5f33c201a7a66329f97d1a887022971feea4b53" +checksum = "36ea153c13024fe480590b3e3d4cad89a0cfacecc24577b68f86c6ced9c2bc11" [[package]] name = "futures-task" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbe54a98670017f3be909561f6ad13e810d9a51f3f061b902062ca3da80799f2" +checksum = "1d3d00f4eddb73e498a54394f228cd55853bdf059259e8e7bc6e69d408892e99" [[package]] name = "futures-util" -version = "0.3.16" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67eb846bfd58e44a8481a00049e82c43e0ccb5d61f8dc071057cb19249dd4d78" +checksum = "36568465210a3a6ee45e1f165136d68671471a501e632e9a98d96872222b5481" dependencies = [ "autocfg", "futures-core", @@ -589,9 +589,9 @@ checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" [[package]] name = "h2" -version = "0.3.3" +version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "825343c4eef0b63f541f8903f395dc5beb362a979b5799a84062527ef1e37726" +checksum = "d7f3675cfef6a30c8031cf9e6493ebdc3bb3272a3fea3923c4210d1830e6a472" dependencies = [ "bytes", "fnv", @@ -645,9 +645,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.4.1" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68" +checksum = "acd94fdbe1d4ff688b67b04eee2e17bd50995534a61539e45adfefb45e5e5503" [[package]] name = "httpdate" @@ -666,9 +666,9 @@ dependencies = [ [[package]] name = "hyper" -version = "0.14.11" +version = "0.14.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b61cf2d1aebcf6e6352c97b81dc2244ca29194be1b276f5d8ad5c6330fffb11" +checksum = "13f67199e765030fa08fe0bd581af683f0d5bc04ea09c2b1102012c5fb90e7fd" dependencies = [ "bytes", "futures-channel", @@ -820,15 +820,15 @@ dependencies = [ [[package]] name = "itoa" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" [[package]] name = "js-sys" -version = "0.3.52" +version = "0.3.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce791b7ca6638aae45be056e068fc756d871eb3b3b10b8efa62d1c9cec616752" +checksum = "e4bf49d50e2961077d9c99f4b7997d770a1114f087c3c2e0069b36c13fc2979d" dependencies = [ "wasm-bindgen", ] @@ -854,15 +854,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.98" +version = "0.2.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" +checksum = "3cb00336871be5ed2c8ed44b60ae9959dc5b9f08539422ed43f09e34ecaeba21" [[package]] name = "lock_api" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0382880606dff6d15c9476c416d18690b72742aa7b605bb6dd6ec9030fbf07eb" +checksum = "712a4d093c9976e24e7dbca41db895dabcbac38eb5f4045393d17a95bdfb1109" dependencies = [ "scopeguard", ] @@ -1125,9 +1125,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d7744ac029df22dca6284efe4e898991d28e3085c706c972bcd7da4a27a15eb" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" dependencies = [ "instant", "lock_api", @@ -1136,9 +1136,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.8.3" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa7a782938e745763fe6907fc6ba86946d72f49fe7e21de074e08128a99fb018" +checksum = "d76e8e1493bcac0d2766c42737f34458f1c8c50c0d23bcb24ea953affb273216" dependencies = [ "cfg-if 1.0.0", "instant", @@ -1205,9 +1205,9 @@ checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" [[package]] name = "proc-macro2" -version = "1.0.27" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0d8caf72986c1a598726adc988bb5984792ef84f5ee5aa50209145ee8077038" +checksum = "b9f5105d4fdaab20335ca9565e106a5d9b82b6219b5ba735731124ac6711d23d" dependencies = [ "unicode-xid", ] @@ -1396,9 +1396,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.2.9" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ab49abadf3f9e1c4bc499e8845e152ad87d2ad2d30371841171169e9d75feee" +checksum = "8383f39639269cde97d255a32bdb68c047337295414940c68bdd30c2e13203ff" dependencies = [ "bitflags", ] @@ -1497,9 +1497,9 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "security-framework" -version = "2.3.1" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23a2ac85147a3a11d77ecf1bc7166ec0b92febfa4461c37944e180f319ece467" +checksum = "525bc1abfda2e1998d152c45cf13e696f76d0a4972310b22fac1658b05df7c87" dependencies = [ "bitflags", "core-foundation", @@ -1510,9 +1510,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.3.0" +version = "2.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e4effb91b4b8b6fb7732e670b6cee160278ff8e6bf485c7805d9e319d76e284" +checksum = "a9dd14d83160b528b7bfd66439110573efcfbe281b17fc2ca9f39f550d619c7e" dependencies = [ "core-foundation-sys", "libc", @@ -1520,18 +1520,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.126" +version = "1.0.130" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" +checksum = "f12d06de37cf59146fbdecab66aa99f9fe4f78722e3607577a5375d66bd0c913" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.126" +version = "1.0.130" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43" +checksum = "d7bc1a1ab1961464eae040d96713baa5a724a8152c1222492465b54322ec508b" dependencies = [ "proc-macro2", "quote", @@ -1540,9 +1540,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.64" +version = "1.0.67" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "799e97dc9fdae36a5c8b8f2cae9ce2ee9fdce2058c57a93e6099d919fd982f79" +checksum = "a7f9e390c27c3c0ce8bc5d725f6e4d30a29d26659494aa4b17535f7522c5c950" dependencies = [ "itoa", "ryu", @@ -1563,9 +1563,9 @@ dependencies = [ [[package]] name = "sha2" -version = "0.9.5" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b362ae5752fd2137731f9fa25fd4d9058af34666ca1966fb969119cc35719f12" +checksum = "9204c41a1597a8c5af23c82d1c921cb01ec0a4c59e07a9c7306062829a3903f3" dependencies = [ "block-buffer", "cfg-if 1.0.0", @@ -1628,9 +1628,9 @@ checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c" [[package]] name = "syn" -version = "1.0.73" +version = "1.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f71489ff30030d2ae598524f61326b902466f72a0fb1a8564c001cc63425bcc7" +checksum = "c6f107db402c2c2055242dbf4d2af0e69197202e9faacbef9571bbe47f5a1b84" dependencies = [ "proc-macro2", "quote", @@ -1698,18 +1698,18 @@ dependencies = [ [[package]] name = "thiserror" -version = "1.0.26" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93119e4feac1cbe6c798c34d3a53ea0026b0b1de6a120deef895137c0529bfe2" +checksum = "602eca064b2d83369e2b2f34b09c70b605402801927c65c11071ac911d299b88" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.26" +version = "1.0.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "060d69a0afe7796bf42e9e2ff91f5ee691fb15c53d38b4b62a9a53eb23164745" +checksum = "bad553cc2c78e8de258400763a647e80e6d1b31ee237275d756f6836d204494c" dependencies = [ "proc-macro2", "quote", @@ -1771,7 +1771,7 @@ dependencies = [ [[package]] name = "tokenizers-python" -version = "0.10.3" +version = "0.11.0" dependencies = [ "env_logger", "itertools 0.9.0", @@ -1789,9 +1789,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cf844b23c6131f624accf65ce0e4e9956a8bb329400ea5bcc26ae3a5c20b0b" +checksum = "b4efe6fc2395938c8155973d7be49fe8d03a843726e285e100a8a383cc0154ce" dependencies = [ "autocfg", "bytes", @@ -1815,9 +1815,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.6.7" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1caa0b0c8d94a049db56b5acf8cba99dc0623aab1b26d5b5f5e2d945846b3592" +checksum = "08d3725d3efa29485e87311c5b699de63cde14b00ed4d256b8318aa30ca452cd" dependencies = [ "bytes", "futures-core", @@ -1861,9 +1861,9 @@ checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" [[package]] name = "typenum" -version = "1.13.0" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879f6906492a7cd215bfa4cf595b600146ccfac0c79bcbd1f3000162af5e8b06" +checksum = "b63708a265f51345575b27fe43f9500ad611579e764c79edbc2037b1121959ec" [[package]] name = "unicode-bidi" @@ -1973,9 +1973,9 @@ checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasm-bindgen" -version = "0.2.75" +version = "0.2.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b608ecc8f4198fe8680e2ed18eccab5f0cd4caaf3d83516fa5fb2e927fda2586" +checksum = "8ce9b1b516211d33767048e5d47fa2a381ed8b76fc48d2ce4aa39877f9f183e0" dependencies = [ "cfg-if 1.0.0", "serde", @@ -1985,9 +1985,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.75" +version = "0.2.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "580aa3a91a63d23aac5b6b267e2d13cb4f363e31dce6c352fca4752ae12e479f" +checksum = "cfe8dc78e2326ba5f845f4b5bf548401604fa20b1dd1d365fb73b6c1d6364041" dependencies = [ "bumpalo", "lazy_static", @@ -2000,9 +2000,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.25" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16646b21c3add8e13fdb8f20172f8a28c3dbf62f45406bcff0233188226cfe0c" +checksum = "95fded345a6559c2cfee778d562300c581f7d4ff3edb9b0d230d69800d213972" dependencies = [ "cfg-if 1.0.0", "js-sys", @@ -2012,9 +2012,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.75" +version = "0.2.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "171ebf0ed9e1458810dfcb31f2e766ad6b3a89dbda42d8901f2b268277e5f09c" +checksum = "44468aa53335841d9d6b6c023eaab07c0cd4bddbcfdee3e2bb1e8d2cb8069fef" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2022,9 +2022,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.75" +version = "0.2.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c2657dd393f03aa2a659c25c6ae18a13a4048cebd220e147933ea837efc589f" +checksum = "0195807922713af1e67dc66132c7328206ed9766af3858164fb583eedc25fbad" dependencies = [ "proc-macro2", "quote", @@ -2035,15 +2035,15 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.75" +version = "0.2.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e0c4a743a309662d45f4ede961d7afa4ba4131a59a639f29b0069c3798bbcc2" +checksum = "acdb075a845574a1fa5f09fd77e43f7747599301ea3417a9fbffdeedfc1f4a29" [[package]] name = "web-sys" -version = "0.3.52" +version = "0.3.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01c70a82d842c9979078c772d4a1344685045f1a5628f677c2b2eab4dd7d2696" +checksum = "224b2f6b67919060055ef1a67807367c2066ed520c3862cc013d26cf893a783c" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 72a17e47..7ac0dd9a 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tokenizers-python" -version = "0.10.3" +version = "0.11.0" authors = ["Anthony MOI "] edition = "2018" diff --git a/bindings/python/py_src/tokenizers/__init__.py b/bindings/python/py_src/tokenizers/__init__.py index efd3628d..27b1b313 100644 --- a/bindings/python/py_src/tokenizers/__init__.py +++ b/bindings/python/py_src/tokenizers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.10.3" +__version__ = "0.11.0" from typing import Tuple, Union, Tuple, List from enum import Enum diff --git a/bindings/python/setup.py b/bindings/python/setup.py index f764c5e9..8433a4f4 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -7,7 +7,7 @@ extras["docs"] = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"] setup( name="tokenizers", - version="0.10.3", + version="0.11.0", description="Fast and Customizable Tokenizers", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown",