mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
[Fix #17] BPE & WordPiece models saving
This commit is contained in:
74
bindings/python/Cargo.lock
generated
74
bindings/python/Cargo.lock
generated
@ -35,6 +35,14 @@ name = "bitflags"
|
|||||||
version = "1.2.1"
|
version = "1.2.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "c2-chacha"
|
||||||
|
version = "0.2.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cfg-if"
|
name = "cfg-if"
|
||||||
version = "0.1.10"
|
version = "0.1.10"
|
||||||
@ -109,6 +117,16 @@ name = "either"
|
|||||||
version = "1.5.3"
|
version = "1.5.3"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "getrandom"
|
||||||
|
version = "0.1.13"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"wasi 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ghost"
|
name = "ghost"
|
||||||
version = "0.1.1"
|
version = "0.1.1"
|
||||||
@ -233,6 +251,11 @@ dependencies = [
|
|||||||
"syn 1.0.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
"syn 1.0.11 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ppv-lite86"
|
||||||
|
version = "0.2.6"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro-hack"
|
name = "proc-macro-hack"
|
||||||
version = "0.5.11"
|
version = "0.5.11"
|
||||||
@ -300,6 +323,43 @@ dependencies = [
|
|||||||
"proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
"proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand"
|
||||||
|
version = "0.7.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_chacha 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_hc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_chacha"
|
||||||
|
version = "0.2.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"c2-chacha 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_core"
|
||||||
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"getrandom 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "rand_hc"
|
||||||
|
version = "0.2.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "rayon"
|
name = "rayon"
|
||||||
version = "1.3.0"
|
version = "1.3.0"
|
||||||
@ -452,6 +512,7 @@ version = "0.0.11"
|
|||||||
dependencies = [
|
dependencies = [
|
||||||
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
"rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"regex-syntax 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)",
|
"regex-syntax 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
@ -498,6 +559,11 @@ name = "version_check"
|
|||||||
version = "0.9.1"
|
version = "0.9.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wasi"
|
||||||
|
version = "0.7.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "winapi"
|
name = "winapi"
|
||||||
version = "0.3.8"
|
version = "0.3.8"
|
||||||
@ -523,6 +589,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
"checksum atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)" = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90"
|
"checksum atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)" = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90"
|
||||||
"checksum autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2"
|
"checksum autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2"
|
||||||
"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
|
||||||
|
"checksum c2-chacha 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "214238caa1bf3a496ec3392968969cab8549f96ff30652c9e56885329315f6bb"
|
||||||
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
|
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
|
||||||
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
|
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
|
||||||
"checksum crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c3aa945d63861bfe624b55d153a39684da1e8c0bc8fba932f7ee3a3c16cea3ca"
|
"checksum crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c3aa945d63861bfe624b55d153a39684da1e8c0bc8fba932f7ee3a3c16cea3ca"
|
||||||
@ -531,6 +598,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
"checksum crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ce446db02cdc3165b94ae73111e570793400d0794e46125cc4056c81cbb039f4"
|
"checksum crossbeam-utils 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ce446db02cdc3165b94ae73111e570793400d0794e46125cc4056c81cbb039f4"
|
||||||
"checksum ctor 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "cd8ce37ad4184ab2ce004c33bf6379185d3b1c95801cab51026bd271bf68eedc"
|
"checksum ctor 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "cd8ce37ad4184ab2ce004c33bf6379185d3b1c95801cab51026bd271bf68eedc"
|
||||||
"checksum either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
|
"checksum either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "bb1f6b1ce1c140482ea30ddd3335fc0024ac7ee112895426e0a629a6c20adfe3"
|
||||||
|
"checksum getrandom 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "e7db7ca94ed4cd01190ceee0d8a8052f08a247aa1b469a7f68c6a3b71afcf407"
|
||||||
"checksum ghost 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2a36606a68532b5640dc86bb1f33c64b45c4682aad4c50f3937b317ea387f3d6"
|
"checksum ghost 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "2a36606a68532b5640dc86bb1f33c64b45c4682aad4c50f3937b317ea387f3d6"
|
||||||
"checksum hermit-abi 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f629dc602392d3ec14bfc8a09b5e644d7ffd725102b48b81e59f90f2633621d7"
|
"checksum hermit-abi 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "f629dc602392d3ec14bfc8a09b5e644d7ffd725102b48b81e59f90f2633621d7"
|
||||||
"checksum indoc 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "3f9553c1e16c114b8b77ebeb329e5f2876eed62a8d51178c8bc6bff0d65f98f8"
|
"checksum indoc 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "3f9553c1e16c114b8b77ebeb329e5f2876eed62a8d51178c8bc6bff0d65f98f8"
|
||||||
@ -546,12 +614,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
"checksum num_cpus 1.11.1 (registry+https://github.com/rust-lang/crates.io-index)" = "76dac5ed2a876980778b8b85f75a71b6cbf0db0b1232ee12f826bccb00d09d72"
|
"checksum num_cpus 1.11.1 (registry+https://github.com/rust-lang/crates.io-index)" = "76dac5ed2a876980778b8b85f75a71b6cbf0db0b1232ee12f826bccb00d09d72"
|
||||||
"checksum paste 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "423a519e1c6e828f1e73b720f9d9ed2fa643dce8a7737fb43235ce0b41eeaa49"
|
"checksum paste 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "423a519e1c6e828f1e73b720f9d9ed2fa643dce8a7737fb43235ce0b41eeaa49"
|
||||||
"checksum paste-impl 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "4214c9e912ef61bf42b81ba9a47e8aad1b2ffaf739ab162bf96d1e011f54e6c5"
|
"checksum paste-impl 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "4214c9e912ef61bf42b81ba9a47e8aad1b2ffaf739ab162bf96d1e011f54e6c5"
|
||||||
|
"checksum ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "74490b50b9fbe561ac330df47c08f3f33073d2d00c150f719147d7c54522fa1b"
|
||||||
"checksum proc-macro-hack 0.5.11 (registry+https://github.com/rust-lang/crates.io-index)" = "ecd45702f76d6d3c75a80564378ae228a85f0b59d2f3ed43c91b4a69eb2ebfc5"
|
"checksum proc-macro-hack 0.5.11 (registry+https://github.com/rust-lang/crates.io-index)" = "ecd45702f76d6d3c75a80564378ae228a85f0b59d2f3ed43c91b4a69eb2ebfc5"
|
||||||
"checksum proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9c9e470a8dc4aeae2dee2f335e8f533e2d4b347e1434e5671afc49b054592f27"
|
"checksum proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9c9e470a8dc4aeae2dee2f335e8f533e2d4b347e1434e5671afc49b054592f27"
|
||||||
"checksum pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7f9df1468dddf8a59ec799cf3b930bb75ec09deabe875ba953e06c51d1077136"
|
"checksum pyo3 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "7f9df1468dddf8a59ec799cf3b930bb75ec09deabe875ba953e06c51d1077136"
|
||||||
"checksum pyo3-derive-backend 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "9f6e56fb3e97b344a8f87d036f94578399402c6b75949de6270cd07928f790b1"
|
"checksum pyo3-derive-backend 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "9f6e56fb3e97b344a8f87d036f94578399402c6b75949de6270cd07928f790b1"
|
||||||
"checksum pyo3cls 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "97452dcdf5941627ebc5c06664a07821fc7fc88d7515f02178193a8ebe316468"
|
"checksum pyo3cls 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "97452dcdf5941627ebc5c06664a07821fc7fc88d7515f02178193a8ebe316468"
|
||||||
"checksum quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "053a8c8bcc71fcce321828dc897a98ab9760bef03a4fc36693c231e5b3216cfe"
|
"checksum quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "053a8c8bcc71fcce321828dc897a98ab9760bef03a4fc36693c231e5b3216cfe"
|
||||||
|
"checksum rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "3ae1b169243eaf61759b8475a998f0a385e42042370f3a7dbaf35246eacc8412"
|
||||||
|
"checksum rand_chacha 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "03a2a90da8c7523f554344f921aa97283eadf6ac484a6d2a7d0212fa7f8d6853"
|
||||||
|
"checksum rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
|
||||||
|
"checksum rand_hc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
|
||||||
"checksum rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098"
|
"checksum rayon 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "db6ce3297f9c85e16621bb8cca38a06779ffc31bb8184e1be4bed2be4678a098"
|
||||||
"checksum rayon-core 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9"
|
"checksum rayon-core 1.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "08a89b46efaf957e52b18062fb2f4660f8b8a4dde1807ca002690868ef2c85a9"
|
||||||
"checksum regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dc220bd33bdce8f093101afe22a037b8eb0e5af33592e6a9caafff0d4cb81cbd"
|
"checksum regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dc220bd33bdce8f093101afe22a037b8eb0e5af33592e6a9caafff0d4cb81cbd"
|
||||||
@ -577,6 +650,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
"checksum unindent 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "63f18aa3b0e35fed5a0048f029558b1518095ffe2a0a31fb87c93dece93a4993"
|
"checksum unindent 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "63f18aa3b0e35fed5a0048f029558b1518095ffe2a0a31fb87c93dece93a4993"
|
||||||
"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a"
|
"checksum vec_map 0.8.1 (registry+https://github.com/rust-lang/crates.io-index)" = "05c78687fb1a80548ae3250346c3db86a80a7cdd77bda190189f2d0a0987c81a"
|
||||||
"checksum version_check 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)" = "078775d0255232fb988e6fccf26ddc9d1ac274299aaedcedce21c6f72cc533ce"
|
"checksum version_check 0.9.1 (registry+https://github.com/rust-lang/crates.io-index)" = "078775d0255232fb988e6fccf26ddc9d1ac274299aaedcedce21c6f72cc533ce"
|
||||||
|
"checksum wasi 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b89c3ce4ce14bdc6fb6beaf9ec7928ca331de5df7e5ea278375642a2f478570d"
|
||||||
"checksum winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6"
|
"checksum winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)" = "8093091eeb260906a183e6ae1abdba2ef5ef2257a21801128899c3fc699229c6"
|
||||||
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
"checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
|
||||||
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
"checksum winapi-x86_64-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
extern crate tokenizers as tk;
|
extern crate tokenizers as tk;
|
||||||
|
|
||||||
|
use super::error::ToPyResult;
|
||||||
use super::utils::Container;
|
use super::utils::Container;
|
||||||
|
|
||||||
use pyo3::exceptions;
|
use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::*;
|
use pyo3::types::*;
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
/// A Model represents some tokenization algorithm like BPE or Word
|
/// A Model represents some tokenization algorithm like BPE or Word
|
||||||
/// This class cannot be constructed directly. Please use one of the concrete models.
|
/// This class cannot be constructed directly. Please use one of the concrete models.
|
||||||
@ -21,6 +22,19 @@ impl Model {
|
|||||||
"Cannot create a Model directly. Use a concrete subclass",
|
"Cannot create a Model directly. Use a concrete subclass",
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn save(&self, folder: &str, name: &str) -> PyResult<Vec<String>> {
|
||||||
|
let saved: PyResult<Vec<_>> = ToPyResult(
|
||||||
|
self.model
|
||||||
|
.execute(|model| model.save(Path::new(folder), name)),
|
||||||
|
)
|
||||||
|
.into();
|
||||||
|
|
||||||
|
Ok(saved?
|
||||||
|
.into_iter()
|
||||||
|
.map(|path| path.to_string_lossy().into_owned())
|
||||||
|
.collect())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// BPE Model
|
/// BPE Model
|
||||||
|
@ -7,6 +7,7 @@ use std::{
|
|||||||
fs::File,
|
fs::File,
|
||||||
io::prelude::*,
|
io::prelude::*,
|
||||||
io::{BufRead, BufReader},
|
io::{BufRead, BufReader},
|
||||||
|
path::{Path, PathBuf},
|
||||||
};
|
};
|
||||||
|
|
||||||
pub struct BPE {
|
pub struct BPE {
|
||||||
@ -260,6 +261,37 @@ impl Model for BPE {
|
|||||||
fn id_to_token(&self, id: u32) -> Option<String> {
|
fn id_to_token(&self, id: u32) -> Option<String> {
|
||||||
self.vocab_r.get(&id).cloned()
|
self.vocab_r.get(&id).cloned()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn save(&self, folder: &Path, name: &str) -> Result<Vec<PathBuf>> {
|
||||||
|
// Write vocab.json
|
||||||
|
let vocab_path: PathBuf = [folder, Path::new(&format!("{}-vocab.json", name))]
|
||||||
|
.iter()
|
||||||
|
.collect();
|
||||||
|
let mut vocab_file = File::create(&vocab_path)?;
|
||||||
|
let serialized = serde_json::to_string(&self.vocab)?;
|
||||||
|
vocab_file.write_all(&serialized.as_bytes())?;
|
||||||
|
|
||||||
|
// Write merges.txt
|
||||||
|
let merges_path: PathBuf = [folder, Path::new(&format!("{}-merges.txt", name))]
|
||||||
|
.iter()
|
||||||
|
.collect();
|
||||||
|
let mut merges_file = File::create(&merges_path)?;
|
||||||
|
let mut merges: Vec<(Pair, u32)> = self
|
||||||
|
.merges
|
||||||
|
.iter()
|
||||||
|
.map(|(pair, (rank, _))| (*pair, *rank))
|
||||||
|
.collect();
|
||||||
|
merges.sort_unstable_by_key(|k| k.1);
|
||||||
|
merges_file.write_all(
|
||||||
|
&merges
|
||||||
|
.into_iter()
|
||||||
|
.map(|(pair, _)| format!("{} {}\n", pair.0, pair.1).into_bytes())
|
||||||
|
.flatten()
|
||||||
|
.collect::<Vec<_>>()[..],
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(vec![vocab_path, merges_path])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -3,7 +3,9 @@ use std::{
|
|||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
fmt,
|
fmt,
|
||||||
fs::File,
|
fs::File,
|
||||||
|
io::prelude::*,
|
||||||
io::{BufRead, BufReader},
|
io::{BufRead, BufReader},
|
||||||
|
path::{Path, PathBuf},
|
||||||
};
|
};
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
@ -145,6 +147,25 @@ impl Model for WordPiece {
|
|||||||
fn id_to_token(&self, id: u32) -> Option<String> {
|
fn id_to_token(&self, id: u32) -> Option<String> {
|
||||||
self.vocab_r.get(&id).cloned()
|
self.vocab_r.get(&id).cloned()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn save(&self, folder: &Path, name: &str) -> Result<Vec<PathBuf>> {
|
||||||
|
// Write vocab.txt
|
||||||
|
let vocab_path: PathBuf = [folder, Path::new(&format!("{}-vocab.txt", name))]
|
||||||
|
.iter()
|
||||||
|
.collect();
|
||||||
|
let mut vocab_file = File::create(&vocab_path)?;
|
||||||
|
let mut vocab: Vec<(&String, &u32)> = self.vocab.iter().collect();
|
||||||
|
vocab.sort_unstable_by_key(|k| *k.1);
|
||||||
|
vocab_file.write_all(
|
||||||
|
&vocab
|
||||||
|
.into_iter()
|
||||||
|
.map(|(token, _)| token.as_bytes().to_owned())
|
||||||
|
.flatten()
|
||||||
|
.collect::<Vec<_>>()[..],
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(vec![vocab_path])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
@ -18,6 +18,7 @@ use std::{
|
|||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
fs::File,
|
fs::File,
|
||||||
io::{BufRead, BufReader},
|
io::{BufRead, BufReader},
|
||||||
|
path::{Path, PathBuf},
|
||||||
};
|
};
|
||||||
|
|
||||||
mod encoding;
|
mod encoding;
|
||||||
@ -40,6 +41,7 @@ pub trait Model {
|
|||||||
fn token_to_id(&self, token: &str) -> Option<u32>;
|
fn token_to_id(&self, token: &str) -> Option<u32>;
|
||||||
fn id_to_token(&self, id: u32) -> Option<String>;
|
fn id_to_token(&self, id: u32) -> Option<String>;
|
||||||
fn get_vocab_size(&self) -> usize;
|
fn get_vocab_size(&self) -> usize;
|
||||||
|
fn save(&self, folder: &Path, name: &str) -> Result<Vec<PathBuf>>;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
|
/// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
|
||||||
|
Reference in New Issue
Block a user