mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-03 07:49:22 +00:00
Add failing test for from_file
This commit is contained in:
24
bindings/python/Cargo.lock
generated
24
bindings/python/Cargo.lock
generated
@ -618,6 +618,14 @@ name = "regex-syntax"
|
|||||||
version = "0.6.18"
|
version = "0.6.18"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "remove_dir_all"
|
||||||
|
version = "0.5.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "ryu"
|
name = "ryu"
|
||||||
version = "1.0.5"
|
version = "1.0.5"
|
||||||
@ -676,6 +684,19 @@ dependencies = [
|
|||||||
"unicode-xid 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"unicode-xid 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "tempfile"
|
||||||
|
version = "3.1.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"rand 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"redox_syscall 0.1.57 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"remove_dir_all 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "termcolor"
|
name = "termcolor"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
@ -750,6 +771,7 @@ dependencies = [
|
|||||||
"rayon 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
"rayon 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"serde_json 1.0.57 (registry+https://github.com/rust-lang/crates.io-index)",
|
"serde_json 1.0.57 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
|
"tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||||
"tokenizers 0.10.1",
|
"tokenizers 0.10.1",
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -891,6 +913,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
"checksum redox_syscall 0.1.57 (registry+https://github.com/rust-lang/crates.io-index)" = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
|
"checksum redox_syscall 0.1.57 (registry+https://github.com/rust-lang/crates.io-index)" = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
|
||||||
"checksum regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
|
"checksum regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
|
||||||
"checksum regex-syntax 0.6.18 (registry+https://github.com/rust-lang/crates.io-index)" = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8"
|
"checksum regex-syntax 0.6.18 (registry+https://github.com/rust-lang/crates.io-index)" = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8"
|
||||||
|
"checksum remove_dir_all 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
|
||||||
"checksum ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
|
"checksum ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
|
||||||
"checksum scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
"checksum scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
||||||
"checksum serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)" = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3"
|
"checksum serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)" = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3"
|
||||||
@ -899,6 +922,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
"checksum smallvec 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3757cb9d89161a2f24e1cf78efa0c1fcff485d18e3f55e0aa3480824ddaa0f3f"
|
"checksum smallvec 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3757cb9d89161a2f24e1cf78efa0c1fcff485d18e3f55e0aa3480824ddaa0f3f"
|
||||||
"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
||||||
"checksum syn 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "239f255b9e3429350f188c27b807fc9920a15eb9145230ff1a7d054c08fec319"
|
"checksum syn 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "239f255b9e3429350f188c27b807fc9920a15eb9145230ff1a7d054c08fec319"
|
||||||
|
"checksum tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
|
||||||
"checksum termcolor 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f"
|
"checksum termcolor 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f"
|
||||||
"checksum terminal_size 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "9a14cd9f8c72704232f0bfc8455c0e861f0ad4eb60cc9ec8a170e231414c1e13"
|
"checksum terminal_size 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "9a14cd9f8c72704232f0bfc8455c0e861f0ad4eb60cc9ec8a170e231414c1e13"
|
||||||
"checksum termios 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f0fcee7b24a25675de40d5bb4de6e41b0df07bc9856295e7e2b3a3600c400c2"
|
"checksum termios 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f0fcee7b24a25675de40d5bb4de6e41b0df07bc9856295e7e2b3a3600c400c2"
|
||||||
|
@ -24,6 +24,9 @@ version = "0.11"
|
|||||||
version = "*"
|
version = "*"
|
||||||
path = "../../tokenizers"
|
path = "../../tokenizers"
|
||||||
|
|
||||||
|
[dev-dependencies]
|
||||||
|
tempfile = "3.1"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = ["pyo3/extension-module"]
|
default = ["pyo3/extension-module"]
|
||||||
|
|
||||||
|
@ -422,10 +422,8 @@ impl PyTokenizer {
|
|||||||
|
|
||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
fn from_file(path: &str) -> PyResult<Self> {
|
fn from_file(path: &str) -> PyResult<Self> {
|
||||||
let tokenizer: PyResult<_> = ToPyResult(TokenizerImpl::from_file(path)).into();
|
let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
|
||||||
Ok(Self {
|
Ok(Self::new(tokenizer?))
|
||||||
tokenizer: tokenizer?,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
@ -840,3 +838,29 @@ impl PyTokenizer {
|
|||||||
self.tokenizer.with_decoder(decoder.clone());
|
self.tokenizer.with_decoder(decoder.clone());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod test {
|
||||||
|
use super::*;
|
||||||
|
use crate::models::PyModel;
|
||||||
|
use crate::normalizers::{PyNormalizer, PyNormalizerWrapper};
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tempfile::NamedTempFile;
|
||||||
|
use tk::normalizers::{Lowercase, NFKC};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn serialize() {
|
||||||
|
let mut tokenizer = Tokenizer::new(PyModel::new(Arc::new(
|
||||||
|
tk::models::bpe::BPE::default().into(),
|
||||||
|
)));
|
||||||
|
tokenizer.with_normalizer(PyNormalizer::new(PyNormalizerWrapper::Sequence(vec![
|
||||||
|
Arc::new(NFKC.into()),
|
||||||
|
Arc::new(Lowercase.into()),
|
||||||
|
])));
|
||||||
|
|
||||||
|
let tmp = NamedTempFile::new().unwrap().into_temp_path();
|
||||||
|
tokenizer.save(&tmp, false).unwrap();
|
||||||
|
|
||||||
|
Tokenizer::from_file(&tmp).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -1121,7 +1121,7 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Save the current tokenizer at the given path
|
/// Save the current tokenizer at the given path
|
||||||
pub fn save(&self, path: &str, pretty: bool) -> Result<()> {
|
pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()> {
|
||||||
let serialized = self.to_string(pretty)?;
|
let serialized = self.to_string(pretty)?;
|
||||||
|
|
||||||
let mut file = File::create(path)?;
|
let mut file = File::create(path)?;
|
||||||
|
Reference in New Issue
Block a user