Add failing test for from_file

This commit is contained in:
Anthony MOI
2020-08-31 17:02:52 -04:00
committed by Anthony MOI
parent 76b86f6901
commit bd8dac202c
4 changed files with 56 additions and 5 deletions

View File

@ -618,6 +618,14 @@ name = "regex-syntax"
version = "0.6.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
name = "remove_dir_all"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "ryu"
version = "1.0.5"
@ -676,6 +684,19 @@ dependencies = [
"unicode-xid 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "tempfile"
version = "3.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)",
"redox_syscall 0.1.57 (registry+https://github.com/rust-lang/crates.io-index)",
"remove_dir_all 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
"winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
name = "termcolor"
version = "1.1.0"
@ -750,6 +771,7 @@ dependencies = [
"rayon 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
"serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.57 (registry+https://github.com/rust-lang/crates.io-index)",
"tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"tokenizers 0.10.1",
]
@ -891,6 +913,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum redox_syscall 0.1.57 (registry+https://github.com/rust-lang/crates.io-index)" = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
"checksum regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
"checksum regex-syntax 0.6.18 (registry+https://github.com/rust-lang/crates.io-index)" = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8"
"checksum remove_dir_all 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
"checksum ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
"checksum scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
"checksum serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)" = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3"
@ -899,6 +922,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
"checksum smallvec 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3757cb9d89161a2f24e1cf78efa0c1fcff485d18e3f55e0aa3480824ddaa0f3f"
"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
"checksum syn 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "239f255b9e3429350f188c27b807fc9920a15eb9145230ff1a7d054c08fec319"
"checksum tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
"checksum termcolor 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f"
"checksum terminal_size 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "9a14cd9f8c72704232f0bfc8455c0e861f0ad4eb60cc9ec8a170e231414c1e13"
"checksum termios 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f0fcee7b24a25675de40d5bb4de6e41b0df07bc9856295e7e2b3a3600c400c2"

View File

@ -24,6 +24,9 @@ version = "0.11"
version = "*"
path = "../../tokenizers"
[dev-dependencies]
tempfile = "3.1"
[features]
default = ["pyo3/extension-module"]

View File

@ -422,10 +422,8 @@ impl PyTokenizer {
#[staticmethod]
fn from_file(path: &str) -> PyResult<Self> {
let tokenizer: PyResult<_> = ToPyResult(TokenizerImpl::from_file(path)).into();
Ok(Self {
tokenizer: tokenizer?,
})
let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
Ok(Self::new(tokenizer?))
}
#[staticmethod]
@ -840,3 +838,29 @@ impl PyTokenizer {
self.tokenizer.with_decoder(decoder.clone());
}
}
#[cfg(test)]
mod test {
use super::*;
use crate::models::PyModel;
use crate::normalizers::{PyNormalizer, PyNormalizerWrapper};
use std::sync::Arc;
use tempfile::NamedTempFile;
use tk::normalizers::{Lowercase, NFKC};
#[test]
fn serialize() {
let mut tokenizer = Tokenizer::new(PyModel::new(Arc::new(
tk::models::bpe::BPE::default().into(),
)));
tokenizer.with_normalizer(PyNormalizer::new(PyNormalizerWrapper::Sequence(vec![
Arc::new(NFKC.into()),
Arc::new(Lowercase.into()),
])));
let tmp = NamedTempFile::new().unwrap().into_temp_path();
tokenizer.save(&tmp, false).unwrap();
Tokenizer::from_file(&tmp).unwrap();
}
}

View File

@ -1121,7 +1121,7 @@ where
}
/// Save the current tokenizer at the given path
pub fn save(&self, path: &str, pretty: bool) -> Result<()> {
pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()> {
let serialized = self.to_string(pretty)?;
let mut file = File::create(path)?;