mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Add failing test for from_file
This commit is contained in:
24
bindings/python/Cargo.lock
generated
24
bindings/python/Cargo.lock
generated
@ -618,6 +618,14 @@ name = "regex-syntax"
|
||||
version = "0.6.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
|
||||
[[package]]
|
||||
name = "remove_dir_all"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.5"
|
||||
@ -676,6 +684,19 @@ dependencies = [
|
||||
"unicode-xid 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
dependencies = [
|
||||
"cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"libc 0.2.74 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"rand 0.7.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"redox_syscall 0.1.57 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"remove_dir_all 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"winapi 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "termcolor"
|
||||
version = "1.1.0"
|
||||
@ -750,6 +771,7 @@ dependencies = [
|
||||
"rayon 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"serde_json 1.0.57 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
|
||||
"tokenizers 0.10.1",
|
||||
]
|
||||
|
||||
@ -891,6 +913,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
"checksum redox_syscall 0.1.57 (registry+https://github.com/rust-lang/crates.io-index)" = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
|
||||
"checksum regex 1.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "9c3780fcf44b193bc4d09f36d2a3c87b251da4a046c87795a0d35f4f927ad8e6"
|
||||
"checksum regex-syntax 0.6.18 (registry+https://github.com/rust-lang/crates.io-index)" = "26412eb97c6b088a6997e05f69403a802a92d520de2f8e63c2b65f9e0f47c4e8"
|
||||
"checksum remove_dir_all 0.5.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3acd125665422973a33ac9d3dd2df85edad0f4ae9b00dafb1a05e43a9f5ef8e7"
|
||||
"checksum ryu 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e"
|
||||
"checksum scopeguard 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
||||
"checksum serde 1.0.114 (registry+https://github.com/rust-lang/crates.io-index)" = "5317f7588f0a5078ee60ef675ef96735a1442132dc645eb1d12c018620ed8cd3"
|
||||
@ -899,6 +922,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
"checksum smallvec 1.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3757cb9d89161a2f24e1cf78efa0c1fcff485d18e3f55e0aa3480824ddaa0f3f"
|
||||
"checksum strsim 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
||||
"checksum syn 1.0.37 (registry+https://github.com/rust-lang/crates.io-index)" = "239f255b9e3429350f188c27b807fc9920a15eb9145230ff1a7d054c08fec319"
|
||||
"checksum tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
|
||||
"checksum termcolor 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb6bfa289a4d7c5766392812c0a1f4c1ba45afa1ad47803c11e1f407d846d75f"
|
||||
"checksum terminal_size 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)" = "9a14cd9f8c72704232f0bfc8455c0e861f0ad4eb60cc9ec8a170e231414c1e13"
|
||||
"checksum termios 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6f0fcee7b24a25675de40d5bb4de6e41b0df07bc9856295e7e2b3a3600c400c2"
|
||||
|
@ -24,6 +24,9 @@ version = "0.11"
|
||||
version = "*"
|
||||
path = "../../tokenizers"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3.1"
|
||||
|
||||
[features]
|
||||
default = ["pyo3/extension-module"]
|
||||
|
||||
|
@ -422,10 +422,8 @@ impl PyTokenizer {
|
||||
|
||||
#[staticmethod]
|
||||
fn from_file(path: &str) -> PyResult<Self> {
|
||||
let tokenizer: PyResult<_> = ToPyResult(TokenizerImpl::from_file(path)).into();
|
||||
Ok(Self {
|
||||
tokenizer: tokenizer?,
|
||||
})
|
||||
let tokenizer: PyResult<_> = ToPyResult(Tokenizer::from_file(path)).into();
|
||||
Ok(Self::new(tokenizer?))
|
||||
}
|
||||
|
||||
#[staticmethod]
|
||||
@ -840,3 +838,29 @@ impl PyTokenizer {
|
||||
self.tokenizer.with_decoder(decoder.clone());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
use crate::models::PyModel;
|
||||
use crate::normalizers::{PyNormalizer, PyNormalizerWrapper};
|
||||
use std::sync::Arc;
|
||||
use tempfile::NamedTempFile;
|
||||
use tk::normalizers::{Lowercase, NFKC};
|
||||
|
||||
#[test]
|
||||
fn serialize() {
|
||||
let mut tokenizer = Tokenizer::new(PyModel::new(Arc::new(
|
||||
tk::models::bpe::BPE::default().into(),
|
||||
)));
|
||||
tokenizer.with_normalizer(PyNormalizer::new(PyNormalizerWrapper::Sequence(vec![
|
||||
Arc::new(NFKC.into()),
|
||||
Arc::new(Lowercase.into()),
|
||||
])));
|
||||
|
||||
let tmp = NamedTempFile::new().unwrap().into_temp_path();
|
||||
tokenizer.save(&tmp, false).unwrap();
|
||||
|
||||
Tokenizer::from_file(&tmp).unwrap();
|
||||
}
|
||||
}
|
||||
|
@ -1121,7 +1121,7 @@ where
|
||||
}
|
||||
|
||||
/// Save the current tokenizer at the given path
|
||||
pub fn save(&self, path: &str, pretty: bool) -> Result<()> {
|
||||
pub fn save<P: AsRef<Path>>(&self, path: P, pretty: bool) -> Result<()> {
|
||||
let serialized = self.to_string(pretty)?;
|
||||
|
||||
let mut file = File::create(path)?;
|
||||
|
Reference in New Issue
Block a user