Python - Update pre tokenizers with offsets
bindings/python/Cargo.lock (generated), 6 lines changed
@@ -456,14 +456,14 @@ dependencies = [
  "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex-syntax 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)",
  "serde_json 1.0.44 (registry+https://github.com/rust-lang/crates.io-index)",
- "unicode-normalization 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-normalization 0.1.11 (git+https://github.com/n1t0/unicode-normalization)",
  "unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "unicode-normalization"
 version = "0.1.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
+source = "git+https://github.com/n1t0/unicode-normalization#894053d92493c55c89fe9b188c0fb2babaa9a84c"
 dependencies = [
  "smallvec 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
@@ -570,7 +570,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum syn 1.0.11 (registry+https://github.com/rust-lang/crates.io-index)" = "dff0acdb207ae2fe6d5976617f887eb1e35a2ba52c13c7234c790960cdad9238"
 "checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
 "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b"
-"checksum unicode-normalization 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "b561e267b2326bb4cebfc0ef9e68355c7abe6c6f522aeac2f5bf95d56c59bdcf"
+"checksum unicode-normalization 0.1.11 (git+https://github.com/n1t0/unicode-normalization)" = "<none>"
 "checksum unicode-width 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"
 "checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
 "checksum unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
|
@@ -5,7 +5,7 @@ use super::utils::Container;
 use pyo3::prelude::*;
 use pyo3::types::*;
 use std::collections::HashSet;
-use tk::tokenizer::Result;
+use tk::tokenizer::{Offsets, Result};
 
 #[pyclass(dict)]
 pub struct PreTokenizer {
@@ -21,7 +21,7 @@ impl PreTokenizer {
         })
     }
 
-    fn pre_tokenize(&self, s: &str) -> PyResult<Vec<String>> {
+    fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
         ToPyResult(self.pretok.execute(|pretok| pretok.pre_tokenize(s))).into()
     }
 }
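
The binding's `pre_tokenize` now returns each piece together with the span it was cut from, rather than a bare list of strings. A minimal sketch of that shape, assuming `Offsets` matches the `(start, end)` pair of `usize` positions aliased in `tk::tokenizer`; the whitespace splitter below is illustrative, not the library's implementation:

    // Assumed stand-in for tk::tokenizer::Offsets: (start, end) positions in the input.
    type Offsets = (usize, usize);

    // Illustrative splitter: every whitespace-separated word plus its span.
    fn whitespace_pre_tokenize(s: &str) -> Vec<(String, Offsets)> {
        s.split_whitespace()
            .map(|word| {
                // split_whitespace yields subslices of `s`, so pointer arithmetic
                // recovers each word's starting offset in the original string.
                let start = word.as_ptr() as usize - s.as_ptr() as usize;
                (word.to_owned(), (start, start + word.len()))
            })
            .collect()
    }

    fn main() {
        // Prints: [("Hello", (0, 5)), ("world", (6, 11))]
        println!("{:?}", whitespace_pre_tokenize("Hello world"));
    }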
@@ -58,36 +58,9 @@ pub struct BertPreTokenizer {}
 #[pymethods]
 impl BertPreTokenizer {
     #[staticmethod]
-    #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
-        let mut do_basic_tokenize = true;
-        let mut do_lower_case = true;
-        let mut never_split = HashSet::new();
-        let mut tokenize_chinese_chars = true;
-
-        if let Some(kwargs) = kwargs {
-            for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
-                    "do_basic_tokenize" => do_basic_tokenize = val.extract()?,
-                    "do_lower_case" => do_lower_case = val.extract()?,
-                    "tokenize_chinese_chars" => tokenize_chinese_chars = val.extract()?,
-                    "never_split" => {
-                        let values: Vec<String> = val.extract()?;
-                        never_split = values.into_iter().collect();
-                    }
-                    _ => println!("Ignored unknown kwargs option {}", key),
-                }
-            }
-        }
-
+    fn new() -> PyResult<PreTokenizer> {
         Ok(PreTokenizer {
-            pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer::new(
-                do_basic_tokenize,
-                do_lower_case,
-                never_split,
-                tokenize_chinese_chars,
-            ))),
+            pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer)),
         })
     }
 }
@@ -104,7 +77,7 @@ impl PyPreTokenizer {
 }
 
 impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
-    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<String>> {
+    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<(String, Offsets)>> {
         let gil = Python::acquire_gil();
         let py = gil.python();
 
@@ -112,9 +85,15 @@ impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
         match self.class.call_method(py, "pre_tokenize", args, None) {
             Ok(res) => Ok(res
                 .cast_as::<PyList>(py)
-                .map_err(|_| PyError::from("`pre_tokenize is expected to return a List[str]"))?
-                .extract::<Vec<String>>()
-                .map_err(|_| PyError::from("`pre_tokenize` is expected to return a List[str]"))?),
+                .map_err(|_| {
+                    PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
+                })?
+                .extract::<Vec<(String, Offsets)>>()
+                .map_err(|_| {
+                    PyError::from(
+                        "`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
+                    )
+                })?),
             Err(e) => {
                 e.print(py);
                 Err(Box::new(PyError::from(
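
After this change, anything plugged in as a pre-tokenizer, including a Python object bridged through `PyPreTokenizer`, has to yield `(piece, offsets)` pairs instead of bare strings (a Python implementation must return a `List[(str, (uint, uint))]`, as the new error messages state). A hedged sketch of that contract, using local stand-ins for the trait and the `Offsets`/`Result` aliases visible in the diff rather than the crate's actual items:

    // Assumed stand-ins mirroring the signatures shown in the diff.
    type Offsets = (usize, usize);
    type PreTokResult<T> = Result<T, Box<dyn std::error::Error + Send + Sync>>;

    trait PreTokenizer {
        fn pre_tokenize(&self, sentence: &str) -> PreTokResult<Vec<(String, Offsets)>>;
    }

    // Toy pre-tokenizer: one piece per comma-separated field, with its span.
    struct CommaSplitter;

    impl PreTokenizer for CommaSplitter {
        fn pre_tokenize(&self, sentence: &str) -> PreTokResult<Vec<(String, Offsets)>> {
            let mut out = Vec::new();
            let mut start = 0;
            for field in sentence.split(',') {
                let end = start + field.len();
                out.push((field.to_owned(), (start, end)));
                start = end + 1; // skip the ',' separator
            }
            Ok(out)
        }
    }

    fn main() {
        // Prints: [("a", (0, 1)), ("bb", (2, 4)), ("ccc", (5, 8))]
        println!("{:?}", CommaSplitter.pre_tokenize("a,bb,ccc").unwrap());
    }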