Python - Update pre tokenizers with offsets
bindings/python/Cargo.lock (generated), 6 lines changed
@@ -456,14 +456,14 @@ dependencies = [
  "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex-syntax 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)",
  "serde_json 1.0.44 (registry+https://github.com/rust-lang/crates.io-index)",
- "unicode-normalization 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)",
+ "unicode-normalization 0.1.11 (git+https://github.com/n1t0/unicode-normalization)",
  "unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
 name = "unicode-normalization"
 version = "0.1.11"
-source = "registry+https://github.com/rust-lang/crates.io-index"
+source = "git+https://github.com/n1t0/unicode-normalization#894053d92493c55c89fe9b188c0fb2babaa9a84c"
 dependencies = [
  "smallvec 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
@@ -570,7 +570,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 "checksum syn 1.0.11 (registry+https://github.com/rust-lang/crates.io-index)" = "dff0acdb207ae2fe6d5976617f887eb1e35a2ba52c13c7234c790960cdad9238"
 "checksum textwrap 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
 "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b"
-"checksum unicode-normalization 0.1.11 (registry+https://github.com/rust-lang/crates.io-index)" = "b561e267b2326bb4cebfc0ef9e68355c7abe6c6f522aeac2f5bf95d56c59bdcf"
+"checksum unicode-normalization 0.1.11 (git+https://github.com/n1t0/unicode-normalization)" = "<none>"
 "checksum unicode-width 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "caaa9d531767d1ff2150b9332433f32a24622147e5ebb1f26409d5da67afd479"
 "checksum unicode-xid 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "826e7639553986605ec5979c7dd957c7895e93eabed50ab2ffa7f6128a75097c"
 "checksum unicode_categories 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
|
@@ -5,7 +5,7 @@ use super::utils::Container;
 use pyo3::prelude::*;
 use pyo3::types::*;
 use std::collections::HashSet;
-use tk::tokenizer::Result;
+use tk::tokenizer::{Offsets, Result};
 
 #[pyclass(dict)]
 pub struct PreTokenizer {
@@ -21,7 +21,7 @@ impl PreTokenizer {
         })
     }
 
-    fn pre_tokenize(&self, s: &str) -> PyResult<Vec<String>> {
+    fn pre_tokenize(&self, s: &str) -> PyResult<Vec<(String, Offsets)>> {
         ToPyResult(self.pretok.execute(|pretok| pretok.pre_tokenize(s))).into()
     }
 }
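
The binding's `pre_tokenize` now returns each piece together with the span it was cut from, rather than a bare list of strings. A minimal sketch of that shape, assuming `Offsets` matches the `(start, end)` pair of `usize` positions aliased in `tk::tokenizer`; the whitespace splitter below is illustrative, not the library's implementation:

    // Assumed stand-in for tk::tokenizer::Offsets: (start, end) positions in the input.
    type Offsets = (usize, usize);

    // Illustrative splitter: every whitespace-separated word plus its span.
    fn whitespace_pre_tokenize(s: &str) -> Vec<(String, Offsets)> {
        s.split_whitespace()
            .map(|word| {
                // split_whitespace yields subslices of `s`, so pointer arithmetic
                // recovers each word's starting offset in the original string.
                let start = word.as_ptr() as usize - s.as_ptr() as usize;
                (word.to_owned(), (start, start + word.len()))
            })
            .collect()
    }

    fn main() {
        // Prints: [("Hello", (0, 5)), ("world", (6, 11))]
        println!("{:?}", whitespace_pre_tokenize("Hello world"));
    }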
@@ -58,36 +58,9 @@ pub struct BertPreTokenizer {}
 #[pymethods]
 impl BertPreTokenizer {
     #[staticmethod]
-    #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
-        let mut do_basic_tokenize = true;
-        let mut do_lower_case = true;
-        let mut never_split = HashSet::new();
-        let mut tokenize_chinese_chars = true;
-
-        if let Some(kwargs) = kwargs {
-            for (key, val) in kwargs {
-                let key: &str = key.extract()?;
-                match key {
-                    "do_basic_tokenize" => do_basic_tokenize = val.extract()?,
-                    "do_lower_case" => do_lower_case = val.extract()?,
-                    "tokenize_chinese_chars" => tokenize_chinese_chars = val.extract()?,
-                    "never_split" => {
-                        let values: Vec<String> = val.extract()?;
-                        never_split = values.into_iter().collect();
-                    }
-                    _ => println!("Ignored unknown kwargs option {}", key),
-                }
-            }
-        }
-
+    fn new() -> PyResult<PreTokenizer> {
         Ok(PreTokenizer {
-            pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer::new(
-                do_basic_tokenize,
-                do_lower_case,
-                never_split,
-                tokenize_chinese_chars,
-            ))),
+            pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer)),
         })
     }
 }
@@ -104,7 +77,7 @@ impl PyPreTokenizer {
 }
 
 impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
-    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<String>> {
+    fn pre_tokenize(&self, sentence: &str) -> Result<Vec<(String, Offsets)>> {
         let gil = Python::acquire_gil();
         let py = gil.python();
 
@@ -112,9 +85,15 @@ impl tk::tokenizer::PreTokenizer for PyPreTokenizer {
         match self.class.call_method(py, "pre_tokenize", args, None) {
             Ok(res) => Ok(res
                 .cast_as::<PyList>(py)
-                .map_err(|_| PyError::from("`pre_tokenize is expected to return a List[str]"))?
-                .extract::<Vec<String>>()
-                .map_err(|_| PyError::from("`pre_tokenize` is expected to return a List[str]"))?),
+                .map_err(|_| {
+                    PyError::from("`pre_tokenize is expected to return a List[(str, (uint, uint))]")
+                })?
+                .extract::<Vec<(String, Offsets)>>()
+                .map_err(|_| {
+                    PyError::from(
+                        "`pre_tokenize` is expected to return a List[(str, (uint, uint))]",
+                    )
+                })?),
             Err(e) => {
                 e.print(py);
                 Err(Box::new(PyError::from(
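
After this change, anything plugged in as a pre-tokenizer, including a Python object bridged through `PyPreTokenizer`, has to yield `(piece, offsets)` pairs instead of bare strings (a Python implementation must return a `List[(str, (uint, uint))]`, as the new error messages state). A hedged sketch of that contract, using local stand-ins for the trait and the `Offsets`/`Result` aliases visible in the diff rather than the crate's actual items:

    // Assumed stand-ins mirroring the signatures shown in the diff.
    type Offsets = (usize, usize);
    type PreTokResult<T> = Result<T, Box<dyn std::error::Error + Send + Sync>>;

    trait PreTokenizer {
        fn pre_tokenize(&self, sentence: &str) -> PreTokResult<Vec<(String, Offsets)>>;
    }

    // Toy pre-tokenizer: one piece per comma-separated field, with its span.
    struct CommaSplitter;

    impl PreTokenizer for CommaSplitter {
        fn pre_tokenize(&self, sentence: &str) -> PreTokResult<Vec<(String, Offsets)>> {
            let mut out = Vec::new();
            let mut start = 0;
            for field in sentence.split(',') {
                let end = start + field.len();
                out.push((field.to_owned(), (start, end)));
                start = end + 1; // skip the ',' separator
            }
            Ok(out)
        }
    }

    fn main() {
        // Prints: [("a", (0, 1)), ("bb", (2, 4)), ("ccc", (5, 8))]
        println!("{:?}", CommaSplitter.pre_tokenize("a,bb,ccc").unwrap());
    }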