mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Enabling the option to use fancy_regex instead of onig.
This commit is contained in:
@ -17,7 +17,6 @@ env_logger = "0.11"
|
|||||||
pyo3 = { version = "0.21" }
|
pyo3 = { version = "0.21" }
|
||||||
numpy = "0.21"
|
numpy = "0.21"
|
||||||
ndarray = "0.15"
|
ndarray = "0.15"
|
||||||
onig = { version = "6.4", default-features = false }
|
|
||||||
itertools = "0.12"
|
itertools = "0.12"
|
||||||
|
|
||||||
[dependencies.tokenizers]
|
[dependencies.tokenizers]
|
||||||
|
@ -1,11 +1,11 @@
|
|||||||
use onig::Regex;
|
|
||||||
use pyo3::exceptions;
|
use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
|
use tk::utils::SysRegex;
|
||||||
|
|
||||||
/// Instantiate a new Regex with the given pattern
|
/// Instantiate a new Regex with the given pattern
|
||||||
#[pyclass(module = "tokenizers", name = "Regex")]
|
#[pyclass(module = "tokenizers", name = "Regex")]
|
||||||
pub struct PyRegex {
|
pub struct PyRegex {
|
||||||
pub inner: Regex,
|
pub inner: SysRegex,
|
||||||
pub pattern: String,
|
pub pattern: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -15,8 +15,8 @@ impl PyRegex {
|
|||||||
#[pyo3(text_signature = "(self, pattern)")]
|
#[pyo3(text_signature = "(self, pattern)")]
|
||||||
fn new(s: &str) -> PyResult<Self> {
|
fn new(s: &str) -> PyResult<Self> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Regex::new(s)
|
inner: SysRegex::new(s)
|
||||||
.map_err(|e| exceptions::PyException::new_err(e.description().to_owned()))?,
|
.map_err(|e| exceptions::PyException::new_err(e.to_string().to_owned()))?,
|
||||||
pattern: s.to_owned(),
|
pattern: s.to_owned(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
use crate::tokenizer::pattern::Pattern;
|
||||||
|
use crate::Offsets;
|
||||||
use fancy_regex::Regex;
|
use fancy_regex::Regex;
|
||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
|
|
||||||
@ -31,3 +33,31 @@ impl<'r, 't> Iterator for Matches<'r, 't> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Pattern for &Regex {
|
||||||
|
fn find_matches(
|
||||||
|
&self,
|
||||||
|
inside: &str,
|
||||||
|
) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync + 'static>> {
|
||||||
|
if inside.is_empty() {
|
||||||
|
return Ok(vec![((0, 0), false)]);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut prev = 0;
|
||||||
|
let mut splits = Vec::with_capacity(inside.len());
|
||||||
|
for match_ in self.find_iter(inside) {
|
||||||
|
let match_ = match_?;
|
||||||
|
let start = match_.start();
|
||||||
|
let end = match_.end();
|
||||||
|
if prev != start {
|
||||||
|
splits.push(((prev, start), false));
|
||||||
|
}
|
||||||
|
splits.push(((start, end), true));
|
||||||
|
prev = end;
|
||||||
|
}
|
||||||
|
if prev != inside.len() {
|
||||||
|
splits.push(((prev, inside.len()), false))
|
||||||
|
}
|
||||||
|
Ok(splits)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user