Enabling the option to use fancy_regex instead of onig.

This commit is contained in:
Nicolas Patry
2024-08-01 12:10:16 +02:00
parent 9e0c791f2b
commit 7415e28536
3 changed files with 34 additions and 5 deletions

View File

@ -17,7 +17,6 @@ env_logger = "0.11"
pyo3 = { version = "0.21" } pyo3 = { version = "0.21" }
numpy = "0.21" numpy = "0.21"
ndarray = "0.15" ndarray = "0.15"
onig = { version = "6.4", default-features = false }
itertools = "0.12" itertools = "0.12"
[dependencies.tokenizers] [dependencies.tokenizers]

View File

@ -1,11 +1,11 @@
use onig::Regex;
use pyo3::exceptions; use pyo3::exceptions;
use pyo3::prelude::*; use pyo3::prelude::*;
use tk::utils::SysRegex;
/// Instantiate a new Regex with the given pattern /// Instantiate a new Regex with the given pattern
#[pyclass(module = "tokenizers", name = "Regex")] #[pyclass(module = "tokenizers", name = "Regex")]
pub struct PyRegex { pub struct PyRegex {
pub inner: Regex, pub inner: SysRegex,
pub pattern: String, pub pattern: String,
} }
@ -15,8 +15,8 @@ impl PyRegex {
#[pyo3(text_signature = "(self, pattern)")] #[pyo3(text_signature = "(self, pattern)")]
fn new(s: &str) -> PyResult<Self> { fn new(s: &str) -> PyResult<Self> {
Ok(Self { Ok(Self {
inner: Regex::new(s) inner: SysRegex::new(s)
.map_err(|e| exceptions::PyException::new_err(e.description().to_owned()))?, .map_err(|e| exceptions::PyException::new_err(e.to_string().to_owned()))?,
pattern: s.to_owned(), pattern: s.to_owned(),
}) })
} }

View File

@ -1,3 +1,5 @@
use crate::tokenizer::pattern::Pattern;
use crate::Offsets;
use fancy_regex::Regex; use fancy_regex::Regex;
use std::error::Error; use std::error::Error;
@ -31,3 +33,31 @@ impl<'r, 't> Iterator for Matches<'r, 't> {
} }
} }
} }
/// Allows a borrowed `fancy_regex` [`Regex`] to be used as a [`Pattern`].
impl Pattern for &Regex {
    /// Partitions `inside` into consecutive `(start, end)` byte ranges, each
    /// paired with a flag that is `true` when the range is a regex match and
    /// `false` when it is the text lying between (or around) matches.
    ///
    /// An empty `inside` produces the single entry `((0, 0), false)`.
    ///
    /// # Errors
    ///
    /// Any error yielded by the regex engine while iterating over matches is
    /// returned to the caller unchanged (boxed).
    fn find_matches(
        &self,
        inside: &str,
    ) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync + 'static>> {
        let total = inside.len();
        if inside.is_empty() {
            return Ok(vec![((0, 0), false)]);
        }

        let mut pieces = Vec::with_capacity(total);
        // End offset of the last range emitted so far.
        let mut cursor = 0;

        for candidate in self.find_iter(inside) {
            let found = candidate?;
            let (from, to) = (found.start(), found.end());

            // Emit the unmatched gap sitting before this match, if any.
            if from != cursor {
                pieces.push(((cursor, from), false));
            }
            pieces.push(((from, to), true));
            cursor = to;
        }

        // Whatever follows the final match is an unmatched tail.
        if cursor != total {
            pieces.push(((cursor, total), false));
        }

        Ok(pieces)
    }
}