mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-04 19:58:21 +00:00
Simplify Whitespace pre_tokenizer
This commit is contained in:
@@ -1,68 +1,29 @@
|
|||||||
use std::fmt;
|
|
||||||
|
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use serde::{Deserialize, Deserializer, Serialize};
|
|
||||||
|
|
||||||
use crate::tokenizer::{
|
use crate::tokenizer::{
|
||||||
pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
|
pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
|
||||||
};
|
};
|
||||||
use serde::de::{Error, Visitor};
|
|
||||||
|
|
||||||
#[derive(Clone, Debug, Serialize)]
|
#[derive(Clone, Debug)]
|
||||||
#[serde(tag = "type")]
|
pub struct Whitespace;
|
||||||
pub struct Whitespace {
|
impl_serde_unit_struct!(WhitespaceVisitor, Whitespace);
|
||||||
#[serde(default = "default_regex", skip)]
|
|
||||||
re: Regex,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn default_regex() -> Regex {
|
|
||||||
Regex::new(r"\w+|[^\w\s]+").unwrap()
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for Whitespace {
|
impl Default for Whitespace {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self
|
||||||
re: default_regex(),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PreTokenizer for Whitespace {
|
impl PreTokenizer for Whitespace {
|
||||||
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
|
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
|
||||||
pretokenized.split(|_, normalized| {
|
lazy_static! {
|
||||||
normalized.split(Invert(&self.re), SplitDelimiterBehavior::Removed)
|
static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// manually implement deserialize because Whitespace is not a unit-struct but is
|
|
||||||
// serialized like one.
|
|
||||||
impl<'de> Deserialize<'de> for Whitespace {
|
|
||||||
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
|
|
||||||
where
|
|
||||||
D: Deserializer<'de>,
|
|
||||||
{
|
|
||||||
deserializer.deserialize_map(WhitespaceVisitor)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
struct WhitespaceVisitor;
|
|
||||||
impl<'de> Visitor<'de> for WhitespaceVisitor {
|
|
||||||
type Value = Whitespace;
|
|
||||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
write!(formatter, "Whitespace")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn visit_map<A>(self, mut map: A) -> std::result::Result<Self::Value, A::Error>
|
|
||||||
where
|
|
||||||
A: serde::de::MapAccess<'de>,
|
|
||||||
{
|
|
||||||
let maybe_type = map.next_entry::<String, String>()?;
|
|
||||||
let maybe_type_str = maybe_type.as_ref().map(|(k, v)| (k.as_str(), v.as_str()));
|
|
||||||
match maybe_type_str {
|
|
||||||
Some(("type", "Whitespace")) => Ok(Whitespace::default()),
|
|
||||||
Some((_, ty)) => Err(Error::custom(&format!("Expected Whitespace, got {}", ty))),
|
|
||||||
None => Err(Error::custom("Expected type: Whitespace")),
|
|
||||||
}
|
}
|
||||||
|
let re_ref: &Regex = &RE;
|
||||||
|
|
||||||
|
pretokenized.split(|_, normalized| {
|
||||||
|
normalized.split(Invert(re_ref), SplitDelimiterBehavior::Removed)
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user