Simplify Whitespace pre_tokenizer

This commit is contained in:
Anthony MOI
2021-01-11 15:29:46 -05:00
committed by Anthony MOI
parent d94fa220b6
commit 1990f51b9f

View File

@@ -1,68 +1,29 @@
use std::fmt;
use regex::Regex; use regex::Regex;
use serde::{Deserialize, Deserializer, Serialize};
use crate::tokenizer::{ use crate::tokenizer::{
pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior, pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
}; };
use serde::de::{Error, Visitor};
#[derive(Clone, Debug, Serialize)] #[derive(Clone, Debug)]
#[serde(tag = "type")] pub struct Whitespace;
pub struct Whitespace { impl_serde_unit_struct!(WhitespaceVisitor, Whitespace);
#[serde(default = "default_regex", skip)]
re: Regex,
}
/// Builds the regex used by `Whitespace`: it matches either a run of word
/// characters or a run of characters that are neither word characters nor
/// whitespace.
///
/// # Panics
///
/// Panics if the hard-coded pattern fails to compile.
fn default_regex() -> Regex {
    let pattern = r"\w+|[^\w\s]+";
    Regex::new(pattern).unwrap()
}
impl Default for Whitespace { impl Default for Whitespace {
fn default() -> Self { fn default() -> Self {
Self { Self
re: default_regex(),
}
} }
} }
impl PreTokenizer for Whitespace { impl PreTokenizer for Whitespace {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
pretokenized.split(|_, normalized| { lazy_static! {
normalized.split(Invert(&self.re), SplitDelimiterBehavior::Removed) static ref RE: Regex = Regex::new(r"\w+|[^\w\s]+").unwrap();
})
}
}
// manually implement deserialize because Whitespace is not a unit-struct but is
// serialized like one.
impl<'de> Deserialize<'de> for Whitespace {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: Deserializer<'de>,
{
deserializer.deserialize_map(WhitespaceVisitor)
}
}
/// Visitor passed to `deserialize_map` when deserializing a `Whitespace`.
struct WhitespaceVisitor;
impl<'de> Visitor<'de> for WhitespaceVisitor {
type Value = Whitespace;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(formatter, "Whitespace")
}
fn visit_map<A>(self, mut map: A) -> std::result::Result<Self::Value, A::Error>
where
A: serde::de::MapAccess<'de>,
{
let maybe_type = map.next_entry::<String, String>()?;
let maybe_type_str = maybe_type.as_ref().map(|(k, v)| (k.as_str(), v.as_str()));
match maybe_type_str {
Some(("type", "Whitespace")) => Ok(Whitespace::default()),
Some((_, ty)) => Err(Error::custom(&format!("Expected Whitespace, got {}", ty))),
None => Err(Error::custom("Expected type: Whitespace")),
} }
let re_ref: &Regex = &RE;
pretokenized.split(|_, normalized| {
normalized.split(Invert(re_ref), SplitDelimiterBehavior::Removed)
})
} }
} }