diff --git a/bindings/node/lib/bindings/pre-tokenizers.d.ts b/bindings/node/lib/bindings/pre-tokenizers.d.ts
index cd842f63..17971486 100644
--- a/bindings/node/lib/bindings/pre-tokenizers.d.ts
+++ b/bindings/node/lib/bindings/pre-tokenizers.d.ts
@@ -39,6 +39,24 @@ export function whitespacePreTokenizer(): PreTokenizer;
  */
 export function whitespaceSplitPreTokenizer(): PreTokenizer;
 
+/**
+ * Returns a Split PreTokenizer
+ * This versatile pre-tokenizer splits using the provided pattern and
+ * according to the provided behavior. The pattern can be inverted by
+ * making use of the invert flag.
+ *
+ * @param [pattern] A pattern used to split the string. Usually a string or a Regex.
+ * @param [behavior] The behavior to use when splitting.
+ * Choices: "removed", "isolated", "mergedWithPrevious", "mergedWithNext",
+ * "contiguous".
+ * @param [invert=false] Whether to invert the pattern.
+ */
+export function splitPreTokenizer(
+  pattern?: string,
+  behavior?: string,
+  invert?: boolean
+): PreTokenizer;
+
 /**
  * Returns a new Bert PreTokenizer.
  * This pre-tokenizer splits tokens on spaces, and also on punctuation.
diff --git a/bindings/node/lib/bindings/pre-tokenizers.js b/bindings/node/lib/bindings/pre-tokenizers.js
index 700c2564..c4fbb223 100644
--- a/bindings/node/lib/bindings/pre-tokenizers.js
+++ b/bindings/node/lib/bindings/pre-tokenizers.js
@@ -11,4 +11,5 @@ module.exports = {
   punctuationPreTokenizer: native.pre_tokenizers_Punctuation,
   sequencePreTokenizer: native.pre_tokenizers_Sequence,
   digitsPreTokenizer: native.pre_tokenizers_Digits,
+  splitPreTokenizer: native.pre_tokenizers_Split,
 };
diff --git a/bindings/node/lib/bindings/pre-tokenizers.test.ts b/bindings/node/lib/bindings/pre-tokenizers.test.ts
index eae8f218..39a6eb40 100644
--- a/bindings/node/lib/bindings/pre-tokenizers.test.ts
+++ b/bindings/node/lib/bindings/pre-tokenizers.test.ts
@@ -3,6 +3,7 @@ import {
   metaspacePreTokenizer,
   punctuationPreTokenizer,
   sequencePreTokenizer,
+  splitPreTokenizer,
   whitespaceSplitPreTokenizer,
 } from "./pre-tokenizers";
 
@@ -44,6 +45,13 @@ describe("punctuationPreTokenizer", () => {
   });
 });
 
+describe("splitPreTokenizer", () => {
+  it("instantiates correctly with invert parameter", () => {
+    const processor = splitPreTokenizer(" ", "mergedWithPrevious", false);
+    expect(processor.constructor.name).toEqual("PreTokenizer");
+  });
+});
+
 describe("sequencePreTokenizer", () => {
   it("instantiates correctly", () => {
     const punctuation = punctuationPreTokenizer();
diff --git a/bindings/node/native/src/pre_tokenizers.rs b/bindings/node/native/src/pre_tokenizers.rs
index d5f13781..de88c922 100644
--- a/bindings/node/native/src/pre_tokenizers.rs
+++ b/bindings/node/native/src/pre_tokenizers.rs
@@ -5,9 +5,38 @@ use neon::prelude::*;
 use std::sync::Arc;
 
 use serde::{ser::SerializeStruct, Serialize, Serializer};
+use tk::normalizer::SplitDelimiterBehavior;
 use tk::pre_tokenizers::PreTokenizerWrapper;
 use tk::PreTokenizedString;
 
+#[derive(Clone)]
+struct JsSplitDelimiterBehavior(SplitDelimiterBehavior);
+
+impl FromJsValue for JsSplitDelimiterBehavior {
+    fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, _cx: &mut C) -> LibResult<Self> {
+        let s = from.downcast::<JsString>()?.value();
+
+        Ok(Self(match s.as_ref() {
+            "removed" => Ok(SplitDelimiterBehavior::Removed),
+            "isolated" => Ok(SplitDelimiterBehavior::Isolated),
+            "mergedWithPrevious" => Ok(SplitDelimiterBehavior::MergedWithPrevious),
+            "mergedWithNext" => Ok(SplitDelimiterBehavior::MergedWithNext),
+            "contiguous" => Ok(SplitDelimiterBehavior::Contiguous),
+            _ => Err(Error(
+                "Wrong value for SplitDelimiterBehavior, expected one of: \
+                 `removed, isolated, mergedWithPrevious, mergedWithNext, contiguous`"
+                    .into(),
+            )),
+        }?))
+    }
+}
+
+impl From<JsSplitDelimiterBehavior> for SplitDelimiterBehavior {
+    fn from(v: JsSplitDelimiterBehavior) -> Self {
+        v.0
+    }
+}
+
 #[derive(Clone, Debug, Deserialize)]
 #[serde(untagged)]
 pub enum JsPreTokenizerWrapper {
@@ -156,6 +185,22 @@ fn metaspace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     Ok(pretok)
 }
 
+/// split(pattern: string, behavior: string, invert: bool = false)
+fn split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
+    let pattern: String = cx.extract::<String>(0)?;
+    let behavior: JsSplitDelimiterBehavior = cx.extract::<JsSplitDelimiterBehavior>(1)?;
+    let invert: bool = cx.extract_opt::<bool>(2)?.unwrap_or(false);
+
+    let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
+    let guard = cx.lock();
+    pretok.borrow_mut(&guard).pretok = Some(
+        tk::pre_tokenizers::split::Split::new(pattern, behavior.into(), invert)
+            .map_err(|e| Error(e.to_string()))?
+            .into(),
+    );
+    Ok(pretok)
+}
+
 /// punctuation()
 fn punctuation(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
@@ -231,6 +276,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
     m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
     m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
     m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
+    m.export_function(&format!("{}_Split", prefix), split)?;
     m.export_function(
         &format!("{}_CharDelimiterSplit", prefix),
         char_delimiter_split,
diff --git a/bindings/python/CHANGELOG.md b/bindings/python/CHANGELOG.md
index 2185dfcc..e0c7fea8 100644
--- a/bindings/python/CHANGELOG.md
+++ b/bindings/python/CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+- [#542]: Add Split pre-tokenizer to easily split using a pattern
+
 ## [0.9.4]
 
 ### Fixed
@@ -270,6 +275,7 @@ delimiter (Works like `.split(delimiter)`)
 
 - Fix a bug that was causing crashes in Python 3.5
 
+[#542]: https://github.com/huggingface/tokenizers/pull/542
 [#506]: https://github.com/huggingface/tokenizers/pull/506
 [#500]: https://github.com/huggingface/tokenizers/pull/500
 [#498]: https://github.com/huggingface/tokenizers/pull/498
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
index 10a5eb1d..48277f0d 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
@@ -9,6 +9,7 @@ Digits = pre_tokenizers.Digits
 Metaspace = pre_tokenizers.Metaspace
 Punctuation = pre_tokenizers.Punctuation
 Sequence = pre_tokenizers.Sequence
+Split = pre_tokenizers.Split
 UnicodeScripts = pre_tokenizers.UnicodeScripts
 Whitespace = pre_tokenizers.Whitespace
 WhitespaceSplit = pre_tokenizers.WhitespaceSplit
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index 65769095..86cb343f 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -392,6 +392,40 @@ class Sequence(PreTokenizer):
     """
     pass
 
+class Split(PreTokenizer):
+    """
+    Split PreTokenizer
+
+    This versatile pre-tokenizer splits using the provided pattern and
+    according to the provided behavior. The pattern can be inverted by
+    making use of the invert flag.
+
+    Args:
+        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
+            A pattern used to split the string. Usually a string or a Regex
+
+        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+            The behavior to use when splitting.
+            Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+            "contiguous"
+
+        invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to invert the pattern.
+ """ + + def __init__(self, pattern, behavior, invert=False): + pass + def pre_tokenize(self, pretok): + """ + Pre tokenize the given PreTokenizedString in-place + """ + pass + def pre_tokenize_str(self, sequence): + """ + Pre tokenize the given sequence + """ + pass + class UnicodeScripts(PreTokenizer): """ This pre-tokenizer splits on characters that belong to different language family diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 0693d1e3..d7385d9b 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -1,5 +1,6 @@ [build-system] requires = ["setuptools", "wheel", "setuptools-rust"] +build-backend = "setuptools.build_meta" [tool.black] target-version = ['py35'] diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index d43de428..be543206 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -67,6 +67,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index 625637a0..c118871c 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -12,6 +12,7 @@ use tk::pre_tokenizers::delimiter::CharDelimiterSplit; use tk::pre_tokenizers::digits::Digits; use tk::pre_tokenizers::metaspace::Metaspace; use tk::pre_tokenizers::punctuation::Punctuation; +use tk::pre_tokenizers::split::Split; use tk::pre_tokenizers::unicode_scripts::UnicodeScripts; use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit}; use tk::pre_tokenizers::PreTokenizerWrapper; @@ -53,6 +54,7 @@ impl PyPreTokenizer { PreTokenizerWrapper::Whitespace(_) => { Py::new(py, (PyWhitespace {}, base))?.into_py(py) } + PreTokenizerWrapper::Split(_) => Py::new(py, (PySplit {}, base))?.into_py(py), PreTokenizerWrapper::Punctuation(_) => { Py::new(py, (PyPunctuation {}, base))?.into_py(py) } @@ -238,6 +240,48 @@ impl PyWhitespaceSplit { } } +/// Split PreTokenizer +/// +/// This versatile pre-tokenizer splits using the provided pattern and +/// according to the provided behavior. The pattern can be inverted by +/// making use of the invert flag. +/// +/// Args: +/// pattern (:obj:`str` or :class:`~tokenizers.Regex`): +/// A pattern used to split the string. Usually a string or a Regex +/// +/// behavior (:class:`~tokenizers.SplitDelimiterBehavior`): +/// The behavior to use when splitting. +/// Choices: "removed", "isolated", "merged_with_previous", "merged_with_next", +/// "contiguous" +/// +/// invert (:obj:`bool`, `optional`, defaults to :obj:`False`): +/// Whether to invert the pattern. +#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Split)] +#[text_signature = "(self, pattern, behavior, invert=False)"] +pub struct PySplit {} +#[pymethods] +impl PySplit { + #[new] + #[args(invert = false)] + fn new( + pattern: PyPattern, + behavior: PySplitDelimiterBehavior, + invert: bool, + ) -> PyResult<(Self, PyPreTokenizer)> { + Ok(( + PySplit {}, + ToPyResult(Split::new(pattern, behavior.into(), invert)) + .into_py()? + .into(), + )) + } + + fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> { + Ok(PyTuple::new(py, &[" ", "removed"])) + } +} + /// This pre-tokenizer simply splits on the provided char. 
Works like `.split(delimiter)` /// /// Args: diff --git a/bindings/python/src/utils/normalization.rs b/bindings/python/src/utils/normalization.rs index 59746af1..2aa0b388 100644 --- a/bindings/python/src/utils/normalization.rs +++ b/bindings/python/src/utils/normalization.rs @@ -8,6 +8,7 @@ use pyo3::{PyMappingProtocol, PyObjectProtocol}; use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehavior}; use tk::pattern::Pattern; +/// Represents a Pattern as used by `NormalizedString` #[derive(Clone, FromPyObject)] pub enum PyPattern<'p> { #[pyo3(annotation = "str")] @@ -44,6 +45,15 @@ impl From> for tk::normalizers::replace::ReplacePattern { } } +impl From> for tk::pre_tokenizers::split::SplitPattern { + fn from(pattern: PyPattern<'_>) -> Self { + match pattern { + PyPattern::Str(s) => Self::String(s.to_owned()), + PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())), + } + } +} + #[derive(Debug, Clone, FromPyObject)] pub enum PyRange<'s> { #[pyo3(annotation = "int")] diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py index fabbdbbb..d8e86395 100644 --- a/bindings/python/tests/bindings/test_pre_tokenizers.py +++ b/bindings/python/tests/bindings/test_pre_tokenizers.py @@ -13,6 +13,7 @@ from tokenizers.pre_tokenizers import ( Sequence, Digits, UnicodeScripts, + Split, ) @@ -30,6 +31,22 @@ class TestByteLevel: assert len(ByteLevel.alphabet()) == 256 +class TestSplit: + def test_instantiate(self): + pre_tokenizer = Split(pattern=" ", behavior="removed") + assert pre_tokenizer is not None + assert isinstance(pre_tokenizer, PreTokenizer) + assert isinstance(pre_tokenizer, Split) + assert isinstance(pickle.loads(pickle.dumps(Split(" ", "removed"))), Split) + + # test with invert=True + pre_tokenizer_with_invert = Split(pattern=" ", behavior="isolated", invert=True) + assert pre_tokenizer_with_invert is not None + assert isinstance(pre_tokenizer_with_invert, PreTokenizer) + assert isinstance(pre_tokenizer_with_invert, Split) + assert isinstance(pickle.loads(pickle.dumps(Split(" ", "removed", True))), Split) + + class TestWhitespace: def test_instantiate(self): assert Whitespace() is not None diff --git a/docs/source/components.rst b/docs/source/components.rst index f26d66ee..7ba1bb09 100644 --- a/docs/source/components.rst +++ b/docs/source/components.rst @@ -21,6 +21,16 @@ to customize its behavior. This page lists most provided components. ``Sequence([NFKC(), Lowercase()])`` PreTokenizer.Sequence ``Sequence([Punctuation(), WhitespaceSplit()])`` + SplitDelimiterBehavior.removed + :obj:`removed` + SplitDelimiterBehavior.isolated + :obj:`isolated` + SplitDelimiterBehavior.merged_with_previous + :obj:`merged_with_previous` + SplitDelimiterBehavior.merged_with_next + :obj:`merged_with_next` + SplitDelimiterBehavior.contiguous + :obj:`contiguous` .. entities:: rust @@ -36,6 +46,16 @@ to customize its behavior. This page lists most provided components. ``Sequence::new(vec![NFKC, Lowercase])`` PreTokenizer.Sequence ``Sequence::new(vec![Punctuation, WhitespaceSplit])`` + SplitDelimiterBehavior.removed + :obj:`Removed` + SplitDelimiterBehavior.isolated + :obj:`Isolated` + SplitDelimiterBehavior.merged_with_previous + :obj:`MergedWithPrevious` + SplitDelimiterBehavior.merged_with_next + :obj:`MergedWithNext` + SplitDelimiterBehavior.contiguous + :obj:`Contiguous` .. entities:: node @@ -51,6 +71,16 @@ to customize its behavior. This page lists most provided components. .. 
     PreTokenizer.Sequence
         ..
+    SplitDelimiterBehavior.removed
+        :obj:`removed`
+    SplitDelimiterBehavior.isolated
+        :obj:`isolated`
+    SplitDelimiterBehavior.merged_with_previous
+        :obj:`mergedWithPrevious`
+    SplitDelimiterBehavior.merged_with_next
+        :obj:`mergedWithNext`
+    SplitDelimiterBehavior.contiguous
+        :obj:`contiguous`
 
 Normalizers
 ----------------------------------------------------------------------------------------------------
@@ -203,6 +233,27 @@ the ByteLevel)
 
         Output: ```"Hello", "123", "there"```
 
+    * - Split
+      - Versatile pre-tokenizer that splits on provided pattern and according to provided behavior.
+        The pattern can be inverted if necessary.
+
+        - pattern should be either a custom string or regexp.
+        - behavior should be one of:
+
+          * :entity:`SplitDelimiterBehavior.removed`
+          * :entity:`SplitDelimiterBehavior.isolated`
+          * :entity:`SplitDelimiterBehavior.merged_with_previous`
+          * :entity:`SplitDelimiterBehavior.merged_with_next`
+          * :entity:`SplitDelimiterBehavior.contiguous`
+
+        - invert should be a boolean flag.
+
+      - Example with `pattern` = :obj:`" "`, `behavior` = :obj:`"isolated"`, `invert` = :obj:`False`:
+
+        Input: ``"Hello, how are you?"``
+
+        Output: ```"Hello,", " ", "how", " ", "are", " ", "you?"```
+
     * - Sequence
       - Lets you compose multiple ``PreTokenizer`` that will be run in the given order
      - :entity:`PreTokenizer.Sequence`
diff --git a/tokenizers/src/normalizers/replace.rs b/tokenizers/src/normalizers/replace.rs
index 2f82d1a8..e9efafb1 100644
--- a/tokenizers/src/normalizers/replace.rs
+++ b/tokenizers/src/normalizers/replace.rs
@@ -21,7 +21,7 @@ impl From<&str> for ReplacePattern {
     }
 }
 
-/// We use this custom deserializer to provided the value for `regex` for `Replace`
+/// We use this custom deserializer to provide the value for `regex` for `Replace`
 #[doc(hidden)]
 #[derive(Deserialize)]
 #[serde(tag = "type")]
diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs
index 7197d3d4..8c8e483d 100644
--- a/tokenizers/src/pre_tokenizers/mod.rs
+++ b/tokenizers/src/pre_tokenizers/mod.rs
@@ -5,6 +5,7 @@ pub mod digits;
 pub mod metaspace;
 pub mod punctuation;
 pub mod sequence;
+pub mod split;
 pub mod unicode_scripts;
 pub mod whitespace;
 
@@ -17,6 +18,7 @@ use crate::pre_tokenizers::digits::Digits;
 use crate::pre_tokenizers::metaspace::Metaspace;
 use crate::pre_tokenizers::punctuation::Punctuation;
 use crate::pre_tokenizers::sequence::Sequence;
+use crate::pre_tokenizers::split::Split;
 use crate::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use crate::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use crate::{PreTokenizedString, PreTokenizer};
@@ -30,6 +32,7 @@ pub enum PreTokenizerWrapper {
     Metaspace(Metaspace),
     Whitespace(Whitespace),
     Sequence(Sequence),
+    Split(Split),
     Punctuation(Punctuation),
     WhitespaceSplit(WhitespaceSplit),
     Digits(Digits),
@@ -46,6 +49,7 @@ impl PreTokenizer for PreTokenizerWrapper {
             PreTokenizerWrapper::Whitespace(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Punctuation(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
+            PreTokenizerWrapper::Split(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Digits(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::UnicodeScripts(us) => us.pre_tokenize(normalized),
@@ -59,6 +63,7 @@ impl_enum_from!(CharDelimiterSplit, PreTokenizerWrapper, Delimiter);
 impl_enum_from!(Whitespace, PreTokenizerWrapper, Whitespace);
 impl_enum_from!(Punctuation, PreTokenizerWrapper, Punctuation);
 impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
+impl_enum_from!(Split, PreTokenizerWrapper, Split);
 impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);
 impl_enum_from!(WhitespaceSplit, PreTokenizerWrapper, WhitespaceSplit);
 impl_enum_from!(Digits, PreTokenizerWrapper, Digits);
diff --git a/tokenizers/src/pre_tokenizers/split.rs b/tokenizers/src/pre_tokenizers/split.rs
new file mode 100644
index 00000000..69114d95
--- /dev/null
+++ b/tokenizers/src/pre_tokenizers/split.rs
@@ -0,0 +1,247 @@
+use onig::Regex;
+use serde::{Deserialize, Serialize};
+
+use crate::tokenizer::{
+    pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
+};
+
+/// Represents the different patterns that `Split` can use
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub enum SplitPattern {
+    String(String),
+    Regex(String),
+}
+
+impl From<String> for SplitPattern {
+    fn from(v: String) -> Self {
+        SplitPattern::String(v)
+    }
+}
+
+impl From<&str> for SplitPattern {
+    fn from(v: &str) -> Self {
+        SplitPattern::String(v.to_owned())
+    }
+}
+
+/// We use this custom deserializer to provide the value for `regex` for `Split`
+#[doc(hidden)]
+#[derive(Deserialize)]
+#[serde(tag = "type")]
+struct SplitDeserializer {
+    pattern: SplitPattern,
+    behavior: SplitDelimiterBehavior,
+    invert: bool,
+}
+
+impl std::convert::TryFrom<SplitDeserializer> for Split {
+    type Error = Box<dyn std::error::Error + Send + Sync>;
+
+    fn try_from(v: SplitDeserializer) -> Result<Self> {
+        Split::new(v.pattern, v.behavior, v.invert)
+    }
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+#[serde(tag = "type", try_from = "SplitDeserializer")]
+pub struct Split {
+    pattern: SplitPattern,
+    #[serde(skip)]
+    regex: Regex,
+    behavior: SplitDelimiterBehavior,
+    invert: bool,
+}
+
+impl Clone for Split {
+    fn clone(&self) -> Self {
+        Split::new(self.pattern.clone(), self.behavior, self.invert).unwrap()
+    }
+}
+
+impl PartialEq for Split {
+    fn eq(&self, other: &Split) -> bool {
+        self.pattern == other.pattern
+            && self.behavior == other.behavior
+            && self.invert == other.invert
+    }
+}
+
+impl Split {
+    pub fn new<I: Into<SplitPattern>>(
+        pattern: I,
+        behavior: SplitDelimiterBehavior,
+        invert: bool,
+    ) -> Result<Self> {
+        let pattern: SplitPattern = pattern.into();
+        let regex = match &pattern {
+            SplitPattern::String(s) => Regex::new(&regex::escape(s))?,
+            SplitPattern::Regex(r) => Regex::new(r)?,
+        };
+
+        Ok(Self {
+            pattern,
+            regex,
+            behavior,
+            invert,
+        })
+    }
+}
+
+impl PreTokenizer for Split {
+    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
+        if self.invert {
+            pretokenized.split(|_, normalized| normalized.split(Invert(&self.regex), self.behavior))
+        } else {
+            pretokenized.split(|_, normalized| normalized.split(&self.regex, self.behavior))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{OffsetReferential, OffsetType, PreTokenizer};
+    use SplitDelimiterBehavior::*;
+
+    #[test]
+    fn basic() {
+        let tests = vec![
+            (
+                Removed,
+                "How are you doing?",
+                vec![
+                    ("How", (0, 3)),
+                    ("are", (4, 7)),
+                    ("you", (8, 11)),
+                    ("doing", (12, 17)),
+                    ("?", (17, 18)),
+                ],
+            ),
+            (
+                Isolated,
+                "How are you doing?",
+                vec![
+                    ("How", (0, 3)),
+                    (" ", (3, 4)),
+                    ("are", (4, 7)),
+                    (" ", (7, 8)),
+                    ("you", (8, 11)),
+                    (" ", (11, 12)),
+                    ("doing", (12, 17)),
+                    ("?", (17, 18)),
+                ],
+            ),
+            (
+                MergedWithPrevious,
+                "How are you doing?",
+                vec![
+                    ("How ", (0, 4)),
+                    ("are ", (4, 8)),
+                    ("you ", (8, 12)),
+                    ("doing", (12, 17)),
+                    ("?", (17, 18)),
+                ],
+            ),
+            (
+                MergedWithNext,
+                "How are you doing?",
+                vec![
+                    ("How", (0, 3)),
+                    (" are", (3, 7)),
+                    (" you", (7, 11)),
+                    (" doing", (11, 17)),
+                    ("?", (17, 18)),
+                ],
+            ),
+            (
+                Contiguous,
+                "How are you doing?",
+                vec![
+                    ("How", (0, 3)),
+                    (" ", (3, 4)),
+                    ("are", (4, 7)),
+                    (" ", (7, 8)),
+                    ("you", (8, 11)),
+                    (" ", (11, 12)),
+                    ("doing?", (12, 18)),
+                ],
+            ),
+        ];
+
+        // use whitespace regex
+        let regex = SplitPattern::Regex(r"\w+|[^\w\s]+".into());
+
+        for (behavior, s, res) in tests {
+            let mut pretokenized = PreTokenizedString::from(s);
+            let pretok = Split::new(regex.clone(), behavior, true).unwrap();
+            pretok.pre_tokenize(&mut pretokenized).unwrap();
+            assert_eq!(
+                pretokenized
+                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
+                    .into_iter()
+                    .map(|(s, o, _)| (s, o))
+                    .collect::<Vec<_>>(),
+                res
+            );
+        }
+    }
+
+    #[test]
+    fn regex_string() {
+        let mut pretok_str_for_regex = PreTokenizedString::from("Hey, man!");
+        let mut pretok_str_for_string = pretok_str_for_regex.clone();
+
+        // pre-tokenizer splits on " " - one from Regex, one from string
+        let pretokenizer_regex = Split::new(
+            SplitPattern::Regex(r"\s+".into()),
+            SplitDelimiterBehavior::Removed,
+            false,
+        )
+        .unwrap();
+        let pretokenizer_string = Split::new(" ", SplitDelimiterBehavior::Removed, false).unwrap();
+
+        pretokenizer_regex
+            .pre_tokenize(&mut pretok_str_for_regex)
+            .unwrap();
+        pretokenizer_string
+            .pre_tokenize(&mut pretok_str_for_string)
+            .unwrap();
+
+        assert_eq!(pretok_str_for_regex, pretok_str_for_string);
+    }
+
+    #[test]
+    fn invert() {
+        let mut pretok_str = PreTokenizedString::from("Hello Hello Hello");
+        let mut pretok_str_for_invert = pretok_str.clone();
+
+        // one pre-tokenizer splits on " " - one splits inverted on "Hello"
+        let pretokenizer = Split::new(" ", SplitDelimiterBehavior::Removed, false).unwrap();
+        let pretokenizer_invert =
+            Split::new("Hello", SplitDelimiterBehavior::Removed, true).unwrap();
+
+        pretokenizer.pre_tokenize(&mut pretok_str).unwrap();
+        pretokenizer_invert
+            .pre_tokenize(&mut pretok_str_for_invert)
+            .unwrap();
+
+        assert_eq!(pretok_str, pretok_str_for_invert);
+    }
+
+    #[test]
+    fn serialization() {
+        use SplitDelimiterBehavior::*;
+
+        let split = Split::new("Hello", Removed, true).unwrap();
+        let split_s =
+            r#"{"type":"Split","pattern":{"String":"Hello"},"behavior":"Removed","invert":true}"#;
+        assert_eq!(serde_json::to_string(&split).unwrap(), split_s);
+        assert_eq!(serde_json::from_str::<Split>(split_s).unwrap(), split);
+
+        let split = Split::new(SplitPattern::Regex(r"\s+".into()), Isolated, false).unwrap();
+        let split_s =
+            r#"{"type":"Split","pattern":{"Regex":"\\s+"},"behavior":"Isolated","invert":false}"#;
+        assert_eq!(serde_json::to_string(&split).unwrap(), split_s);
+        assert_eq!(serde_json::from_str::<Split>(split_s).unwrap(), split);
+    }
+}
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index fa9e37c5..18a584ba 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -3,6 +3,8 @@ use crate::{Offsets, Result};
 use std::ops::{Bound, RangeBounds};
 use unicode_normalization_alignments::UnicodeNormalization;
 
+use serde::{Deserialize, Serialize};
+
 /// Add or Substract a signed isize on a usize. Makes sure of avoiding
 /// any substraction overflow, flooring at 0.
 macro_rules! apply_signed {
apply_signed { @@ -89,7 +91,7 @@ where /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]` /// - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]` /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]` -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)] pub enum SplitDelimiterBehavior { Removed, Isolated,