Split Pre-Tokenizer (#542)
* start playing around
* make a first version
* refactor
* apply make format
* add python bindings
* add some python binding tests
* correct pre-tokenizers
* update auto-generated bindings
* lint python bindings
* add code node
* add split to docs
* refactor python binding a bit
* cargo fmt
* clippy and fmt in node
* quick updates and fixes
* Oops
* Update node typings
* Update changelog

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
committed by GitHub
parent 58e1d8de67
commit dd399d2ad0
bindings/node/lib/bindings/pre-tokenizers.d.ts (18 lines changed)
@@ -39,6 +39,24 @@ export function whitespacePreTokenizer(): PreTokenizer;
  */
 export function whitespaceSplitPreTokenizer(): PreTokenizer;
 
+/**
+ * Returns a Split PreTokenizer
+ * This versatile pre-tokenizer splits using the provided pattern and
+ * according to the provided behavior. The pattern can be inverted by
+ * making use of the invert flag.
+ *
+ * @param [pattern] A pattern used to split the string. Usually a string or a Regex.
+ * @param [behavior] The behavior to use when splitting.
+ * Choices: "removed", "isolated", "mergedWithPrevious", "mergedWithNext",
+ * "contiguous".
+ * @param [invert=false] Whether to invert the pattern.
+ */
+export function splitPreTokenizer(
+  pattern?: string,
+  behavior?: string,
+  invert?: boolean
+): PreTokenizer;
+
 /**
  * Returns a new Bert PreTokenizer.
  * This pre-tokenizer splits tokens on spaces, and also on punctuation.
@@ -11,4 +11,5 @@ module.exports = {
   punctuationPreTokenizer: native.pre_tokenizers_Punctuation,
   sequencePreTokenizer: native.pre_tokenizers_Sequence,
   digitsPreTokenizer: native.pre_tokenizers_Digits,
+  splitPreTokenizer: native.pre_tokenizers_Split,
 };
@@ -3,6 +3,7 @@ import {
   metaspacePreTokenizer,
   punctuationPreTokenizer,
   sequencePreTokenizer,
+  splitPreTokenizer,
   whitespaceSplitPreTokenizer,
 } from "./pre-tokenizers";
 
@@ -44,6 +45,13 @@ describe("punctuationPreTokenizer", () => {
   });
 });
 
+describe("splitPreTokenizer", () => {
+  it("instantiates correctly with invert parameter", () => {
+    const processor = splitPreTokenizer(" ", "mergedWithPrevious", false);
+    expect(processor.constructor.name).toEqual("PreTokenizer");
+  });
+});
+
 describe("sequencePreTokenizer", () => {
   it("instantiates correctly", () => {
     const punctuation = punctuationPreTokenizer();
@@ -5,9 +5,38 @@ use neon::prelude::*;
 use std::sync::Arc;
 
 use serde::{ser::SerializeStruct, Serialize, Serializer};
+use tk::normalizer::SplitDelimiterBehavior;
 use tk::pre_tokenizers::PreTokenizerWrapper;
 use tk::PreTokenizedString;
 
+#[derive(Clone)]
+struct JsSplitDelimiterBehavior(SplitDelimiterBehavior);
+
+impl FromJsValue for JsSplitDelimiterBehavior {
+    fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, _cx: &mut C) -> LibResult<Self> {
+        let s = from.downcast::<JsString>()?.value();
+
+        Ok(Self(match s.as_ref() {
+            "removed" => Ok(SplitDelimiterBehavior::Removed),
+            "isolated" => Ok(SplitDelimiterBehavior::Isolated),
+            "mergedWithPrevious" => Ok(SplitDelimiterBehavior::MergedWithPrevious),
+            "mergedWithNext" => Ok(SplitDelimiterBehavior::MergedWithNext),
+            "contiguous" => Ok(SplitDelimiterBehavior::Contiguous),
+            _ => Err(Error(
+                "Wrong value for SplitDelimiterBehavior, expected one of: \
+                 `removed, isolated, mergedWithPrevious, mergedWithNext, contiguous`"
+                    .into(),
+            )),
+        }?))
+    }
+}
+
+impl<'s> From<JsSplitDelimiterBehavior> for SplitDelimiterBehavior {
+    fn from(v: JsSplitDelimiterBehavior) -> Self {
+        v.0
+    }
+}
+
 #[derive(Clone, Debug, Deserialize)]
 #[serde(untagged)]
 pub enum JsPreTokenizerWrapper {

@@ -156,6 +185,22 @@ fn metaspace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     Ok(pretok)
 }
 
+/// split(invert: bool = false)
+fn split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
+    let pattern: String = cx.extract::<String>(0)?;
+    let behavior: JsSplitDelimiterBehavior = cx.extract::<JsSplitDelimiterBehavior>(1)?;
+    let invert: bool = cx.extract_opt::<bool>(2)?.unwrap_or(false);
+
+    let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
+    let guard = cx.lock();
+    pretok.borrow_mut(&guard).pretok = Some(
+        tk::pre_tokenizers::split::Split::new(pattern, behavior.into(), invert)
+            .map_err(|e| Error(e.to_string()))?
+            .into(),
+    );
+    Ok(pretok)
+}
+
 /// punctuation()
 fn punctuation(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;

@@ -231,6 +276,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
     m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
     m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
     m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
+    m.export_function(&format!("{}_Split", prefix), split)?;
     m.export_function(
         &format!("{}_CharDelimiterSplit", prefix),
         char_delimiter_split,
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Added
+- [#542]: Add Split pre-tokenizer to easily split using a pattern
+
 ## [0.9.4]
 
 ### Fixed

@@ -270,6 +275,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug that was causing crashes in Python 3.5
 
 
+[#542]: https://github.com/huggingface/tokenizers/pull/542
 [#506]: https://github.com/huggingface/tokenizers/pull/506
 [#500]: https://github.com/huggingface/tokenizers/pull/500
 [#498]: https://github.com/huggingface/tokenizers/pull/498
@@ -9,6 +9,7 @@ Digits = pre_tokenizers.Digits
 Metaspace = pre_tokenizers.Metaspace
 Punctuation = pre_tokenizers.Punctuation
 Sequence = pre_tokenizers.Sequence
+Split = pre_tokenizers.Split
 UnicodeScripts = pre_tokenizers.UnicodeScripts
 Whitespace = pre_tokenizers.Whitespace
 WhitespaceSplit = pre_tokenizers.WhitespaceSplit
@@ -392,6 +392,40 @@ class Sequence(PreTokenizer):
     """
     pass
 
+class Split(PreTokenizer):
+    """
+    Split PreTokenizer
+
+    This versatile pre-tokenizer splits using the provided pattern and
+    according to the provided behavior. The pattern can be inverted by
+    making use of the invert flag.
+
+    Args:
+        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
+            A pattern used to split the string. Usually a string or a Regex
+
+        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+            The behavior to use when splitting.
+            Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+            "contiguous"
+
+        invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to invert the pattern.
+    """
+
+    def __init__(self, pattern, behavior, invert=False):
+        pass
+    def pre_tokenize(self, pretok):
+        """
+        Pre tokenize the given PreTokenizedString in-place
+        """
+        pass
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given sequence
+        """
+        pass
+
 class UnicodeScripts(PreTokenizer):
     """
     This pre-tokenizer splits on characters that belong to different language family
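For reference, a minimal usage sketch of the Split binding stubbed above (this assumes a tokenizers build that includes this PR; pre_tokenize_str returns (piece, (start, end)) tuples):

    from tokenizers.pre_tokenizers import Split

    # Split on single spaces and drop the delimiter matches.
    pre = Split(pattern=" ", behavior="removed")
    print(pre.pre_tokenize_str("How are you doing?"))
    # [('How', (0, 3)), ('are', (4, 7)), ('you', (8, 11)), ('doing?', (12, 18))]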
@@ -1,5 +1,6 @@
 [build-system]
 requires = ["setuptools", "wheel", "setuptools-rust"]
+build-backend = "setuptools.build_meta"
 
 [tool.black]
 target-version = ['py35']
@@ -67,6 +67,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PyByteLevel>()?;
     m.add_class::<pre_tokenizers::PyWhitespace>()?;
     m.add_class::<pre_tokenizers::PyWhitespaceSplit>()?;
+    m.add_class::<pre_tokenizers::PySplit>()?;
     m.add_class::<pre_tokenizers::PyBertPreTokenizer>()?;
     m.add_class::<pre_tokenizers::PyMetaspace>()?;
     m.add_class::<pre_tokenizers::PyCharDelimiterSplit>()?;
@@ -12,6 +12,7 @@ use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
 use tk::pre_tokenizers::digits::Digits;
 use tk::pre_tokenizers::metaspace::Metaspace;
 use tk::pre_tokenizers::punctuation::Punctuation;
+use tk::pre_tokenizers::split::Split;
 use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use tk::pre_tokenizers::PreTokenizerWrapper;

@@ -53,6 +54,7 @@ impl PyPreTokenizer {
             PreTokenizerWrapper::Whitespace(_) => {
                 Py::new(py, (PyWhitespace {}, base))?.into_py(py)
             }
+            PreTokenizerWrapper::Split(_) => Py::new(py, (PySplit {}, base))?.into_py(py),
             PreTokenizerWrapper::Punctuation(_) => {
                 Py::new(py, (PyPunctuation {}, base))?.into_py(py)
             }

@@ -238,6 +240,48 @@ impl PyWhitespaceSplit {
     }
 }
 
+/// Split PreTokenizer
+///
+/// This versatile pre-tokenizer splits using the provided pattern and
+/// according to the provided behavior. The pattern can be inverted by
+/// making use of the invert flag.
+///
+/// Args:
+///     pattern (:obj:`str` or :class:`~tokenizers.Regex`):
+///         A pattern used to split the string. Usually a string or a Regex
+///
+///     behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+///         The behavior to use when splitting.
+///         Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+///         "contiguous"
+///
+///     invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
+///         Whether to invert the pattern.
+#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Split)]
+#[text_signature = "(self, pattern, behavior, invert=False)"]
+pub struct PySplit {}
+#[pymethods]
+impl PySplit {
+    #[new]
+    #[args(invert = false)]
+    fn new(
+        pattern: PyPattern,
+        behavior: PySplitDelimiterBehavior,
+        invert: bool,
+    ) -> PyResult<(Self, PyPreTokenizer)> {
+        Ok((
+            PySplit {},
+            ToPyResult(Split::new(pattern, behavior.into(), invert))
+                .into_py()?
+                .into(),
+        ))
+    }
+
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+        Ok(PyTuple::new(py, &[" ", "removed"]))
+    }
+}
+
 /// This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
 ///
 /// Args:
@@ -8,6 +8,7 @@ use pyo3::{PyMappingProtocol, PyObjectProtocol};
 use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehavior};
 use tk::pattern::Pattern;
 
+/// Represents a Pattern as used by `NormalizedString`
 #[derive(Clone, FromPyObject)]
 pub enum PyPattern<'p> {
     #[pyo3(annotation = "str")]

@@ -44,6 +45,15 @@ impl From<PyPattern<'_>> for tk::normalizers::replace::ReplacePattern {
     }
 }
 
+impl From<PyPattern<'_>> for tk::pre_tokenizers::split::SplitPattern {
+    fn from(pattern: PyPattern<'_>) -> Self {
+        match pattern {
+            PyPattern::Str(s) => Self::String(s.to_owned()),
+            PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())),
+        }
+    }
+}
+
 #[derive(Debug, Clone, FromPyObject)]
 pub enum PyRange<'s> {
     #[pyo3(annotation = "int")]
@@ -13,6 +13,7 @@ from tokenizers.pre_tokenizers import (
     Sequence,
     Digits,
     UnicodeScripts,
+    Split,
 )
 
 
@@ -30,6 +31,22 @@ class TestByteLevel:
         assert len(ByteLevel.alphabet()) == 256
 
 
+class TestSplit:
+    def test_instantiate(self):
+        pre_tokenizer = Split(pattern=" ", behavior="removed")
+        assert pre_tokenizer is not None
+        assert isinstance(pre_tokenizer, PreTokenizer)
+        assert isinstance(pre_tokenizer, Split)
+        assert isinstance(pickle.loads(pickle.dumps(Split(" ", "removed"))), Split)
+
+        # test with invert=True
+        pre_tokenizer_with_invert = Split(pattern=" ", behavior="isolated", invert=True)
+        assert pre_tokenizer_with_invert is not None
+        assert isinstance(pre_tokenizer_with_invert, PreTokenizer)
+        assert isinstance(pre_tokenizer_with_invert, Split)
+        assert isinstance(pickle.loads(pickle.dumps(Split(" ", "removed", True))), Split)
+
+
 class TestWhitespace:
     def test_instantiate(self):
         assert Whitespace() is not None
@@ -21,6 +21,16 @@ to customize its behavior. This page lists most provided components.
         ``Sequence([NFKC(), Lowercase()])``
     PreTokenizer.Sequence
         ``Sequence([Punctuation(), WhitespaceSplit()])``
+    SplitDelimiterBehavior.removed
+        :obj:`removed`
+    SplitDelimiterBehavior.isolated
+        :obj:`isolated`
+    SplitDelimiterBehavior.merged_with_previous
+        :obj:`merged_with_previous`
+    SplitDelimiterBehavior.merged_with_next
+        :obj:`merged_with_next`
+    SplitDelimiterBehavior.contiguous
+        :obj:`contiguous`
 
 .. entities:: rust
 
@@ -36,6 +46,16 @@ to customize its behavior. This page lists most provided components.
         ``Sequence::new(vec![NFKC, Lowercase])``
     PreTokenizer.Sequence
         ``Sequence::new(vec![Punctuation, WhitespaceSplit])``
+    SplitDelimiterBehavior.removed
+        :obj:`Removed`
+    SplitDelimiterBehavior.isolated
+        :obj:`Isolated`
+    SplitDelimiterBehavior.merged_with_previous
+        :obj:`MergedWithPrevious`
+    SplitDelimiterBehavior.merged_with_next
+        :obj:`MergedWithNext`
+    SplitDelimiterBehavior.contiguous
+        :obj:`Contiguous`
 
 .. entities:: node
 
@@ -51,6 +71,16 @@ to customize its behavior. This page lists most provided components.
         ..
     PreTokenizer.Sequence
         ..
+    SplitDelimiterBehavior.removed
+        :obj:`removed`
+    SplitDelimiterBehavior.isolated
+        :obj:`isolated`
+    SplitDelimiterBehavior.merged_with_previous
+        :obj:`mergedWithPrevious`
+    SplitDelimiterBehavior.merged_with_next
+        :obj:`mergedWithNext`
+    SplitDelimiterBehavior.contiguous
+        :obj:`contiguous`
 
 Normalizers
 ----------------------------------------------------------------------------------------------------

@@ -203,6 +233,27 @@ the ByteLevel)
 
       Output: ```"Hello", "123", "there"```
 
+    * - Split
+      - Versatile pre-tokenizer that splits on provided pattern and according to provided behavior.
+        The pattern can be inverted if necessary.
+
+        - pattern should be either a custom string or regexp.
+        - behavior should be one of:
+
+            * :entity:`SplitDelimiterBehavior.removed`
+            * :entity:`SplitDelimiterBehavior.isolated`
+            * :entity:`SplitDelimiterBehavior.merged_with_previous`
+            * :entity:`SplitDelimiterBehavior.merged_with_next`
+            * :entity:`SplitDelimiterBehavior.contiguous`
+
+        - invert should be a boolean flag.
+
+        - Example with `pattern` = :obj:`" "`, `behavior` = :obj:`"isolated"`, `invert` = :obj:`False`:
+
+          Input: ``"Hello, how are you?"``
+
+          Output: ```"Hello,", " ", "how", " ", "are", " ", "you?"```
+
     * - Sequence
       - Lets you compose multiple ``PreTokenizer`` that will be run in the given order
       - :entity:`PreTokenizer.Sequence`
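The docs entry above can be exercised directly; a short sketch comparing the five behaviors on the documented example (only the "isolated" output is given in the docs above, and it matches; the behavior strings follow the Python docstring):

    from tokenizers.pre_tokenizers import Split

    s = "Hello, how are you?"
    for behavior in ("removed", "isolated", "merged_with_previous",
                     "merged_with_next", "contiguous"):
        # Keep only the pieces; pre_tokenize_str also returns offsets.
        pieces = [piece for piece, _ in Split(" ", behavior).pre_tokenize_str(s)]
        print(behavior, pieces)
    # "isolated" gives: ['Hello,', ' ', 'how', ' ', 'are', ' ', 'you?']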
@@ -21,7 +21,7 @@ impl From<&str> for ReplacePattern {
     }
 }
 
-/// We use this custom deserializer to provided the value for `regex` for `Replace`
+/// We use this custom deserializer to provide the value for `regex` for `Replace`
 #[doc(hidden)]
 #[derive(Deserialize)]
 #[serde(tag = "type")]
@@ -5,6 +5,7 @@ pub mod digits;
 pub mod metaspace;
 pub mod punctuation;
 pub mod sequence;
+pub mod split;
 pub mod unicode_scripts;
 pub mod whitespace;
 
@@ -17,6 +18,7 @@ use crate::pre_tokenizers::digits::Digits;
 use crate::pre_tokenizers::metaspace::Metaspace;
 use crate::pre_tokenizers::punctuation::Punctuation;
 use crate::pre_tokenizers::sequence::Sequence;
+use crate::pre_tokenizers::split::Split;
 use crate::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use crate::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use crate::{PreTokenizedString, PreTokenizer};
@@ -30,6 +32,7 @@ pub enum PreTokenizerWrapper {
     Metaspace(Metaspace),
     Whitespace(Whitespace),
     Sequence(Sequence),
+    Split(Split),
     Punctuation(Punctuation),
     WhitespaceSplit(WhitespaceSplit),
     Digits(Digits),
@@ -46,6 +49,7 @@ impl PreTokenizer for PreTokenizerWrapper {
             PreTokenizerWrapper::Whitespace(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Punctuation(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
+            PreTokenizerWrapper::Split(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Digits(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::UnicodeScripts(us) => us.pre_tokenize(normalized),
@@ -59,6 +63,7 @@ impl_enum_from!(CharDelimiterSplit, PreTokenizerWrapper, Delimiter);
 impl_enum_from!(Whitespace, PreTokenizerWrapper, Whitespace);
 impl_enum_from!(Punctuation, PreTokenizerWrapper, Punctuation);
 impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
+impl_enum_from!(Split, PreTokenizerWrapper, Split);
 impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);
 impl_enum_from!(WhitespaceSplit, PreTokenizerWrapper, WhitespaceSplit);
 impl_enum_from!(Digits, PreTokenizerWrapper, Digits);
tokenizers/src/pre_tokenizers/split.rs (new file, 247 lines)

use onig::Regex;
use serde::{Deserialize, Serialize};

use crate::tokenizer::{
    pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
};

/// Represents the different patterns that `Split` can use
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum SplitPattern {
    String(String),
    Regex(String),
}

impl From<String> for SplitPattern {
    fn from(v: String) -> Self {
        SplitPattern::String(v)
    }
}

impl From<&str> for SplitPattern {
    fn from(v: &str) -> Self {
        SplitPattern::String(v.to_owned())
    }
}

/// We use this custom deserializer to provide the value for `regex` for `Split`
#[doc(hidden)]
#[derive(Deserialize)]
#[serde(tag = "type")]
struct SplitDeserializer {
    pattern: SplitPattern,
    behavior: SplitDelimiterBehavior,
    invert: bool,
}

impl std::convert::TryFrom<SplitDeserializer> for Split {
    type Error = Box<dyn std::error::Error + Send + Sync>;

    fn try_from(v: SplitDeserializer) -> Result<Self> {
        Split::new(v.pattern, v.behavior, v.invert)
    }
}

#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "type", try_from = "SplitDeserializer")]
pub struct Split {
    pattern: SplitPattern,
    #[serde(skip)]
    regex: Regex,
    behavior: SplitDelimiterBehavior,
    invert: bool,
}

impl Clone for Split {
    fn clone(&self) -> Self {
        Split::new(self.pattern.clone(), self.behavior, self.invert).unwrap()
    }
}

impl PartialEq for Split {
    fn eq(&self, other: &Split) -> bool {
        self.pattern == other.pattern
            && self.behavior == other.behavior
            && self.invert == other.invert
    }
}

impl Split {
    pub fn new<I: Into<SplitPattern>>(
        pattern: I,
        behavior: SplitDelimiterBehavior,
        invert: bool,
    ) -> Result<Self> {
        let pattern: SplitPattern = pattern.into();
        let regex = match &pattern {
            SplitPattern::String(s) => Regex::new(&regex::escape(s))?,
            SplitPattern::Regex(r) => Regex::new(r)?,
        };

        Ok(Self {
            pattern,
            regex,
            behavior,
            invert,
        })
    }
}

impl PreTokenizer for Split {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
        if self.invert {
            pretokenized.split(|_, normalized| normalized.split(Invert(&self.regex), self.behavior))
        } else {
            pretokenized.split(|_, normalized| normalized.split(&self.regex, self.behavior))
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{OffsetReferential, OffsetType, PreTokenizer};
    use SplitDelimiterBehavior::*;

    #[test]
    fn basic() {
        let tests = vec![
            (
                Removed,
                "How are you doing?",
                vec![
                    ("How", (0, 3)),
                    ("are", (4, 7)),
                    ("you", (8, 11)),
                    ("doing", (12, 17)),
                    ("?", (17, 18)),
                ],
            ),
            (
                Isolated,
                "How are you doing?",
                vec![
                    ("How", (0, 3)),
                    (" ", (3, 4)),
                    ("are", (4, 7)),
                    (" ", (7, 8)),
                    ("you", (8, 11)),
                    (" ", (11, 12)),
                    ("doing", (12, 17)),
                    ("?", (17, 18)),
                ],
            ),
            (
                MergedWithPrevious,
                "How are you doing?",
                vec![
                    ("How ", (0, 4)),
                    ("are ", (4, 8)),
                    ("you ", (8, 12)),
                    ("doing", (12, 17)),
                    ("?", (17, 18)),
                ],
            ),
            (
                MergedWithNext,
                "How are you doing?",
                vec![
                    ("How", (0, 3)),
                    (" are", (3, 7)),
                    (" you", (7, 11)),
                    (" doing", (11, 17)),
                    ("?", (17, 18)),
                ],
            ),
            (
                Contiguous,
                "How are you doing?",
                vec![
                    ("How", (0, 3)),
                    (" ", (3, 4)),
                    ("are", (4, 7)),
                    (" ", (7, 8)),
                    ("you", (8, 11)),
                    (" ", (11, 12)),
                    ("doing?", (12, 18)),
                ],
            ),
        ];

        // use whitespace regex
        let regex = SplitPattern::Regex(r"\w+|[^\w\s]+".into());

        for (behavior, s, res) in tests {
            let mut pretokenized = PreTokenizedString::from(s);
            let pretok = Split::new(regex.clone(), behavior, true).unwrap();
            pretok.pre_tokenize(&mut pretokenized).unwrap();
            assert_eq!(
                pretokenized
                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
                    .into_iter()
                    .map(|(s, o, _)| (s, o))
                    .collect::<Vec<_>>(),
                res
            );
        }
    }

    #[test]
    fn regex_string() {
        let mut pretok_str_for_regex = PreTokenizedString::from("Hey, man!");
        let mut pretok_str_for_string = pretok_str_for_regex.clone();

        // pre-tokenizer splits on " " - one from Regex, one from string
        let pretokenizer_regex = Split::new(
            SplitPattern::Regex(r"\s+".into()),
            SplitDelimiterBehavior::Removed,
            false,
        )
        .unwrap();
        let pretokenizer_string = Split::new(" ", SplitDelimiterBehavior::Removed, false).unwrap();

        pretokenizer_regex
            .pre_tokenize(&mut pretok_str_for_regex)
            .unwrap();
        pretokenizer_string
            .pre_tokenize(&mut pretok_str_for_string)
            .unwrap();

        assert_eq!(pretok_str_for_regex, pretok_str_for_string);
    }

    #[test]
    fn invert() {
        let mut pretok_str = PreTokenizedString::from("Hello Hello Hello");
        let mut pretok_str_for_invert = pretok_str.clone();

        // one pre-tokenizer splits on " " - one splits inverted on "Hello"
        let pretokenizer = Split::new(" ", SplitDelimiterBehavior::Removed, false).unwrap();
        let pretokenizer_invert =
            Split::new("Hello", SplitDelimiterBehavior::Removed, true).unwrap();

        pretokenizer.pre_tokenize(&mut pretok_str).unwrap();
        pretokenizer_invert
            .pre_tokenize(&mut pretok_str_for_invert)
            .unwrap();

        assert_eq!(pretok_str, pretok_str_for_invert);
    }

    #[test]
    fn serialization() {
        use SplitDelimiterBehavior::*;

        let split = Split::new("Hello", Removed, true).unwrap();
        let split_s =
            r#"{"type":"Split","pattern":{"String":"Hello"},"behavior":"Removed","invert":true}"#;
        assert_eq!(serde_json::to_string(&split).unwrap(), split_s);
        assert_eq!(serde_json::from_str::<Split>(split_s).unwrap(), split);

        let split = Split::new(SplitPattern::Regex(r"\s+".into()), Isolated, false).unwrap();
        let split_s =
            r#"{"type":"Split","pattern":{"Regex":"\\s+"},"behavior":"Isolated","invert":false}"#;
        assert_eq!(serde_json::to_string(&split).unwrap(), split_s);
        assert_eq!(serde_json::from_str::<Split>(split_s).unwrap(), split);
    }
}
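A short Python sketch of the invert flag implemented above: instead of splitting on the pattern's matches, the matches themselves become the pieces. This mirrors the `basic` test, which runs an inverted word/punctuation regex; tokenizers.Regex is assumed to be the Python-side wrapper for regex patterns.

    from tokenizers import Regex
    from tokenizers.pre_tokenizers import Split

    # invert=True keeps the regex matches as the resulting splits.
    pre = Split(Regex(r"\w+|[^\w\s]+"), behavior="removed", invert=True)
    print(pre.pre_tokenize_str("How are you doing?"))
    # Matches of the regex are kept, as in the Rust `basic` test:
    # [('How', (0, 3)), ('are', (4, 7)), ('you', (8, 11)), ('doing', (12, 17)), ('?', (17, 18))]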
@@ -3,6 +3,8 @@ use crate::{Offsets, Result};
 use std::ops::{Bound, RangeBounds};
 use unicode_normalization_alignments::UnicodeNormalization;
 
+use serde::{Deserialize, Serialize};
+
 /// Add or Substract a signed isize on a usize. Makes sure of avoiding
 /// any substraction overflow, flooring at 0.
 macro_rules! apply_signed {

@@ -89,7 +91,7 @@
 /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
 /// - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
 /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]`
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
 pub enum SplitDelimiterBehavior {
     Removed,
     Isolated,
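The doc comment in this last hunk illustrates each SplitDelimiterBehavior on "the-final--countdown" split on "-"; the same outputs can be reproduced through the new Split pre-tokenizer (a sketch assuming the snake_case behavior strings from the Python binding):

    from tokenizers.pre_tokenizers import Split

    s = "the-final--countdown"
    print([p for p, _ in Split("-", "merged_with_previous").pre_tokenize_str(s)])
    # ['the-', 'final-', '-', 'countdown']
    print([p for p, _ in Split("-", "merged_with_next").pre_tokenize_str(s)])
    # ['the', '-final', '-', '-countdown']
    print([p for p, _ in Split("-", "contiguous").pre_tokenize_str(s)])
    # ['the', '-', 'final', '--', 'countdown']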