Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00

Commit dd399d2ad0 (parent 58e1d8de67), committed via GitHub.

Split Pre-Tokenizer (#542)

* start playing around
* make a first version
* refactor
* apply make format
* add python bindings
* add some python binding tests
* correct pre-tokenizers
* update auto-generated bindings
* lint python bindings
* add code node
* add split to docs
* refactor python binding a bit
* cargo fmt
* clippy and fmt in node
* quick updates and fixes
* Oops
* Update node typings
* Update changelog

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
bindings/node/lib/bindings/pre-tokenizers.d.ts (vendored, 18 additions):

@@ -39,6 +39,24 @@ export function whitespacePreTokenizer(): PreTokenizer;
  */
 export function whitespaceSplitPreTokenizer(): PreTokenizer;
+
+/**
+ * Returns a Split PreTokenizer
+ * This versatile pre-tokenizer splits using the provided pattern and
+ * according to the provided behavior. The pattern can be inverted by
+ * making use of the invert flag.
+ *
+ * @param [pattern] A pattern used to split the string. Usually a string or a Regex.
+ * @param [behavior] The behavior to use when splitting.
+ * Choices: "removed", "isolated", "mergedWithPrevious", "mergedWithNext",
+ * "contiguous".
+ * @param [invert=false] Whether to invert the pattern.
+ */
+export function splitPreTokenizer(
+  pattern?: string,
+  behavior?: string,
+  invert?: boolean
+): PreTokenizer;

 /**
  * Returns a new Bert PreTokenizer.
  * This pre-tokenizer splits tokens on spaces, and also on punctuation.
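The `invert` flag reverses the role of the pattern: instead of matching the delimiters to split on, the pattern matches the content to keep. A minimal sketch of that equivalence using the core Rust `Split` introduced later in this commit (it mirrors the `invert` unit test in the new split.rs; the `tokenizers` import paths are assumed from the crate's top-level and `normalizer` re-exports used elsewhere in this diff):

```rust
use tokenizers::normalizer::SplitDelimiterBehavior::Removed;
use tokenizers::pre_tokenizers::split::Split;
use tokenizers::{PreTokenizedString, PreTokenizer};

fn main() -> tokenizers::Result<()> {
    let mut by_delimiter = PreTokenizedString::from("Hello Hello Hello");
    let mut by_content = by_delimiter.clone();

    // Split on the delimiter " "...
    Split::new(" ", Removed, false)?.pre_tokenize(&mut by_delimiter)?;
    // ...or, inverted, keep every match of the content pattern "Hello".
    Split::new("Hello", Removed, true)?.pre_tokenize(&mut by_content)?;

    // Both produce the pieces ["Hello", "Hello", "Hello"].
    assert_eq!(by_delimiter, by_content);
    Ok(())
}
```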
Node bindings export map (JavaScript):

@@ -11,4 +11,5 @@ module.exports = {
   punctuationPreTokenizer: native.pre_tokenizers_Punctuation,
   sequencePreTokenizer: native.pre_tokenizers_Sequence,
   digitsPreTokenizer: native.pre_tokenizers_Digits,
+  splitPreTokenizer: native.pre_tokenizers_Split,
 };
Node binding tests (TypeScript):

@@ -3,6 +3,7 @@ import {
   metaspacePreTokenizer,
   punctuationPreTokenizer,
   sequencePreTokenizer,
+  splitPreTokenizer,
   whitespaceSplitPreTokenizer,
 } from "./pre-tokenizers";

@@ -44,6 +45,13 @@ describe("punctuationPreTokenizer", () => {
   });
 });

+describe("splitPreTokenizer", () => {
+  it("instantiates correctly with invert parameter", () => {
+    const processor = splitPreTokenizer(" ", "mergedWithPrevious", false);
+    expect(processor.constructor.name).toEqual("PreTokenizer");
+  });
+});
+
 describe("sequencePreTokenizer", () => {
   it("instantiates correctly", () => {
     const punctuation = punctuationPreTokenizer();
Node native bindings (Rust):

@@ -5,9 +5,38 @@ use neon::prelude::*;
 use std::sync::Arc;

 use serde::{ser::SerializeStruct, Serialize, Serializer};
+use tk::normalizer::SplitDelimiterBehavior;
 use tk::pre_tokenizers::PreTokenizerWrapper;
 use tk::PreTokenizedString;

+#[derive(Clone)]
+struct JsSplitDelimiterBehavior(SplitDelimiterBehavior);
+
+impl FromJsValue for JsSplitDelimiterBehavior {
+    fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, _cx: &mut C) -> LibResult<Self> {
+        let s = from.downcast::<JsString>()?.value();
+
+        Ok(Self(match s.as_ref() {
+            "removed" => Ok(SplitDelimiterBehavior::Removed),
+            "isolated" => Ok(SplitDelimiterBehavior::Isolated),
+            "mergedWithPrevious" => Ok(SplitDelimiterBehavior::MergedWithPrevious),
+            "mergedWithNext" => Ok(SplitDelimiterBehavior::MergedWithNext),
+            "contiguous" => Ok(SplitDelimiterBehavior::Contiguous),
+            _ => Err(Error(
+                "Wrong value for SplitDelimiterBehavior, expected one of: \
+                 `removed, isolated, mergedWithPrevious, mergedWithNext, contiguous`"
+                    .into(),
+            )),
+        }?))
+    }
+}
+
+impl From<JsSplitDelimiterBehavior> for SplitDelimiterBehavior {
+    fn from(v: JsSplitDelimiterBehavior) -> Self {
+        v.0
+    }
+}
+
 #[derive(Clone, Debug, Deserialize)]
 #[serde(untagged)]
 pub enum JsPreTokenizerWrapper {

@@ -156,6 +185,22 @@ fn metaspace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     Ok(pretok)
 }

+/// split(pattern: string, behavior: string, invert: bool = false)
+fn split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
+    let pattern: String = cx.extract::<String>(0)?;
+    let behavior: JsSplitDelimiterBehavior = cx.extract::<JsSplitDelimiterBehavior>(1)?;
+    let invert: bool = cx.extract_opt::<bool>(2)?.unwrap_or(false);
+
+    let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
+    let guard = cx.lock();
+    pretok.borrow_mut(&guard).pretok = Some(
+        tk::pre_tokenizers::split::Split::new(pattern, behavior.into(), invert)
+            .map_err(|e| Error(e.to_string()))?
+            .into(),
+    );
+    Ok(pretok)
+}
+
 /// punctuation()
 fn punctuation(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;

@@ -231,6 +276,7 @@ pub fn register(m: &mut ModuleContext, prefix: &str) -> NeonResult<()> {
     m.export_function(&format!("{}_WhitespaceSplit", prefix), whitespace_split)?;
     m.export_function(&format!("{}_BertPreTokenizer", prefix), bert_pre_tokenizer)?;
     m.export_function(&format!("{}_Metaspace", prefix), metaspace)?;
+    m.export_function(&format!("{}_Split", prefix), split)?;
     m.export_function(
         &format!("{}_CharDelimiterSplit", prefix),
         char_delimiter_split,
Python CHANGELOG:

@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [Unreleased]
+
+### Added
+- [#542]: Add Split pre-tokenizer to easily split using a pattern
+
 ## [0.9.4]

 ### Fixed

@@ -270,6 +275,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug that was causing crashes in Python 3.5


+[#542]: https://github.com/huggingface/tokenizers/pull/542
 [#506]: https://github.com/huggingface/tokenizers/pull/506
 [#500]: https://github.com/huggingface/tokenizers/pull/500
 [#498]: https://github.com/huggingface/tokenizers/pull/498
Python `tokenizers.pre_tokenizers` exports:

@@ -9,6 +9,7 @@ Digits = pre_tokenizers.Digits
 Metaspace = pre_tokenizers.Metaspace
 Punctuation = pre_tokenizers.Punctuation
 Sequence = pre_tokenizers.Sequence
+Split = pre_tokenizers.Split
 UnicodeScripts = pre_tokenizers.UnicodeScripts
 Whitespace = pre_tokenizers.Whitespace
 WhitespaceSplit = pre_tokenizers.WhitespaceSplit
Python type stubs:

@@ -392,6 +392,40 @@ class Sequence(PreTokenizer):
     """
     pass
+
+class Split(PreTokenizer):
+    """
+    Split PreTokenizer
+
+    This versatile pre-tokenizer splits using the provided pattern and
+    according to the provided behavior. The pattern can be inverted by
+    making use of the invert flag.
+
+    Args:
+        pattern (:obj:`str` or :class:`~tokenizers.Regex`):
+            A pattern used to split the string. Usually a string or a Regex
+
+        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+            The behavior to use when splitting.
+            Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+            "contiguous"
+
+        invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to invert the pattern.
+    """
+
+    def __init__(self, pattern, behavior, invert=False):
+        pass
+    def pre_tokenize(self, pretok):
+        """
+        Pre tokenize the given PreTokenizedString in-place
+        """
+        pass
+    def pre_tokenize_str(self, sequence):
+        """
+        Pre tokenize the given sequence
+        """
+        pass

 class UnicodeScripts(PreTokenizer):
     """
     This pre-tokenizer splits on characters that belong to different language family
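For reference, the five behavior choices produce the following splits. A sketch in Rust against the core `Split` type; the expected outputs are taken from the `SplitDelimiterBehavior` documentation updated at the bottom of this commit, and the import paths are assumed as in the earlier sketch:

```rust
use tokenizers::normalizer::SplitDelimiterBehavior::*;
use tokenizers::pre_tokenizers::split::Split;
use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};

fn main() -> tokenizers::Result<()> {
    // Split "the-final--countdown" on the literal pattern "-":
    for behavior in vec![Removed, Isolated, MergedWithPrevious, MergedWithNext, Contiguous] {
        let mut s = PreTokenizedString::from("the-final--countdown");
        Split::new("-", behavior, false)?.pre_tokenize(&mut s)?;
        let pieces: Vec<&str> = s
            .get_splits(OffsetReferential::Original, OffsetType::Byte)
            .into_iter()
            .map(|(piece, _, _)| piece)
            .collect();
        println!("{:?} => {:?}", behavior, pieces);
        // Removed            => ["the", "final", "countdown"]
        // Isolated           => ["the", "-", "final", "-", "-", "countdown"]
        // MergedWithPrevious => ["the-", "final-", "-", "countdown"]
        // MergedWithNext     => ["the", "-final", "-", "-countdown"]
        // Contiguous         => ["the", "-", "final", "--", "countdown"]
    }
    Ok(())
}
```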
Python build configuration (pyproject.toml):

@@ -1,5 +1,6 @@
 [build-system]
 requires = ["setuptools", "wheel", "setuptools-rust"]
+build-backend = "setuptools.build_meta"

 [tool.black]
 target-version = ['py35']
Python extension module registration (Rust):

@@ -67,6 +67,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PyByteLevel>()?;
     m.add_class::<pre_tokenizers::PyWhitespace>()?;
     m.add_class::<pre_tokenizers::PyWhitespaceSplit>()?;
+    m.add_class::<pre_tokenizers::PySplit>()?;
     m.add_class::<pre_tokenizers::PyBertPreTokenizer>()?;
     m.add_class::<pre_tokenizers::PyMetaspace>()?;
     m.add_class::<pre_tokenizers::PyCharDelimiterSplit>()?;
Python pre-tokenizer bindings (Rust):

@@ -12,6 +12,7 @@ use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
 use tk::pre_tokenizers::digits::Digits;
 use tk::pre_tokenizers::metaspace::Metaspace;
 use tk::pre_tokenizers::punctuation::Punctuation;
+use tk::pre_tokenizers::split::Split;
 use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use tk::pre_tokenizers::PreTokenizerWrapper;

@@ -53,6 +54,7 @@ impl PyPreTokenizer {
             PreTokenizerWrapper::Whitespace(_) => {
                 Py::new(py, (PyWhitespace {}, base))?.into_py(py)
             }
+            PreTokenizerWrapper::Split(_) => Py::new(py, (PySplit {}, base))?.into_py(py),
             PreTokenizerWrapper::Punctuation(_) => {
                 Py::new(py, (PyPunctuation {}, base))?.into_py(py)
             }

@@ -238,6 +240,48 @@ impl PyWhitespaceSplit {
     }
 }

+/// Split PreTokenizer
+///
+/// This versatile pre-tokenizer splits using the provided pattern and
+/// according to the provided behavior. The pattern can be inverted by
+/// making use of the invert flag.
+///
+/// Args:
+///     pattern (:obj:`str` or :class:`~tokenizers.Regex`):
+///         A pattern used to split the string. Usually a string or a Regex
+///
+///     behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+///         The behavior to use when splitting.
+///         Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",
+///         "contiguous"
+///
+///     invert (:obj:`bool`, `optional`, defaults to :obj:`False`):
+///         Whether to invert the pattern.
+#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Split)]
+#[text_signature = "(self, pattern, behavior, invert=False)"]
+pub struct PySplit {}
+#[pymethods]
+impl PySplit {
+    #[new]
+    #[args(invert = false)]
+    fn new(
+        pattern: PyPattern,
+        behavior: PySplitDelimiterBehavior,
+        invert: bool,
+    ) -> PyResult<(Self, PyPreTokenizer)> {
+        Ok((
+            PySplit {},
+            ToPyResult(Split::new(pattern, behavior.into(), invert))
+                .into_py()?
+                .into(),
+        ))
+    }
+
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+        Ok(PyTuple::new(py, &[" ", "removed"]))
+    }
+}
+
 /// This pre-tokenizer simply splits on the provided char. Works like `.split(delimiter)`
 ///
 /// Args:
Python normalization utilities (Rust):

@@ -8,6 +8,7 @@ use pyo3::{PyMappingProtocol, PyObjectProtocol};
 use tk::normalizer::{char_to_bytes, NormalizedString, Range, SplitDelimiterBehavior};
 use tk::pattern::Pattern;

+/// Represents a Pattern as used by `NormalizedString`
 #[derive(Clone, FromPyObject)]
 pub enum PyPattern<'p> {
     #[pyo3(annotation = "str")]

@@ -44,6 +45,15 @@ impl From<PyPattern<'_>> for tk::normalizers::replace::ReplacePattern {
     }
 }

+impl From<PyPattern<'_>> for tk::pre_tokenizers::split::SplitPattern {
+    fn from(pattern: PyPattern<'_>) -> Self {
+        match pattern {
+            PyPattern::Str(s) => Self::String(s.to_owned()),
+            PyPattern::Regex(r) => Python::with_gil(|py| Self::Regex(r.borrow(py).pattern.clone())),
+        }
+    }
+}
+
 #[derive(Debug, Clone, FromPyObject)]
 pub enum PyRange<'s> {
     #[pyo3(annotation = "int")]
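Worth noting when reading this conversion: a `SplitPattern::String` is regex-escaped by `Split::new` and therefore matches literally, while a `SplitPattern::Regex` is compiled as-is (see the new split.rs below). A small sketch of the difference, with import paths assumed as in the sketches above:

```rust
use tokenizers::normalizer::SplitDelimiterBehavior::Isolated;
use tokenizers::pre_tokenizers::split::{Split, SplitPattern};
use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};

// Helper: run a pre-tokenizer and collect the resulting pieces.
fn pieces(pretok: &Split, input: &str) -> Vec<String> {
    let mut s = PreTokenizedString::from(input);
    pretok.pre_tokenize(&mut s).unwrap();
    s.get_splits(OffsetReferential::Original, OffsetType::Byte)
        .into_iter()
        .map(|(piece, _, _)| piece.to_owned())
        .collect()
}

fn main() -> tokenizers::Result<()> {
    // A plain string pattern is escaped, so "a.c" only matches itself:
    let literal = Split::new("a.c", Isolated, false)?;
    assert_eq!(pieces(&literal, "a.c vs abc"), ["a.c", " vs abc"]);

    // A Regex pattern is compiled verbatim, so "." matches any character:
    let regex = Split::new(SplitPattern::Regex("a.c".into()), Isolated, false)?;
    assert_eq!(pieces(&regex, "a.c vs abc"), ["a.c", " vs ", "abc"]);
    Ok(())
}
```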
Python binding tests:

@@ -13,6 +13,7 @@ from tokenizers.pre_tokenizers import (
     Sequence,
     Digits,
     UnicodeScripts,
+    Split,
 )

@@ -30,6 +31,22 @@ class TestByteLevel:
         assert len(ByteLevel.alphabet()) == 256

+class TestSplit:
+    def test_instantiate(self):
+        pre_tokenizer = Split(pattern=" ", behavior="removed")
+        assert pre_tokenizer is not None
+        assert isinstance(pre_tokenizer, PreTokenizer)
+        assert isinstance(pre_tokenizer, Split)
+        assert isinstance(pickle.loads(pickle.dumps(Split(" ", "removed"))), Split)
+
+        # test with invert=True
+        pre_tokenizer_with_invert = Split(pattern=" ", behavior="isolated", invert=True)
+        assert pre_tokenizer_with_invert is not None
+        assert isinstance(pre_tokenizer_with_invert, PreTokenizer)
+        assert isinstance(pre_tokenizer_with_invert, Split)
+        assert isinstance(pickle.loads(pickle.dumps(Split(" ", "removed", True))), Split)
+
+
 class TestWhitespace:
     def test_instantiate(self):
         assert Whitespace() is not None
Documentation (component entity mappings and pre-tokenizer table):

@@ -21,6 +21,16 @@ to customize its behavior. This page lists most provided components.
         ``Sequence([NFKC(), Lowercase()])``
     PreTokenizer.Sequence
         ``Sequence([Punctuation(), WhitespaceSplit()])``
+    SplitDelimiterBehavior.removed
+        :obj:`removed`
+    SplitDelimiterBehavior.isolated
+        :obj:`isolated`
+    SplitDelimiterBehavior.merged_with_previous
+        :obj:`merged_with_previous`
+    SplitDelimiterBehavior.merged_with_next
+        :obj:`merged_with_next`
+    SplitDelimiterBehavior.contiguous
+        :obj:`contiguous`

 .. entities:: rust

@@ -36,6 +46,16 @@ to customize its behavior. This page lists most provided components.
         ``Sequence::new(vec![NFKC, Lowercase])``
     PreTokenizer.Sequence
         ``Sequence::new(vec![Punctuation, WhitespaceSplit])``
+    SplitDelimiterBehavior.removed
+        :obj:`Removed`
+    SplitDelimiterBehavior.isolated
+        :obj:`Isolated`
+    SplitDelimiterBehavior.merged_with_previous
+        :obj:`MergedWithPrevious`
+    SplitDelimiterBehavior.merged_with_next
+        :obj:`MergedWithNext`
+    SplitDelimiterBehavior.contiguous
+        :obj:`Contiguous`

 .. entities:: node

@@ -51,6 +71,16 @@ to customize its behavior. This page lists most provided components.
         ..
     PreTokenizer.Sequence
         ..
+    SplitDelimiterBehavior.removed
+        :obj:`removed`
+    SplitDelimiterBehavior.isolated
+        :obj:`isolated`
+    SplitDelimiterBehavior.merged_with_previous
+        :obj:`mergedWithPrevious`
+    SplitDelimiterBehavior.merged_with_next
+        :obj:`mergedWithNext`
+    SplitDelimiterBehavior.contiguous
+        :obj:`contiguous`

 Normalizers
 ----------------------------------------------------------------------------------------------------

@@ -203,6 +233,27 @@ the ByteLevel)

       Output: ``"Hello", "123", "there"``

+    * - Split
+      - Versatile pre-tokenizer that splits on provided pattern and according to provided behavior.
+        The pattern can be inverted if necessary.
+
+        - pattern should be either a custom string or regexp.
+        - behavior should be one of:
+
+          * :entity:`SplitDelimiterBehavior.removed`
+          * :entity:`SplitDelimiterBehavior.isolated`
+          * :entity:`SplitDelimiterBehavior.merged_with_previous`
+          * :entity:`SplitDelimiterBehavior.merged_with_next`
+          * :entity:`SplitDelimiterBehavior.contiguous`
+
+        - invert should be a boolean flag.
+
+      - Example with `pattern` = :obj:`" "`, `behavior` = :obj:`"isolated"`, `invert` = :obj:`False`:
+
+        Input: ``"Hello, how are you?"``
+
+        Output: ``"Hello,", " ", "how", " ", "are", " ", "you?"``
+
     * - Sequence
       - Lets you compose multiple ``PreTokenizer`` that will be run in the given order
       - :entity:`PreTokenizer.Sequence`
Core `Replace` normalizer (Rust), doc-comment typo fix:

@@ -21,7 +21,7 @@ impl From<&str> for ReplacePattern {
     }
 }

-/// We use this custom deserializer to provided the value for `regex` for `Replace`
+/// We use this custom deserializer to provide the value for `regex` for `Replace`
 #[doc(hidden)]
 #[derive(Deserialize)]
 #[serde(tag = "type")]
Core pre-tokenizers module wiring (Rust):

@@ -5,6 +5,7 @@ pub mod digits;
 pub mod metaspace;
 pub mod punctuation;
 pub mod sequence;
+pub mod split;
 pub mod unicode_scripts;
 pub mod whitespace;

@@ -17,6 +18,7 @@ use crate::pre_tokenizers::digits::Digits;
 use crate::pre_tokenizers::metaspace::Metaspace;
 use crate::pre_tokenizers::punctuation::Punctuation;
 use crate::pre_tokenizers::sequence::Sequence;
+use crate::pre_tokenizers::split::Split;
 use crate::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use crate::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use crate::{PreTokenizedString, PreTokenizer};

@@ -30,6 +32,7 @@ pub enum PreTokenizerWrapper {
     Metaspace(Metaspace),
     Whitespace(Whitespace),
     Sequence(Sequence),
+    Split(Split),
     Punctuation(Punctuation),
     WhitespaceSplit(WhitespaceSplit),
     Digits(Digits),

@@ -46,6 +49,7 @@ impl PreTokenizer for PreTokenizerWrapper {
             PreTokenizerWrapper::Whitespace(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Punctuation(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
+            PreTokenizerWrapper::Split(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Digits(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::UnicodeScripts(us) => us.pre_tokenize(normalized),

@@ -59,6 +63,7 @@ impl_enum_from!(CharDelimiterSplit, PreTokenizerWrapper, Delimiter);
 impl_enum_from!(Whitespace, PreTokenizerWrapper, Whitespace);
 impl_enum_from!(Punctuation, PreTokenizerWrapper, Punctuation);
 impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
+impl_enum_from!(Split, PreTokenizerWrapper, Split);
 impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);
 impl_enum_from!(WhitespaceSplit, PreTokenizerWrapper, WhitespaceSplit);
 impl_enum_from!(Digits, PreTokenizerWrapper, Digits);
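With the `impl_enum_from!` line above, a concrete `Split` converts directly into the wrapper, whose `PreTokenizer` impl dispatches back to the new `Split` match arm. A short sketch under the same import assumptions as above:

```rust
use tokenizers::normalizer::SplitDelimiterBehavior::Removed;
use tokenizers::pre_tokenizers::{split::Split, PreTokenizerWrapper};
use tokenizers::{PreTokenizedString, PreTokenizer};

fn main() -> tokenizers::Result<()> {
    // From<Split> for PreTokenizerWrapper is generated by impl_enum_from!.
    let wrapper: PreTokenizerWrapper = Split::new(" ", Removed, false)?.into();

    // The wrapper's pre_tokenize dispatches to the Split variant added above.
    let mut s = PreTokenizedString::from("Hello world");
    wrapper.pre_tokenize(&mut s)?;
    Ok(())
}
```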
tokenizers/src/pre_tokenizers/split.rs (new file, 247 lines):

use onig::Regex;
use serde::{Deserialize, Serialize};

use crate::tokenizer::{
    pattern::Invert, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior,
};

/// Represents the different patterns that `Split` can use
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum SplitPattern {
    String(String),
    Regex(String),
}

impl From<String> for SplitPattern {
    fn from(v: String) -> Self {
        SplitPattern::String(v)
    }
}

impl From<&str> for SplitPattern {
    fn from(v: &str) -> Self {
        SplitPattern::String(v.to_owned())
    }
}

/// We use this custom deserializer to provide the value for `regex` for `Split`
#[doc(hidden)]
#[derive(Deserialize)]
#[serde(tag = "type")]
struct SplitDeserializer {
    pattern: SplitPattern,
    behavior: SplitDelimiterBehavior,
    invert: bool,
}

impl std::convert::TryFrom<SplitDeserializer> for Split {
    type Error = Box<dyn std::error::Error + Send + Sync>;

    fn try_from(v: SplitDeserializer) -> Result<Self> {
        Split::new(v.pattern, v.behavior, v.invert)
    }
}

#[derive(Debug, Serialize, Deserialize)]
#[serde(tag = "type", try_from = "SplitDeserializer")]
pub struct Split {
    pattern: SplitPattern,
    #[serde(skip)]
    regex: Regex,
    behavior: SplitDelimiterBehavior,
    invert: bool,
}

impl Clone for Split {
    fn clone(&self) -> Self {
        Split::new(self.pattern.clone(), self.behavior, self.invert).unwrap()
    }
}

impl PartialEq for Split {
    fn eq(&self, other: &Split) -> bool {
        self.pattern == other.pattern
            && self.behavior == other.behavior
            && self.invert == other.invert
    }
}

impl Split {
    pub fn new<I: Into<SplitPattern>>(
        pattern: I,
        behavior: SplitDelimiterBehavior,
        invert: bool,
    ) -> Result<Self> {
        let pattern: SplitPattern = pattern.into();
        let regex = match &pattern {
            SplitPattern::String(s) => Regex::new(&regex::escape(s))?,
            SplitPattern::Regex(r) => Regex::new(r)?,
        };

        Ok(Self {
            pattern,
            regex,
            behavior,
            invert,
        })
    }
}

impl PreTokenizer for Split {
    fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
        if self.invert {
            pretokenized.split(|_, normalized| normalized.split(Invert(&self.regex), self.behavior))
        } else {
            pretokenized.split(|_, normalized| normalized.split(&self.regex, self.behavior))
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::{OffsetReferential, OffsetType, PreTokenizer};
    use SplitDelimiterBehavior::*;

    #[test]
    fn basic() {
        let tests = vec![
            (
                Removed,
                "How are you doing?",
                vec![
                    ("How", (0, 3)),
                    ("are", (4, 7)),
                    ("you", (8, 11)),
                    ("doing", (12, 17)),
                    ("?", (17, 18)),
                ],
            ),
            (
                Isolated,
                "How are you doing?",
                vec![
                    ("How", (0, 3)),
                    (" ", (3, 4)),
                    ("are", (4, 7)),
                    (" ", (7, 8)),
                    ("you", (8, 11)),
                    (" ", (11, 12)),
                    ("doing", (12, 17)),
                    ("?", (17, 18)),
                ],
            ),
            (
                MergedWithPrevious,
                "How are you doing?",
                vec![
                    ("How ", (0, 4)),
                    ("are ", (4, 8)),
                    ("you ", (8, 12)),
                    ("doing", (12, 17)),
                    ("?", (17, 18)),
                ],
            ),
            (
                MergedWithNext,
                "How are you doing?",
                vec![
                    ("How", (0, 3)),
                    (" are", (3, 7)),
                    (" you", (7, 11)),
                    (" doing", (11, 17)),
                    ("?", (17, 18)),
                ],
            ),
            (
                Contiguous,
                "How are you doing?",
                vec![
                    ("How", (0, 3)),
                    (" ", (3, 4)),
                    ("are", (4, 7)),
                    (" ", (7, 8)),
                    ("you", (8, 11)),
                    (" ", (11, 12)),
                    ("doing?", (12, 18)),
                ],
            ),
        ];

        // a regex matching word/punctuation runs, used inverted so the
        // matches are the kept content rather than the delimiters
        let regex = SplitPattern::Regex(r"\w+|[^\w\s]+".into());

        for (behavior, s, res) in tests {
            let mut pretokenized = PreTokenizedString::from(s);
            let pretok = Split::new(regex.clone(), behavior, true).unwrap();
            pretok.pre_tokenize(&mut pretokenized).unwrap();
            assert_eq!(
                pretokenized
                    .get_splits(OffsetReferential::Original, OffsetType::Byte)
                    .into_iter()
                    .map(|(s, o, _)| (s, o))
                    .collect::<Vec<_>>(),
                res
            );
        }
    }

    #[test]
    fn regex_string() {
        let mut pretok_str_for_regex = PreTokenizedString::from("Hey, man!");
        let mut pretok_str_for_string = pretok_str_for_regex.clone();

        // both pre-tokenizers split on " ": one via a Regex pattern, one via a plain string
        let pretokenizer_regex = Split::new(
            SplitPattern::Regex(r"\s+".into()),
            SplitDelimiterBehavior::Removed,
            false,
        )
        .unwrap();
        let pretokenizer_string = Split::new(" ", SplitDelimiterBehavior::Removed, false).unwrap();

        pretokenizer_regex
            .pre_tokenize(&mut pretok_str_for_regex)
            .unwrap();
        pretokenizer_string
            .pre_tokenize(&mut pretok_str_for_string)
            .unwrap();

        assert_eq!(pretok_str_for_regex, pretok_str_for_string);
    }

    #[test]
    fn invert() {
        let mut pretok_str = PreTokenizedString::from("Hello Hello Hello");
        let mut pretok_str_for_invert = pretok_str.clone();

        // one pre-tokenizer splits on " "; the other splits inverted on "Hello"
        let pretokenizer = Split::new(" ", SplitDelimiterBehavior::Removed, false).unwrap();
        let pretokenizer_invert =
            Split::new("Hello", SplitDelimiterBehavior::Removed, true).unwrap();

        pretokenizer.pre_tokenize(&mut pretok_str).unwrap();
        pretokenizer_invert
            .pre_tokenize(&mut pretok_str_for_invert)
            .unwrap();

        assert_eq!(pretok_str, pretok_str_for_invert);
    }

    #[test]
    fn serialization() {
        use SplitDelimiterBehavior::*;

        let split = Split::new("Hello", Removed, true).unwrap();
        let split_s =
            r#"{"type":"Split","pattern":{"String":"Hello"},"behavior":"Removed","invert":true}"#;
        assert_eq!(serde_json::to_string(&split).unwrap(), split_s);
        assert_eq!(serde_json::from_str::<Split>(split_s).unwrap(), split);

        let split = Split::new(SplitPattern::Regex(r"\s+".into()), Isolated, false).unwrap();
        let split_s =
            r#"{"type":"Split","pattern":{"Regex":"\\s+"},"behavior":"Isolated","invert":false}"#;
        assert_eq!(serde_json::to_string(&split).unwrap(), split_s);
        assert_eq!(serde_json::from_str::<Split>(split_s).unwrap(), split);
    }
}
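One design note on the file above: the compiled `onig::Regex` is neither `Clone` nor serializable, so `Clone` is written by hand and deserialization routes through `SplitDeserializer`; both paths rebuild the regex via `Split::new`, and `PartialEq` deliberately ignores it. The serialization test pins the resulting JSON; a sketch of the round-trip (serde_json assumed available, as in the crate's tests):

```rust
use tokenizers::normalizer::SplitDelimiterBehavior::Removed;
use tokenizers::pre_tokenizers::split::Split;

fn main() -> tokenizers::Result<()> {
    let split = Split::new("Hello", Removed, true)?;

    // #[serde(skip)] drops the regex field; the custom deserializer
    // rebuilds it from the stored pattern on the way back in.
    let json = serde_json::to_string(&split)?;
    assert_eq!(
        json,
        r#"{"type":"Split","pattern":{"String":"Hello"},"behavior":"Removed","invert":true}"#
    );
    assert_eq!(serde_json::from_str::<Split>(&json)?, split);
    Ok(())
}
```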
Core normalizer (Rust):

@@ -3,6 +3,8 @@ use crate::{Offsets, Result};
 use std::ops::{Bound, RangeBounds};
 use unicode_normalization_alignments::UnicodeNormalization;

+use serde::{Deserialize, Serialize};
+
 /// Add or Substract a signed isize on a usize. Makes sure of avoiding
 /// any substraction overflow, flooring at 0.
 macro_rules! apply_signed {

@@ -89,7 +91,7 @@ where
 /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
 /// - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
 /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]`
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
 pub enum SplitDelimiterBehavior {
     Removed,
     Isolated,
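These new derives are what let `Split` embed the behavior in its JSON: a fieldless serde enum serializes as its bare variant name, matching the `"behavior":"Removed"` seen in the split.rs tests above. A quick sketch (serde_json assumed):

```rust
use tokenizers::normalizer::SplitDelimiterBehavior;

fn main() -> serde_json::Result<()> {
    let behavior = SplitDelimiterBehavior::MergedWithPrevious;

    // Serializes as the bare variant name...
    let json = serde_json::to_string(&behavior)?;
    assert_eq!(json, r#""MergedWithPrevious""#);

    // ...and round-trips thanks to the Deserialize derive.
    assert_eq!(serde_json::from_str::<SplitDelimiterBehavior>(&json)?, behavior);
    Ok(())
}
```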