mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Add SplitDelimiterBehavior to Punctuation constructor (#657)
Resolves: #642
This commit is contained in:
10
bindings/node/lib/bindings/pre-tokenizers.d.ts
vendored
10
bindings/node/lib/bindings/pre-tokenizers.d.ts
vendored
@ -90,10 +90,14 @@ export function charDelimiterSplitPreTokenizer(delimiter: string): PreTokenizer;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a new Punctuation PreTokenizer.
|
* Returns a new Punctuation PreTokenizer.
|
||||||
* This pre-tokenizer splits tokens on punctuation.
|
* This pre-tokenizer splits tokens on punctuation according to the provided behavior.
|
||||||
* Each occurrence of a punctuation character will be treated separately.
|
* Each occurrence of a punctuation character is treated separately.
|
||||||
|
*
|
||||||
|
* @param [behavior="isolated"] The behavior to use when splitting.
|
||||||
|
* Choices: "removed", "isolated", "mergedWithPrevious", "mergedWithNext",
|
||||||
|
* "contiguous"
|
||||||
*/
|
*/
|
||||||
export function punctuationPreTokenizer(): PreTokenizer;
|
export function punctuationPreTokenizer(behavior?: string): PreTokenizer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a new Sequence PreTokenizer.
|
* Returns a new Sequence PreTokenizer.
|
||||||
|
@ -43,6 +43,11 @@ describe("punctuationPreTokenizer", () => {
|
|||||||
const processor = punctuationPreTokenizer();
|
const processor = punctuationPreTokenizer();
|
||||||
expect(processor.constructor.name).toEqual("PreTokenizer");
|
expect(processor.constructor.name).toEqual("PreTokenizer");
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it("instantiates correctly with non-default split delimiter behavior", () => {
|
||||||
|
const processor = punctuationPreTokenizer("removed");
|
||||||
|
expect(processor.constructor.name).toEqual("PreTokenizer");
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
describe("splitPreTokenizer", () => {
|
describe("splitPreTokenizer", () => {
|
||||||
|
@ -203,9 +203,15 @@ fn split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
|||||||
|
|
||||||
/// punctuation()
|
/// punctuation(behavior: string = "isolated")
|
||||||
fn punctuation(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
fn punctuation(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
|
||||||
|
let behavior: JsSplitDelimiterBehavior = cx
|
||||||
|
.extract_opt::<JsSplitDelimiterBehavior>(0)?
|
||||||
|
.unwrap_or(JsSplitDelimiterBehavior(SplitDelimiterBehavior::Isolated));
|
||||||
|
|
||||||
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
|
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
|
||||||
let guard = cx.lock();
|
let guard = cx.lock();
|
||||||
pretok.borrow_mut(&guard).pretok = Some(tk::pre_tokenizers::punctuation::Punctuation.into());
|
pretok.borrow_mut(&guard).pretok =
|
||||||
|
Some(tk::pre_tokenizers::punctuation::Punctuation::new(behavior.into()).into());
|
||||||
|
|
||||||
Ok(pretok)
|
Ok(pretok)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
|
|||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- [#657]: Add SplitDelimiterBehavior customization to Punctuation constructor
|
||||||
|
|
||||||
## [0.10.3]
|
## [0.10.3]
|
||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
@ -326,6 +331,7 @@ delimiter (Works like `.split(delimiter)`)
|
|||||||
[#693]: https://github.com/huggingface/tokenizers/pull/693
|
[#693]: https://github.com/huggingface/tokenizers/pull/693
|
||||||
[#686]: https://github.com/huggingface/tokenizers/pull/686
|
[#686]: https://github.com/huggingface/tokenizers/pull/686
|
||||||
[#674]: https://github.com/huggingface/tokenizers/pull/674
|
[#674]: https://github.com/huggingface/tokenizers/pull/674
|
||||||
|
[#657]: https://github.com/huggingface/tokenizers/pull/657
|
||||||
[#656]: https://github.com/huggingface/tokenizers/pull/656
|
[#656]: https://github.com/huggingface/tokenizers/pull/656
|
||||||
[#652]: https://github.com/huggingface/tokenizers/pull/652
|
[#652]: https://github.com/huggingface/tokenizers/pull/652
|
||||||
[#621]: https://github.com/huggingface/tokenizers/pull/621
|
[#621]: https://github.com/huggingface/tokenizers/pull/621
|
||||||
|
@ -308,10 +308,16 @@ class Metaspace(PreTokenizer):
|
|||||||
|
|
||||||
class Punctuation(PreTokenizer):
|
class Punctuation(PreTokenizer):
|
||||||
"""
|
"""
|
||||||
This pre-tokenizer simply splits on punctuation as individual characters.`
|
This pre-tokenizer simply splits on punctuation as individual characters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||||
|
The behavior to use when splitting.
|
||||||
|
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
|
||||||
|
"contiguous"
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, behavior="isolated"):
|
||||||
pass
|
pass
|
||||||
def pre_tokenize(self, pretok):
|
def pre_tokenize(self, pretok):
|
||||||
"""
|
"""
|
||||||
|
@ -6,6 +6,7 @@ use pyo3::types::*;
|
|||||||
use serde::ser::SerializeStruct;
|
use serde::ser::SerializeStruct;
|
||||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||||
|
|
||||||
|
use tk::normalizer::SplitDelimiterBehavior;
|
||||||
use tk::pre_tokenizers::bert::BertPreTokenizer;
|
use tk::pre_tokenizers::bert::BertPreTokenizer;
|
||||||
use tk::pre_tokenizers::byte_level::ByteLevel;
|
use tk::pre_tokenizers::byte_level::ByteLevel;
|
||||||
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
|
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
|
||||||
@ -384,15 +385,22 @@ impl PyBertPreTokenizer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This pre-tokenizer simply splits on punctuation as individual characters.`
|
/// This pre-tokenizer simply splits on punctuation as individual characters.
|
||||||
|
///
|
||||||
|
/// Args:
|
||||||
|
/// behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||||
|
/// The behavior to use when splitting.
|
||||||
|
/// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
|
||||||
|
/// "contiguous"
|
||||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
|
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
|
||||||
#[text_signature = "(self)"]
|
#[text_signature = "(self, behavior=\"isolated\")"]
|
||||||
pub struct PyPunctuation {}
|
pub struct PyPunctuation {}
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl PyPunctuation {
|
impl PyPunctuation {
|
||||||
#[new]
|
#[new]
|
||||||
fn new() -> (Self, PyPreTokenizer) {
|
#[args(behavior = "PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)")]
|
||||||
(PyPunctuation {}, Punctuation.into())
|
fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
|
||||||
|
(PyPunctuation {}, Punctuation::new(behavior.into()).into())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -92,7 +92,7 @@ impl PyRange<'_> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct PySplitDelimiterBehavior(SplitDelimiterBehavior);
|
pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);
|
||||||
|
|
||||||
impl FromPyObject<'_> for PySplitDelimiterBehavior {
|
impl FromPyObject<'_> for PySplitDelimiterBehavior {
|
||||||
fn extract(obj: &PyAny) -> PyResult<Self> {
|
fn extract(obj: &PyAny) -> PyResult<Self> {
|
||||||
|
@ -132,6 +132,7 @@ class TestCharDelimiterSplit:
|
|||||||
class TestPunctuation:
|
class TestPunctuation:
|
||||||
def test_instantiate(self):
|
def test_instantiate(self):
|
||||||
assert Punctuation() is not None
|
assert Punctuation() is not None
|
||||||
|
assert Punctuation("removed") is not None
|
||||||
assert isinstance(Punctuation(), PreTokenizer)
|
assert isinstance(Punctuation(), PreTokenizer)
|
||||||
assert isinstance(Punctuation(), Punctuation)
|
assert isinstance(Punctuation(), Punctuation)
|
||||||
assert isinstance(pickle.loads(pickle.dumps(Punctuation())), Punctuation)
|
assert isinstance(pickle.loads(pickle.dumps(Punctuation())), Punctuation)
|
||||||
|
@ -1,3 +1,5 @@
|
|||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
|
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
|
||||||
use unicode_categories::UnicodeCategories;
|
use unicode_categories::UnicodeCategories;
|
||||||
|
|
||||||
@ -5,13 +7,27 @@ fn is_punc(x: char) -> bool {
|
|||||||
char::is_ascii_punctuation(&x) || x.is_punctuation()
|
char::is_ascii_punctuation(&x) || x.is_punctuation()
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Copy, Clone, Debug)]
|
#[derive(Serialize, Deserialize, Copy, Clone, Debug)]
|
||||||
pub struct Punctuation;
|
#[serde(tag = "type")]
|
||||||
impl_serde_unit_struct!(PunctuationVisitor, Punctuation);
|
pub struct Punctuation {
|
||||||
|
behavior: SplitDelimiterBehavior,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Punctuation {
|
||||||
|
pub fn new(behavior: SplitDelimiterBehavior) -> Self {
|
||||||
|
Self { behavior }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for Punctuation {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new(SplitDelimiterBehavior::Isolated)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl PreTokenizer for Punctuation {
|
impl PreTokenizer for Punctuation {
|
||||||
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
|
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
|
||||||
pretokenized.split(|_, s| s.split(is_punc, SplitDelimiterBehavior::Isolated))
|
pretokenized.split(|_, s| s.split(is_punc, self.behavior))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -22,7 +38,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn punctuation_basic() {
|
fn punctuation_basic() {
|
||||||
let pretok = Punctuation;
|
let pretok = Punctuation::default();
|
||||||
let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into();
|
let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into();
|
||||||
pretok.pre_tokenize(&mut pretokenized).unwrap();
|
pretok.pre_tokenize(&mut pretokenized).unwrap();
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
@ -33,7 +33,7 @@ mod tests {
|
|||||||
fn sequence_basic() {
|
fn sequence_basic() {
|
||||||
let pretokenizers = vec![
|
let pretokenizers = vec![
|
||||||
PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
|
PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
|
||||||
PreTokenizerWrapper::Punctuation(Punctuation),
|
PreTokenizerWrapper::Punctuation(Punctuation::default()),
|
||||||
];
|
];
|
||||||
let pretok = Sequence::new(pretokenizers);
|
let pretok = Sequence::new(pretokenizers);
|
||||||
let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into();
|
let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into();
|
||||||
|
Reference in New Issue
Block a user