Add SplitDelimiterBehavior to Punctuation constructor (#657)

Resolves: #642
This commit is contained in:
Vlad Artamonov
2021-08-13 09:19:23 -04:00
committed by GitHub
parent c1100dcbe3
commit e2bf8daa3a
10 changed files with 69 additions and 17 deletions

View File

@ -90,10 +90,14 @@ export function charDelimiterSplitPreTokenizer(delimiter: string): PreTokenizer;
/** /**
* Returns a new Punctuation PreTokenizer. * Returns a new Punctuation PreTokenizer.
* This pre-tokenizer splits tokens on punctuation. * This pre-tokenizer splits tokens on punctuation according to the provided behavior.
* Each occurrence of a punctuation character will be treated separately. * Each occurrence of a punctuation character is treated separately.
*
* @param [behavior="isolated"] The behavior to use when splitting.
* Choices: "removed", "isolated", "mergedWithPrevious", "mergedWithNext",
* "contiguous"
*/ */
export function punctuationPreTokenizer(): PreTokenizer; export function punctuationPreTokenizer(behavior?: string): PreTokenizer;
/** /**
* Returns a new Sequence PreTokenizer. * Returns a new Sequence PreTokenizer.

View File

@ -43,6 +43,11 @@ describe("punctuationPreTokenizer", () => {
const processor = punctuationPreTokenizer(); const processor = punctuationPreTokenizer();
expect(processor.constructor.name).toEqual("PreTokenizer"); expect(processor.constructor.name).toEqual("PreTokenizer");
}); });
// Passing an explicit behavior ("removed") instead of relying on the default
// must still yield a PreTokenizer instance.
it("instantiates correctly with non-default split delimiter", () => {
  const processor = punctuationPreTokenizer("removed");
  expect(processor.constructor.name).toEqual("PreTokenizer");
});
}); });
describe("splitPreTokenizer", () => { describe("splitPreTokenizer", () => {

View File

@ -203,9 +203,15 @@ fn split(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
/// punctuation() /// punctuation()
fn punctuation(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> { fn punctuation(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
let behavior: JsSplitDelimiterBehavior = cx
.extract_opt::<JsSplitDelimiterBehavior>(0)?
.unwrap_or(JsSplitDelimiterBehavior(SplitDelimiterBehavior::Isolated));
let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?; let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
let guard = cx.lock(); let guard = cx.lock();
pretok.borrow_mut(&guard).pretok = Some(tk::pre_tokenizers::punctuation::Punctuation.into()); pretok.borrow_mut(&guard).pretok =
Some(tk::pre_tokenizers::punctuation::Punctuation::new(behavior.into()).into());
Ok(pretok) Ok(pretok)
} }

View File

@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [Unreleased]
### Added
- [#657]: Add SplitDelimiterBehavior customization to Punctuation constructor
## [0.10.3] ## [0.10.3]
### Fixed ### Fixed
@ -326,6 +331,7 @@ delimiter (Works like `.split(delimiter)`)
[#693]: https://github.com/huggingface/tokenizers/pull/693 [#693]: https://github.com/huggingface/tokenizers/pull/693
[#686]: https://github.com/huggingface/tokenizers/pull/686 [#686]: https://github.com/huggingface/tokenizers/pull/686
[#674]: https://github.com/huggingface/tokenizers/pull/674 [#674]: https://github.com/huggingface/tokenizers/pull/674
[#657]: https://github.com/huggingface/tokenizers/pull/657
[#656]: https://github.com/huggingface/tokenizers/pull/656 [#656]: https://github.com/huggingface/tokenizers/pull/656
[#652]: https://github.com/huggingface/tokenizers/pull/652 [#652]: https://github.com/huggingface/tokenizers/pull/652
[#621]: https://github.com/huggingface/tokenizers/pull/621 [#621]: https://github.com/huggingface/tokenizers/pull/621

View File

@ -308,10 +308,16 @@ class Metaspace(PreTokenizer):
class Punctuation(PreTokenizer): class Punctuation(PreTokenizer):
""" """
This pre-tokenizer simply splits on punctuation as individual characters.` This pre-tokenizer simply splits on punctuation as individual characters.
Args:
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
The behavior to use when splitting.
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
"contiguous"
""" """
def __init__(self): def __init__(self, behavior="isolated"):
pass pass
def pre_tokenize(self, pretok): def pre_tokenize(self, pretok):
""" """

View File

@ -6,6 +6,7 @@ use pyo3::types::*;
use serde::ser::SerializeStruct; use serde::ser::SerializeStruct;
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
use tk::normalizer::SplitDelimiterBehavior;
use tk::pre_tokenizers::bert::BertPreTokenizer; use tk::pre_tokenizers::bert::BertPreTokenizer;
use tk::pre_tokenizers::byte_level::ByteLevel; use tk::pre_tokenizers::byte_level::ByteLevel;
use tk::pre_tokenizers::delimiter::CharDelimiterSplit; use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
@ -384,15 +385,22 @@ impl PyBertPreTokenizer {
} }
} }
/// This pre-tokenizer simply splits on punctuation as individual characters.` /// This pre-tokenizer simply splits on punctuation as individual characters.
///
/// Args:
/// behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
/// The behavior to use when splitting.
/// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
/// "contiguous"
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)] #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
#[text_signature = "(self)"] #[text_signature = "(self, behavior=\"isolated\")"]
pub struct PyPunctuation {} pub struct PyPunctuation {}
#[pymethods] #[pymethods]
impl PyPunctuation { impl PyPunctuation {
#[new] #[new]
fn new() -> (Self, PyPreTokenizer) { #[args(behavior = "PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)")]
(PyPunctuation {}, Punctuation.into()) fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
(PyPunctuation {}, Punctuation::new(behavior.into()).into())
} }
} }

View File

@ -92,7 +92,7 @@ impl PyRange<'_> {
} }
#[derive(Clone)] #[derive(Clone)]
pub struct PySplitDelimiterBehavior(SplitDelimiterBehavior); pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);
impl FromPyObject<'_> for PySplitDelimiterBehavior { impl FromPyObject<'_> for PySplitDelimiterBehavior {
fn extract(obj: &PyAny) -> PyResult<Self> { fn extract(obj: &PyAny) -> PyResult<Self> {

View File

@ -132,6 +132,7 @@ class TestCharDelimiterSplit:
class TestPunctuation: class TestPunctuation:
def test_instantiate(self): def test_instantiate(self):
assert Punctuation() is not None assert Punctuation() is not None
assert Punctuation("removed") is not None
assert isinstance(Punctuation(), PreTokenizer) assert isinstance(Punctuation(), PreTokenizer)
assert isinstance(Punctuation(), Punctuation) assert isinstance(Punctuation(), Punctuation)
assert isinstance(pickle.loads(pickle.dumps(Punctuation())), Punctuation) assert isinstance(pickle.loads(pickle.dumps(Punctuation())), Punctuation)

View File

@ -1,3 +1,5 @@
use serde::{Deserialize, Serialize};
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
use unicode_categories::UnicodeCategories; use unicode_categories::UnicodeCategories;
@ -5,13 +7,27 @@ fn is_punc(x: char) -> bool {
char::is_ascii_punctuation(&x) || x.is_punctuation() char::is_ascii_punctuation(&x) || x.is_punctuation()
} }
#[derive(Copy, Clone, Debug)] #[derive(Serialize, Deserialize, Copy, Clone, Debug)]
pub struct Punctuation; #[serde(tag = "type")]
impl_serde_unit_struct!(PunctuationVisitor, Punctuation); pub struct Punctuation {
behavior: SplitDelimiterBehavior,
}
impl Punctuation {
pub fn new(behavior: SplitDelimiterBehavior) -> Self {
Self { behavior }
}
}
impl Default for Punctuation {
    /// Returns a `Punctuation` pre-tokenizer configured with
    /// `SplitDelimiterBehavior::Isolated`.
    fn default() -> Self {
        Self {
            behavior: SplitDelimiterBehavior::Isolated,
        }
    }
}
impl PreTokenizer for Punctuation { impl PreTokenizer for Punctuation {
fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> { fn pre_tokenize(&self, pretokenized: &mut PreTokenizedString) -> Result<()> {
pretokenized.split(|_, s| s.split(is_punc, SplitDelimiterBehavior::Isolated)) pretokenized.split(|_, s| s.split(is_punc, self.behavior))
} }
} }
@ -22,7 +38,7 @@ mod tests {
#[test] #[test]
fn punctuation_basic() { fn punctuation_basic() {
let pretok = Punctuation; let pretok = Punctuation::default();
let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into(); let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into();
pretok.pre_tokenize(&mut pretokenized).unwrap(); pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!( assert_eq!(

View File

@ -33,7 +33,7 @@ mod tests {
fn sequence_basic() { fn sequence_basic() {
let pretokenizers = vec![ let pretokenizers = vec![
PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit), PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit),
PreTokenizerWrapper::Punctuation(Punctuation), PreTokenizerWrapper::Punctuation(Punctuation::default()),
]; ];
let pretok = Sequence::new(pretokenizers); let pretok = Sequence::new(pretokenizers);
let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into(); let mut pretokenized: PreTokenizedString = "Hey friend! How are you?!?".into();