Add SplitDelimiterBehavior to Punctuation constructor (#657)

Resolves: #642
2025-12-03 19:28:20 +00:00 · 2021-08-13 09:19:23 -04:00
parent c1100dcbe3
commit e2bf8daa3a
10 changed files with 69 additions and 17 deletions
--- a/bindings/python/CHANGELOG.md
+++ b/bindings/python/CHANGELOG.md
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

+## [Unreleased] 
+
+### Added
+- [#657]: Add SplitDelimiterBehavior customization to Punctuation constructor
+
 ## [0.10.3]

 ### Fixed
@@ -326,6 +331,7 @@ delimiter (Works like `.split(delimiter)`)
 [#693]: https://github.com/huggingface/tokenizers/pull/693
 [#686]: https://github.com/huggingface/tokenizers/pull/686
 [#674]: https://github.com/huggingface/tokenizers/pull/674
+[#657]: https://github.com/huggingface/tokenizers/pull/657
 [#656]: https://github.com/huggingface/tokenizers/pull/656
 [#652]: https://github.com/huggingface/tokenizers/pull/652
 [#621]: https://github.com/huggingface/tokenizers/pull/621
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -308,10 +308,16 @@ class Metaspace(PreTokenizer):

 class Punctuation(PreTokenizer):
    """
-    This pre-tokenizer simply splits on punctuation as individual characters.`
+    This pre-tokenizer simply splits on punctuation as individual characters.
+
+    Args:
+        behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+            The behavior to use when splitting.
+            Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
+            "contiguous"
    """

-    def __init__(self):
+    def __init__(self, behavior="isolated"):
        pass
    def pre_tokenize(self, pretok):
        """
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -6,6 +6,7 @@ use pyo3::types::*;
 use serde::ser::SerializeStruct;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};

+use tk::normalizer::SplitDelimiterBehavior;
 use tk::pre_tokenizers::bert::BertPreTokenizer;
 use tk::pre_tokenizers::byte_level::ByteLevel;
 use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
@@ -384,15 +385,22 @@ impl PyBertPreTokenizer {
    }
 }

-/// This pre-tokenizer simply splits on punctuation as individual characters.`
+/// This pre-tokenizer simply splits on punctuation as individual characters.
+///
+/// Args:
+///     behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
+///         The behavior to use when splitting.
+///         Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
+///         "contiguous"
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
-#[text_signature = "(self)"]
+#[text_signature = "(self, behavior=\"isolated\")"]
 pub struct PyPunctuation {}
 #[pymethods]
 impl PyPunctuation {
    #[new]
-    fn new() -> (Self, PyPreTokenizer) {
-        (PyPunctuation {}, Punctuation.into())
+    #[args(behavior = "PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)")]
+    fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
+        (PyPunctuation {}, Punctuation::new(behavior.into()).into())
    }
 }

--- a/bindings/python/src/utils/normalization.rs
+++ b/bindings/python/src/utils/normalization.rs
@@ -92,7 +92,7 @@ impl PyRange<'_> {
 }

 #[derive(Clone)]
-pub struct PySplitDelimiterBehavior(SplitDelimiterBehavior);
+pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);

 impl FromPyObject<'_> for PySplitDelimiterBehavior {
    fn extract(obj: &PyAny) -> PyResult<Self> {
--- a/bindings/python/tests/bindings/test_pre_tokenizers.py
+++ b/bindings/python/tests/bindings/test_pre_tokenizers.py
@@ -132,6 +132,7 @@ class TestCharDelimiterSplit:
 class TestPunctuation:
    def test_instantiate(self):
        assert Punctuation() is not None
+        assert Punctuation("removed") is not None
        assert isinstance(Punctuation(), PreTokenizer)
        assert isinstance(Punctuation(), Punctuation)
        assert isinstance(pickle.loads(pickle.dumps(Punctuation())), Punctuation)