Mirror of https://github.com/mii443/tokenizers.git (last synced 2025-12-03 19:28:20 +00:00)
Add SplitDelimiterBehavior to Punctuation constructor (#657)
Resolves: #642
This commit is contained in:
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- [#657]: Add SplitDelimiterBehavior customization to Punctuation constructor
|
||||
|
||||
## [0.10.3]
|
||||
|
||||
### Fixed
|
||||
@@ -326,6 +331,7 @@ delimiter (Works like `.split(delimiter)`)
|
||||
[#693]: https://github.com/huggingface/tokenizers/pull/693
|
||||
[#686]: https://github.com/huggingface/tokenizers/pull/686
|
||||
[#674]: https://github.com/huggingface/tokenizers/pull/674
|
||||
[#657]: https://github.com/huggingface/tokenizers/pull/657
|
||||
[#656]: https://github.com/huggingface/tokenizers/pull/656
|
||||
[#652]: https://github.com/huggingface/tokenizers/pull/652
|
||||
[#621]: https://github.com/huggingface/tokenizers/pull/621
|
||||
|
||||
@@ -308,10 +308,16 @@ class Metaspace(PreTokenizer):
|
||||
|
||||
class Punctuation(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits on punctuation as individual characters.`
|
||||
This pre-tokenizer simply splits on punctuation as individual characters.
|
||||
|
||||
Args:
|
||||
behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||
The behavior to use when splitting.
|
||||
Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
|
||||
"contiguous"
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, behavior="isolated"):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
|
||||
@@ -6,6 +6,7 @@ use pyo3::types::*;
|
||||
use serde::ser::SerializeStruct;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
|
||||
use tk::normalizer::SplitDelimiterBehavior;
|
||||
use tk::pre_tokenizers::bert::BertPreTokenizer;
|
||||
use tk::pre_tokenizers::byte_level::ByteLevel;
|
||||
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
|
||||
@@ -384,15 +385,22 @@ impl PyBertPreTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// This pre-tokenizer simply splits on punctuation as individual characters.`
|
||||
/// This pre-tokenizer simply splits on punctuation as individual characters.
|
||||
///
|
||||
/// Args:
|
||||
/// behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
|
||||
/// The behavior to use when splitting.
|
||||
/// Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next",
|
||||
/// "contiguous"
|
||||
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=Punctuation)]
|
||||
#[text_signature = "(self)"]
|
||||
#[text_signature = "(self, behavior=\"isolated\")"]
|
||||
pub struct PyPunctuation {}
|
||||
#[pymethods]
|
||||
impl PyPunctuation {
|
||||
#[new]
|
||||
fn new() -> (Self, PyPreTokenizer) {
|
||||
(PyPunctuation {}, Punctuation.into())
|
||||
#[args(behavior = "PySplitDelimiterBehavior(SplitDelimiterBehavior::Isolated)")]
|
||||
fn new(behavior: PySplitDelimiterBehavior) -> (Self, PyPreTokenizer) {
|
||||
(PyPunctuation {}, Punctuation::new(behavior.into()).into())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -92,7 +92,7 @@ impl PyRange<'_> {
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PySplitDelimiterBehavior(SplitDelimiterBehavior);
|
||||
pub struct PySplitDelimiterBehavior(pub SplitDelimiterBehavior);
|
||||
|
||||
impl FromPyObject<'_> for PySplitDelimiterBehavior {
|
||||
fn extract(obj: &PyAny) -> PyResult<Self> {
|
||||
|
||||
@@ -132,6 +132,7 @@ class TestCharDelimiterSplit:
|
||||
class TestPunctuation:
|
||||
def test_instantiate(self):
|
||||
assert Punctuation() is not None
|
||||
assert Punctuation("removed") is not None
|
||||
assert isinstance(Punctuation(), PreTokenizer)
|
||||
assert isinstance(Punctuation(), Punctuation)
|
||||
assert isinstance(pickle.loads(pickle.dumps(Punctuation())), Punctuation)
|
||||
|
||||
Reference in New Issue
Block a user