diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
index 80d671e3..2a4e22a1 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.py
@@ -10,3 +10,4 @@ BertPreTokenizer = pre_tokenizers.BertPreTokenizer
 Metaspace = pre_tokenizers.Metaspace
 CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
 Digits = pre_tokenizers.Digits
+UnicodeScripts = pre_tokenizers.UnicodeScripts
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index 3c9716ae..931e2d3a 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -148,3 +148,16 @@ class Digits(PreTokenizer):
 
         """
         pass
+
+class UnicodeScripts(PreTokenizer):
+    """UnicodeScripts PreTokenizer
+
+    This pre-tokenizer splits on characters that belong to different language families.
+    It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
+    In practice, Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
+    This mimics the SentencePiece Unigram implementation.
+    """
+
+    def __init__(self) -> None:
+        """ Instantiate a new UnicodeScripts """
+        pass
diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs
index 643476fd..7cd727ff 100644
--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@@ -72,6 +72,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PyPunctuation>()?;
     m.add_class::<pre_tokenizers::PySequence>()?;
     m.add_class::<pre_tokenizers::PyDigits>()?;
+    m.add_class::<pre_tokenizers::PyUnicodeScripts>()?;
     Ok(())
 }
 
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index c4c669a9..fcfea906 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -12,6 +12,7 @@ use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
 use tk::pre_tokenizers::digits::Digits;
 use tk::pre_tokenizers::metaspace::Metaspace;
 use tk::pre_tokenizers::punctuation::Punctuation;
+use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use tk::pre_tokenizers::PreTokenizerWrapper;
 use tk::tokenizer::Offsets;
@@ -70,6 +71,9 @@ impl PyPreTokenizer {
                     Py::new(py, (PyBertPreTokenizer {}, base))?.into_py(py)
                 }
                 PreTokenizerWrapper::Digits(_) => Py::new(py, (PyDigits {}, base))?.into_py(py),
+                PreTokenizerWrapper::UnicodeScripts(_) => {
+                    Py::new(py, (PyUnicodeScripts {}, base))?.into_py(py)
+                }
             },
         },
     })
@@ -297,6 +301,16 @@ impl PyDigits {
     }
 }
 
+#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=UnicodeScripts)]
+pub struct PyUnicodeScripts {}
+#[pymethods]
+impl PyUnicodeScripts {
+    #[new]
+    fn new() -> PyResult<(Self, PyPreTokenizer)> {
+        Ok((PyUnicodeScripts {}, UnicodeScripts::new().into()))
+    }
+}
+
 #[derive(Clone)]
 pub(crate) struct CustomPreTokenizer {
     inner: PyObject,
diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py
index 3a48c649..02622419 100644
--- a/bindings/python/tests/bindings/test_pre_tokenizers.py
+++ b/bindings/python/tests/bindings/test_pre_tokenizers.py
@@ -12,6 +12,7 @@ from tokenizers.pre_tokenizers import (
     Punctuation,
     Sequence,
     Digits,
+    UnicodeScripts,
 )
 
 
@@ -121,6 +122,14 @@ class TestDigits:
         assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
 
 
+class TestUnicodeScripts:
+    def test_instantiate(self):
+        assert UnicodeScripts() is not None
+        assert isinstance(UnicodeScripts(), PreTokenizer)
+        assert isinstance(UnicodeScripts(), UnicodeScripts)
+        assert isinstance(pickle.loads(pickle.dumps(UnicodeScripts())), UnicodeScripts)
+
+
 class TestCustomPreTokenizer:
     class BadCustomPretok:
         def pre_tokenize(self, pretok, wrong):
diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs
index 3e71befe..7197d3d4 100644
--- a/tokenizers/src/pre_tokenizers/mod.rs
+++ b/tokenizers/src/pre_tokenizers/mod.rs
@@ -5,6 +5,7 @@ pub mod digits;
 pub mod metaspace;
 pub mod punctuation;
 pub mod sequence;
+pub mod unicode_scripts;
 pub mod whitespace;
 
 use serde::{Deserialize, Serialize};
@@ -16,6 +17,7 @@ use crate::pre_tokenizers::digits::Digits;
 use crate::pre_tokenizers::metaspace::Metaspace;
 use crate::pre_tokenizers::punctuation::Punctuation;
 use crate::pre_tokenizers::sequence::Sequence;
+use crate::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use crate::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use crate::{PreTokenizedString, PreTokenizer};
 
@@ -31,6 +33,7 @@ pub enum PreTokenizerWrapper {
     Punctuation(Punctuation),
     WhitespaceSplit(WhitespaceSplit),
     Digits(Digits),
+    UnicodeScripts(UnicodeScripts),
 }
 
 impl PreTokenizer for PreTokenizerWrapper {
@@ -45,6 +48,7 @@
             PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Digits(wspt) => wspt.pre_tokenize(normalized),
+            PreTokenizerWrapper::UnicodeScripts(us) => us.pre_tokenize(normalized),
         }
     }
 }
@@ -58,3 +62,4 @@ impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
 impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);
 impl_enum_from!(WhitespaceSplit, PreTokenizerWrapper, WhitespaceSplit);
 impl_enum_from!(Digits, PreTokenizerWrapper, Digits);
+impl_enum_from!(UnicodeScripts, PreTokenizerWrapper, UnicodeScripts);
diff --git a/tokenizers/src/pre_tokenizers/unicode_scripts/mod.rs b/tokenizers/src/pre_tokenizers/unicode_scripts/mod.rs
new file mode 100644
index 00000000..5e6e7eb3
--- /dev/null
+++ b/tokenizers/src/pre_tokenizers/unicode_scripts/mod.rs
@@ -0,0 +1,5 @@
+mod pre_tokenizer;
+mod scripts;
+
+// Re-export the PreTokenizer
+pub use pre_tokenizer::UnicodeScripts;
diff --git a/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs b/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs
index 15c84ded..88413ec5 100644
--- a/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs
+++ b/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs
@@ -79,6 +79,7 @@ impl PreTokenizer for UnicodeScripts {
 mod tests {
     use super::*;
     use crate::OffsetReferential;
+    use crate::OffsetType;
 
     #[test]
     fn basic() {
@@ -87,7 +88,7 @@
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
             pretokenized
-                .get_splits(OffsetReferential::Normalized)
+                .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
                 .into_iter()
                 .map(|(s, o, _)| (s, o))
                 .collect::<Vec<_>>(),
@@ -95,7 +96,7 @@
         );
         assert_eq!(
             pretokenized
-                .get_splits(OffsetReferential::Original)
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                 .into_iter()
                 .map(|(s, o, _)| (s, o))
                 .collect::<Vec<_>>(),
@@ -110,7 +111,7 @@
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
             pretokenized
-                .get_splits(OffsetReferential::Normalized)
+                .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
                 .into_iter()
                 .map(|(s, o, _)| (s, o))
                 .collect::<Vec<_>>(),
@@ -118,7 +119,7 @@
         );
         assert_eq!(
             pretokenized
-                .get_splits(OffsetReferential::Original)
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                 .into_iter()
                 .map(|(s, o, _)| (s, o))
                 .collect::<Vec<_>>(),
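
For reviewers who want to try the new pre-tokenizer from Python, here is a minimal usage sketch. It assumes a build of the bindings that includes this patch; the `Unigram` model is only an illustrative choice and is not part of the diff.

```python
import pickle

from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.pre_tokenizers import UnicodeScripts

# Attach the new pre-tokenizer to a tokenizer. Any model works; Unigram is used
# here only because the docstring says the splitting mimics SentencePiece Unigram.
tokenizer = Tokenizer(Unigram())
tokenizer.pre_tokenizer = UnicodeScripts()

# The binding round-trips through pickle, mirroring what test_instantiate checks.
restored = pickle.loads(pickle.dumps(UnicodeScripts()))
assert isinstance(restored, UnicodeScripts)
```

Since `UnicodeScripts` takes no arguments there is nothing else to configure on the Python side; the splitting behavior itself is exercised by the Rust unit tests in this diff.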