Finish exposing the UnicodeScripts PreTokenizer
@@ -10,3 +10,4 @@ BertPreTokenizer = pre_tokenizers.BertPreTokenizer
 Metaspace = pre_tokenizers.Metaspace
 CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
 Digits = pre_tokenizers.Digits
+UnicodeScripts = pre_tokenizers.UnicodeScripts
@@ -148,3 +148,16 @@ class Digits(PreTokenizer):
 
         """
         pass
+
+class UnicodeScripts(PreTokenizer):
+    """UnicodeScripts PreTokenizer
+
+    This pre-tokenizer splits on characters that belong to different language family
+    It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
+    Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too.
+    This mimicks SentencePiece Unigram implementation.
+    """
+
+    def __init__(self) -> None:
+        """ Instantiate a new UnicodeScripts """
+        pass
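
The docstring above is the whole user-facing contract: input is split wherever the Unicode script changes, with Hiragana and Katakana folded into Han (and U+30FC treated as Han) so Japanese text stays together, mirroring SentencePiece's Unigram pre-segmentation. A minimal usage sketch of the newly exposed class, assuming a build of the Python bindings where PreTokenizer.pre_tokenize_str is available; the input string and printed output are illustrative, not taken from this commit's tests:

from tokenizers.pre_tokenizers import UnicodeScripts

pretok = UnicodeScripts()

# pre_tokenize_str returns a list of (piece, (start, end)) pairs.
# Mixed-script input should come back grouped by script, e.g. the Latin
# and Japanese runs below ending up in separate pieces.
for piece, offsets in pretok.pre_tokenize_str("Hello どこで生れ 123"):
    print(piece, offsets)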
@@ -72,6 +72,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<pre_tokenizers::PyPunctuation>()?;
     m.add_class::<pre_tokenizers::PySequence>()?;
     m.add_class::<pre_tokenizers::PyDigits>()?;
+    m.add_class::<pre_tokenizers::PyUnicodeScripts>()?;
     Ok(())
 }
 
@@ -12,6 +12,7 @@ use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
 use tk::pre_tokenizers::digits::Digits;
 use tk::pre_tokenizers::metaspace::Metaspace;
 use tk::pre_tokenizers::punctuation::Punctuation;
+use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use tk::pre_tokenizers::PreTokenizerWrapper;
 use tk::tokenizer::Offsets;
@@ -70,6 +71,9 @@ impl PyPreTokenizer {
                     Py::new(py, (PyBertPreTokenizer {}, base))?.into_py(py)
                 }
                 PreTokenizerWrapper::Digits(_) => Py::new(py, (PyDigits {}, base))?.into_py(py),
+                PreTokenizerWrapper::UnicodeScripts(_) => {
+                    Py::new(py, (PyUnicodeScripts {}, base))?.into_py(py)
+                }
             },
         },
     })
@@ -297,6 +301,16 @@ impl PyDigits {
     }
 }
 
+#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=UnicodeScripts)]
+pub struct PyUnicodeScripts {}
+#[pymethods]
+impl PyUnicodeScripts {
+    #[new]
+    fn new() -> PyResult<(Self, PyPreTokenizer)> {
+        Ok((PyUnicodeScripts {}, UnicodeScripts::new().into()))
+    }
+}
+
 #[derive(Clone)]
 pub(crate) struct CustomPreTokenizer {
     inner: PyObject,
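
With PyUnicodeScripts registered on the module and given a #[new] constructor, the class behaves like any other pre-tokenizer from Python, including composition through the existing Sequence wrapper. A sketch of that kind of chaining (the pairing with Whitespace is only an illustration, not something this commit adds):

from tokenizers import pre_tokenizers

# Hypothetical composition: split on script boundaries first, then on
# whitespace/word boundaries within each script run.
pretok = pre_tokenizers.Sequence(
    [
        pre_tokenizers.UnicodeScripts(),
        pre_tokenizers.Whitespace(),
    ]
)

print(pretok.pre_tokenize_str("Hello world どこで生れ"))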
@@ -12,6 +12,7 @@ from tokenizers.pre_tokenizers import (
     Punctuation,
     Sequence,
     Digits,
+    UnicodeScripts,
 )
 
 
@@ -121,6 +122,14 @@ class TestDigits:
         assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
 
 
+class TestUnicodeScripts:
+    def test_instantiate(self):
+        assert UnicodeScripts() is not None
+        assert isinstance(UnicodeScripts(), PreTokenizer)
+        assert isinstance(UnicodeScripts(), UnicodeScripts)
+        assert isinstance(pickle.loads(pickle.dumps(UnicodeScripts())), UnicodeScripts)
+
+
 class TestCustomPreTokenizer:
     class BadCustomPretok:
         def pre_tokenize(self, pretok, wrong):
@@ -5,6 +5,7 @@ pub mod digits;
 pub mod metaspace;
 pub mod punctuation;
 pub mod sequence;
+pub mod unicode_scripts;
 pub mod whitespace;
 
 use serde::{Deserialize, Serialize};
@@ -16,6 +17,7 @@ use crate::pre_tokenizers::digits::Digits;
 use crate::pre_tokenizers::metaspace::Metaspace;
 use crate::pre_tokenizers::punctuation::Punctuation;
 use crate::pre_tokenizers::sequence::Sequence;
+use crate::pre_tokenizers::unicode_scripts::UnicodeScripts;
 use crate::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
 use crate::{PreTokenizedString, PreTokenizer};
 
@@ -31,6 +33,7 @@ pub enum PreTokenizerWrapper {
     Punctuation(Punctuation),
     WhitespaceSplit(WhitespaceSplit),
     Digits(Digits),
+    UnicodeScripts(UnicodeScripts),
 }
 
 impl PreTokenizer for PreTokenizerWrapper {
@@ -45,6 +48,7 @@ impl PreTokenizer for PreTokenizerWrapper {
             PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
             PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
             PreTokenizerWrapper::Digits(wspt) => wspt.pre_tokenize(normalized),
+            PreTokenizerWrapper::UnicodeScripts(us) => us.pre_tokenize(normalized),
         }
     }
 }
@@ -58,3 +62,4 @@ impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
 impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);
 impl_enum_from!(WhitespaceSplit, PreTokenizerWrapper, WhitespaceSplit);
 impl_enum_from!(Digits, PreTokenizerWrapper, Digits);
+impl_enum_from!(UnicodeScripts, PreTokenizerWrapper, UnicodeScripts);
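
Adding the UnicodeScripts variant to PreTokenizerWrapper (together with the impl_enum_from! conversion) is what lets a tokenizer configured with this pre-tokenizer serialize and deserialize like the built-in ones. A hedged round-trip sketch from the Python side, where the empty BPE model is only a placeholder:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import UnicodeScripts

# Attach the pre-tokenizer, dump the tokenizer to its JSON form, and load
# it back; deserialization goes through the new wrapper variant.
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = UnicodeScripts()

blob = tokenizer.to_str()
restored = Tokenizer.from_str(blob)
assert restored.pre_tokenizer is not None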
tokenizers/src/pre_tokenizers/unicode_scripts/mod.rs (new file, 5 lines)
@@ -0,0 +1,5 @@
+mod pre_tokenizer;
+mod scripts;
+
+// Re-export the PreTokenizer
+pub use pre_tokenizer::UnicodeScripts;
@@ -79,6 +79,7 @@ impl PreTokenizer for UnicodeScripts {
 mod tests {
     use super::*;
     use crate::OffsetReferential;
+    use crate::OffsetType;
 
     #[test]
     fn basic() {
@@ -87,7 +88,7 @@ mod tests {
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
             pretokenized
-                .get_splits(OffsetReferential::Normalized)
+                .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
                 .into_iter()
                 .map(|(s, o, _)| (s, o))
                 .collect::<Vec<_>>(),
@@ -95,7 +96,7 @@ mod tests {
         );
         assert_eq!(
             pretokenized
-                .get_splits(OffsetReferential::Original)
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                 .into_iter()
                 .map(|(s, o, _)| (s, o))
                 .collect::<Vec<_>>(),
@@ -110,7 +111,7 @@ mod tests {
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
             pretokenized
-                .get_splits(OffsetReferential::Normalized)
+                .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
                 .into_iter()
                 .map(|(s, o, _)| (s, o))
                 .collect::<Vec<_>>(),
@@ -118,7 +119,7 @@ mod tests {
         );
         assert_eq!(
             pretokenized
-                .get_splits(OffsetReferential::Original)
+                .get_splits(OffsetReferential::Original, OffsetType::Byte)
                 .into_iter()
                 .map(|(s, o, _)| (s, o))
                 .collect::<Vec<_>>(),