Finish exposing the UnicodeScripts PreTokenizer

Anthony MOI
2020-10-20 16:38:28 -04:00
committed by Anthony MOI
parent 25e74b5400
commit a2289d49b4
8 changed files with 53 additions and 4 deletions


@@ -10,3 +10,4 @@ BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace
CharDelimiterSplit = pre_tokenizers.CharDelimiterSplit
Digits = pre_tokenizers.Digits
UnicodeScripts = pre_tokenizers.UnicodeScripts
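With this export in place, UnicodeScripts can be used like any other pre-tokenizer in this module, including inside a Sequence. A small sketch (illustrative only, assuming the bindings built from this commit; the Sequence/WhitespaceSplit combination is just an example pipeline, not a recommendation):

from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import BPE

# An empty model is enough for this sketch.
tokenizer = Tokenizer(BPE())

# Split on script boundaries first, then on whitespace.
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.UnicodeScripts(), pre_tokenizers.WhitespaceSplit()]
)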


@@ -148,3 +148,16 @@ class Digits(PreTokenizer):
        """
        pass

class UnicodeScripts(PreTokenizer):
    """UnicodeScripts PreTokenizer

    This pre-tokenizer splits on characters that belong to different language families.
    It roughly follows https://github.com/google/sentencepiece/blob/master/data/Scripts.txt
    In practice, Hiragana and Katakana are fused with Han, and 0x30FC is treated as Han too.
    This mimics the SentencePiece Unigram implementation.
    """

    def __init__(self) -> None:
        """ Instantiate a new UnicodeScripts """
        pass
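As a quick illustration of the behavior described in the docstring, a usage sketch (the input and the resulting grouping are illustrative; this assumes a build of the bindings where PreTokenizer.pre_tokenize_str is available):

from tokenizers import pre_tokenizers

pretok = pre_tokenizers.UnicodeScripts()

# pre_tokenize_str returns (piece, offsets) pairs; characters from different
# scripts end up in different pieces, with Hiragana/Katakana grouped with Han.
for piece, offsets in pretok.pre_tokenize_str("Apples are りんご 林檎"):
    print(piece, offsets)
# Roughly: the Latin part ("Apples are ") in one piece, the Japanese part in another.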


@@ -72,6 +72,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<pre_tokenizers::PyPunctuation>()?;
m.add_class::<pre_tokenizers::PySequence>()?;
m.add_class::<pre_tokenizers::PyDigits>()?;
m.add_class::<pre_tokenizers::PyUnicodeScripts>()?;
Ok(())
}


@@ -12,6 +12,7 @@ use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
use tk::pre_tokenizers::digits::Digits;
use tk::pre_tokenizers::metaspace::Metaspace;
use tk::pre_tokenizers::punctuation::Punctuation;
use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
use tk::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
use tk::pre_tokenizers::PreTokenizerWrapper;
use tk::tokenizer::Offsets;
@@ -70,6 +71,9 @@ impl PyPreTokenizer {
Py::new(py, (PyBertPreTokenizer {}, base))?.into_py(py)
}
PreTokenizerWrapper::Digits(_) => Py::new(py, (PyDigits {}, base))?.into_py(py),
PreTokenizerWrapper::UnicodeScripts(_) => {
Py::new(py, (PyUnicodeScripts {}, base))?.into_py(py)
}
},
},
})
@@ -297,6 +301,16 @@ impl PyDigits {
}
}
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=UnicodeScripts)]
pub struct PyUnicodeScripts {}

#[pymethods]
impl PyUnicodeScripts {
    #[new]
    fn new() -> PyResult<(Self, PyPreTokenizer)> {
        Ok((PyUnicodeScripts {}, UnicodeScripts::new().into()))
    }
}
#[derive(Clone)]
pub(crate) struct CustomPreTokenizer {
    inner: PyObject,


@@ -12,6 +12,7 @@ from tokenizers.pre_tokenizers import (
    Punctuation,
    Sequence,
    Digits,
    UnicodeScripts,
)
@@ -121,6 +122,14 @@ class TestDigits:
        assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)

class TestUnicodeScripts:
    def test_instantiate(self):
        assert UnicodeScripts() is not None
        assert isinstance(UnicodeScripts(), PreTokenizer)
        assert isinstance(UnicodeScripts(), UnicodeScripts)
        assert isinstance(pickle.loads(pickle.dumps(UnicodeScripts())), UnicodeScripts)

class TestCustomPreTokenizer:
    class BadCustomPretok:
        def pre_tokenize(self, pretok, wrong):
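The TestUnicodeScripts class above only checks instantiation and pickling. A behavioral check could look like the following sketch (a hypothetical test, not part of this commit; it assumes pre_tokenize_str is available and only asserts that Latin and Han characters never share a piece):

def test_unicode_scripts_splits_on_script_boundaries():
    pretok = UnicodeScripts()
    pieces = [piece for piece, _ in pretok.pre_tokenize_str("Hello 你好")]
    # Latin and Han text should never land in the same piece.
    assert not any("H" in piece and "你" in piece for piece in pieces)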


@@ -5,6 +5,7 @@ pub mod digits;
pub mod metaspace;
pub mod punctuation;
pub mod sequence;
pub mod unicode_scripts;
pub mod whitespace;

use serde::{Deserialize, Serialize};
@@ -16,6 +17,7 @@ use crate::pre_tokenizers::digits::Digits;
use crate::pre_tokenizers::metaspace::Metaspace;
use crate::pre_tokenizers::punctuation::Punctuation;
use crate::pre_tokenizers::sequence::Sequence;
use crate::pre_tokenizers::unicode_scripts::UnicodeScripts;
use crate::pre_tokenizers::whitespace::{Whitespace, WhitespaceSplit};
use crate::{PreTokenizedString, PreTokenizer};
@@ -31,6 +33,7 @@ pub enum PreTokenizerWrapper {
Punctuation(Punctuation),
WhitespaceSplit(WhitespaceSplit),
Digits(Digits),
UnicodeScripts(UnicodeScripts),
}
impl PreTokenizer for PreTokenizerWrapper {
@@ -45,6 +48,7 @@ impl PreTokenizer for PreTokenizerWrapper {
PreTokenizerWrapper::Sequence(tok) => tok.pre_tokenize(normalized),
PreTokenizerWrapper::WhitespaceSplit(wspt) => wspt.pre_tokenize(normalized),
PreTokenizerWrapper::Digits(wspt) => wspt.pre_tokenize(normalized),
PreTokenizerWrapper::UnicodeScripts(us) => us.pre_tokenize(normalized),
}
}
}
@@ -58,3 +62,4 @@ impl_enum_from!(Sequence, PreTokenizerWrapper, Sequence);
impl_enum_from!(Metaspace, PreTokenizerWrapper, Metaspace);
impl_enum_from!(WhitespaceSplit, PreTokenizerWrapper, WhitespaceSplit);
impl_enum_from!(Digits, PreTokenizerWrapper, Digits);
impl_enum_from!(UnicodeScripts, PreTokenizerWrapper, UnicodeScripts);
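Registering the variant in PreTokenizerWrapper (and wiring up impl_enum_from) is also what lets UnicodeScripts round-trip through serialized tokenizers. A quick way to see this from Python (a sketch; the exact JSON layout is an assumption based on the wrapper being tagged by type):

import json
from tokenizers import Tokenizer, pre_tokenizers
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = pre_tokenizers.UnicodeScripts()

# The serialized tokenizer should carry the pre-tokenizer as a tagged entry,
# roughly {"type": "UnicodeScripts"}.
print(json.loads(tokenizer.to_str())["pre_tokenizer"])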


@@ -0,0 +1,5 @@
mod pre_tokenizer;
mod scripts;
// Re-export the PreTokenizer
pub use pre_tokenizer::UnicodeScripts;


@@ -79,6 +79,7 @@ impl PreTokenizer for UnicodeScripts {
mod tests {
use super::*;
use crate::OffsetReferential;
use crate::OffsetType;
#[test]
fn basic() {
@@ -87,7 +88,7 @@ mod tests {
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
@@ -95,7 +96,7 @@ mod tests {
);
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Original, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
@@ -110,7 +111,7 @@ mod tests {
pretok.pre_tokenize(&mut pretokenized).unwrap();
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Normalized, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),
@@ -118,7 +119,7 @@ mod tests {
);
assert_eq!(
pretokenized
.get_splits(OffsetReferential::Original, OffsetType::Byte)
.into_iter()
.map(|(s, o, _)| (s, o))
.collect::<Vec<_>>(),