Python - PreTokenizers can get/set their attributes

This commit is contained in:
Anthony MOI
2020-11-24 13:55:59 -05:00
committed by Anthony MOI
parent 5c35fafc44
commit 3eb7ef6d0a
6 changed files with 134 additions and 5 deletions

View File

@@ -115,7 +115,7 @@ macro_rules! getter {
} else {
unreachable!()
}
}}
}};
}
macro_rules! setter {

View File

@@ -179,6 +179,45 @@ impl PyPreTokenizer {
}
}
/// Reads a value out of the concrete pre-tokenizer stored behind `$self`.
///
/// Arguments:
/// * `$self` - identifier of a value whose `as_ref()` yields something with a
///   `pretok` field of type `PyPreTokenizerTypeWrapper`.
/// * `$variant` - name of the `PreTokenizerWrapper` variant expected inside.
/// * `$name` - one or more tokens appended after `pretok.`; this can be a plain
///   field (`add_prefix_space`) or a call chain (`get_replacement().to_string()`).
///
/// The expansion evaluates to whatever `pretok.<$name tokens>` evaluates to.
///
/// Panics (through `unreachable!()`) when `pretok` is not `Single`, or when the
/// wrapped value is not `Wrapped(PreTokenizerWrapper::$variant(..))`. The result
/// of `read()` is unwrapped, so an `Err` from it panics as well.
macro_rules! getter {
    ($self: ident, $variant: ident, $($name: tt)+) => {{
        let base = $self.as_ref();
        match base.pretok {
            PyPreTokenizerTypeWrapper::Single(ref inner) => match *inner.read().unwrap() {
                PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref pretok)) => {
                    pretok.$($name)+
                }
                // Any other wrapped value is treated as impossible here.
                _ => unreachable!(),
            },
            // Anything other than `Single` is treated as impossible here.
            _ => unreachable!(),
        }
    }};
}
/// Writes a value into the concrete pre-tokenizer stored behind `$self`.
///
/// Two forms are accepted:
/// * `setter!(self_, Variant, field, value)` assigns `value` to `pretok.field`.
/// * `setter!(self_, Variant, @method, value)` calls `pretok.method(value)`.
///
/// `$self` must be an identifier whose `as_ref()` yields something with a
/// `pretok` field; `$variant` names the expected `PreTokenizerWrapper` variant.
///
/// Unlike `getter!`, a mismatch (not `Single`, or not
/// `Wrapped(PreTokenizerWrapper::$variant(..))`) is silently ignored: nothing is
/// written and nothing panics. The result of `write()` is unwrapped, so an `Err`
/// from it panics.
macro_rules! setter {
    // Field form: direct assignment.
    ($self: ident, $variant: ident, $name: ident, $value: expr) => {{
        let base = $self.as_ref();
        if let PyPreTokenizerTypeWrapper::Single(ref inner) = base.pretok {
            let mut guard = inner.write().unwrap();
            if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref mut target)) =
                *guard
            {
                target.$name = $value;
            }
        }
    }};
    // Method form (`@name`): forwards the value to a setter method.
    ($self: ident, $variant: ident, @$name: ident, $value: expr) => {{
        let base = $self.as_ref();
        if let PyPreTokenizerTypeWrapper::Single(ref inner) = base.pretok {
            let mut guard = inner.write().unwrap();
            if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref mut target)) =
                *guard
            {
                target.$name($value);
            }
        }
    }};
}
/// ByteLevel PreTokenizer
///
/// This pre-tokenizer takes care of replacing all bytes of the given string
@@ -193,6 +232,16 @@ impl PyPreTokenizer {
pub struct PyByteLevel {}
#[pymethods]
impl PyByteLevel {
/// Current value of the `add_prefix_space` flag of this ByteLevel pre-tokenizer.
#[getter]
fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
// `getter!` reads the `add_prefix_space` field of the wrapped `ByteLevel`
// variant; it hits `unreachable!()` if a different variant is stored.
getter!(self_, ByteLevel, add_prefix_space)
}
/// Replaces the `add_prefix_space` flag of this ByteLevel pre-tokenizer.
#[setter]
fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
// `setter!` assigns the field on the wrapped `ByteLevel` variant and does
// nothing at all if a different variant is stored.
setter!(self_, ByteLevel, add_prefix_space, add_prefix_space);
}
#[new]
#[args(add_prefix_space = "true")]
fn new(add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
@@ -297,6 +346,16 @@ impl PySplit {
pub struct PyCharDelimiterSplit {}
#[pymethods]
impl PyCharDelimiterSplit {
/// Current delimiter of this CharDelimiterSplit pre-tokenizer, as a string.
#[getter]
fn get_delimiter(self_: PyRef<Self>) -> String {
// Reads the `delimiter` field of the wrapped `Delimiter` variant and converts
// it with `to_string()`; `getter!` hits `unreachable!()` on any other variant.
getter!(self_, Delimiter, delimiter.to_string())
}
/// Replaces the delimiter of this CharDelimiterSplit pre-tokenizer.
#[setter]
fn set_delimiter(self_: PyRef<Self>, delimiter: PyChar) {
// Stores the first field of the `PyChar` argument (`delimiter.0`) into the
// `delimiter` field of the wrapped `Delimiter` variant; `setter!` does nothing
// if a different variant is stored.
setter!(self_, Delimiter, delimiter, delimiter.0);
}
#[new]
pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
Ok((
@@ -384,6 +443,26 @@ impl PySequence {
pub struct PyMetaspace {}
#[pymethods]
impl PyMetaspace {
/// Current replacement of this Metaspace pre-tokenizer, as a string.
#[getter]
fn get_replacement(self_: PyRef<Self>) -> String {
// Goes through the `get_replacement()` accessor of the wrapped `Metaspace`
// variant (not a direct field read) and converts the result with `to_string()`.
getter!(self_, Metaspace, get_replacement().to_string())
}
/// Replaces the replacement character of this Metaspace pre-tokenizer.
#[setter]
fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
// The `@` form of `setter!` calls `set_replacement(replacement.0)` on the
// wrapped `Metaspace` variant instead of assigning a field; nothing happens if
// a different variant is stored.
setter!(self_, Metaspace, @set_replacement, replacement.0);
}
/// Current value of the `add_prefix_space` flag of this Metaspace pre-tokenizer.
#[getter]
fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
// `getter!` reads the `add_prefix_space` field of the wrapped `Metaspace`
// variant; it hits `unreachable!()` if a different variant is stored.
getter!(self_, Metaspace, add_prefix_space)
}
/// Replaces the `add_prefix_space` flag of this Metaspace pre-tokenizer.
#[setter]
fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
// `setter!` assigns the field on the wrapped `Metaspace` variant and does
// nothing at all if a different variant is stored.
setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
}
#[new]
#[args(replacement = "PyChar('▁')", add_prefix_space = "true")]
fn new(replacement: PyChar, add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
@@ -410,6 +489,16 @@ impl PyMetaspace {
pub struct PyDigits {}
#[pymethods]
impl PyDigits {
/// Current value of the `individual_digits` flag of this Digits pre-tokenizer.
#[getter]
fn get_individual_digits(self_: PyRef<Self>) -> bool {
// `getter!` reads the `individual_digits` field of the wrapped `Digits`
// variant; it hits `unreachable!()` if a different variant is stored.
getter!(self_, Digits, individual_digits)
}
/// Replaces the `individual_digits` flag of this Digits pre-tokenizer.
#[setter]
fn set_individual_digits(self_: PyRef<Self>, individual_digits: bool) {
// `setter!` assigns the field on the wrapped `Digits` variant and does nothing
// at all if a different variant is stored.
setter!(self_, Digits, individual_digits, individual_digits);
}
#[new]
#[args(individual_digits = false)]
fn new(individual_digits: bool) -> PyResult<(Self, PyPreTokenizer)> {

View File

@@ -30,6 +30,15 @@ class TestByteLevel:
assert isinstance(ByteLevel.alphabet(), list)
assert len(ByteLevel.alphabet()) == 256
def test_can_modify(self):
    """`add_prefix_space` can be read back and reassigned on a ByteLevel instance."""
    pretok = ByteLevel(add_prefix_space=False)

    # Identity checks (rather than `== False`) also pin the attribute to a
    # real `bool`, not merely a falsy/truthy value.
    assert pretok.add_prefix_space is False

    # Modify these
    pretok.add_prefix_space = True
    assert pretok.add_prefix_space is True
class TestSplit:
def test_instantiate(self):
@@ -82,6 +91,18 @@ class TestMetaspace:
assert isinstance(Metaspace(), Metaspace)
assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
def test_can_modify(self):
    """`replacement` and `add_prefix_space` can be read back and reassigned on a Metaspace instance."""
    pretok = Metaspace(replacement="$", add_prefix_space=False)

    assert pretok.replacement == "$"
    # Identity checks (rather than `== False`) also pin the attribute to a
    # real `bool`, not merely a falsy/truthy value.
    assert pretok.add_prefix_space is False

    # Modify these
    pretok.replacement = "%"
    assert pretok.replacement == "%"
    pretok.add_prefix_space = True
    assert pretok.add_prefix_space is True
class TestCharDelimiterSplit:
def test_instantiate(self):
@@ -92,6 +113,14 @@ class TestCharDelimiterSplit:
assert isinstance(CharDelimiterSplit(" "), CharDelimiterSplit)
assert isinstance(pickle.loads(pickle.dumps(CharDelimiterSplit("-"))), CharDelimiterSplit)
def test_can_modify(self):
    """`delimiter` can be read back and reassigned on a CharDelimiterSplit instance."""
    splitter = CharDelimiterSplit("@")
    assert splitter.delimiter == "@"

    # Modify these
    splitter.delimiter = "!"
    assert splitter.delimiter == "!"
class TestPunctuation:
def test_instantiate(self):
@@ -138,6 +167,14 @@ class TestDigits:
assert isinstance(Digits(False), Digits)
assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
def test_can_modify(self):
    """`individual_digits` can be read back and reassigned on a Digits instance."""
    pretok = Digits(individual_digits=False)

    # Identity checks (rather than `== False`) also pin the attribute to a
    # real `bool`, not merely a falsy/truthy value.
    assert pretok.individual_digits is False

    # Modify these
    pretok.individual_digits = True
    assert pretok.individual_digits is True
class TestUnicodeScripts:
def test_instantiate(self):