mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-16 17:18:43 +00:00
Python - PreTokenizers can get/set their attributes
This commit is contained in:
@@ -115,7 +115,7 @@ macro_rules! getter {
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! setter {
|
||||
|
||||
@@ -179,6 +179,45 @@ impl PyPreTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads a value out of the pre-tokenizer wrapped by a Python-facing object.
///
/// `getter!(self_, Variant, accessor...)` expands to an expression that:
///
/// 1. calls `$self.as_ref()` to reach the object owning the `pretok` field,
/// 2. requires `pretok` to be `PyPreTokenizerTypeWrapper::Single`,
/// 3. takes the read lock and requires its content to be
///    `PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(..))`,
/// 4. evaluates `<inner>.accessor...`, where `accessor...` is any token
///    sequence valid after a `.` (a field name, or a chain such as
///    `delimiter.to_string()`).
///
/// The read lock is held while the accessor is evaluated, so the accessor
/// must produce an owned or `Copy` value.
///
/// # Panics
///
/// Panics if the lock is poisoned, or if the wrapped value is not a single
/// `$variant` (treated as a broken invariant of the calling type).
macro_rules! getter {
    ($self: ident, $variant: ident, $($name: tt)+) => {{
        let super_ = $self.as_ref();
        match super_.pretok {
            PyPreTokenizerTypeWrapper::Single(ref single) => {
                match *single.read().expect("pre-tokenizer lock poisoned") {
                    PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref inner)) => {
                        inner.$($name)+
                    }
                    _ => unreachable!(
                        "wrapped pre-tokenizer is not a `{}`",
                        stringify!($variant)
                    ),
                }
            }
            _ => unreachable!(
                "expected a single wrapped `{}` pre-tokenizer",
                stringify!($variant)
            ),
        }
    }};
}
|
||||
|
||||
/// Writes a value into the pre-tokenizer wrapped by a Python-facing object.
///
/// Two forms are accepted:
///
/// * `setter!(self_, Variant, field, value)` assigns `<inner>.field = value`;
/// * `setter!(self_, Variant, @method, value)` calls `<inner>.method(value)`.
///
/// In both forms the macro calls `$self.as_ref()`, and only when `pretok` is
/// `PyPreTokenizerTypeWrapper::Single` does it take the write lock. The
/// update is applied only if the locked content is
/// `PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(..))`.
/// Any other shape is silently left untouched, and `value` is then never
/// evaluated.
///
/// # Panics
///
/// Panics if the lock is poisoned.
macro_rules! setter {
    // Field form: plain assignment to a field of the wrapped pre-tokenizer.
    ($self: ident, $variant: ident, $name: ident, $value: expr) => {{
        let super_ = $self.as_ref();
        if let PyPreTokenizerTypeWrapper::Single(ref single) = super_.pretok {
            let mut guard = single.write().expect("pre-tokenizer lock poisoned");
            if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref mut inner)) =
                *guard
            {
                inner.$name = $value;
            }
        }
    }};
    // Method form (`@name`): goes through a setter method on the wrapped pre-tokenizer.
    ($self: ident, $variant: ident, @$name: ident, $value: expr) => {{
        let super_ = $self.as_ref();
        if let PyPreTokenizerTypeWrapper::Single(ref single) = super_.pretok {
            let mut guard = single.write().expect("pre-tokenizer lock poisoned");
            if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref mut inner)) =
                *guard
            {
                inner.$name($value);
            }
        }
    }};
}
|
||||
|
||||
/// ByteLevel PreTokenizer
|
||||
///
|
||||
/// This pre-tokenizer takes care of replacing all bytes of the given string
|
||||
@@ -193,6 +232,16 @@ impl PyPreTokenizer {
|
||||
pub struct PyByteLevel {}
|
||||
#[pymethods]
|
||||
impl PyByteLevel {
|
||||
/// Python-visible getter for the `add_prefix_space` attribute.
///
/// Expands `getter!` to read the `add_prefix_space` field of the wrapped
/// `PreTokenizerWrapper::ByteLevel`. The macro ends in `unreachable!()` (a
/// panic) when this object does not hold a single wrapped `ByteLevel`.
#[getter]
|
||||
fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
|
||||
getter!(self_, ByteLevel, add_prefix_space)
|
||||
}
|
||||
|
||||
/// Python-visible setter for the `add_prefix_space` attribute.
///
/// Expands `setter!` to assign the `add_prefix_space` field of the wrapped
/// `PreTokenizerWrapper::ByteLevel`. The macro has no `else` branch, so the
/// call does nothing when this object does not hold a single wrapped
/// `ByteLevel`.
#[setter]
|
||||
fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
|
||||
setter!(self_, ByteLevel, add_prefix_space, add_prefix_space);
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[args(add_prefix_space = "true")]
|
||||
fn new(add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
@@ -297,6 +346,16 @@ impl PySplit {
|
||||
pub struct PyCharDelimiterSplit {}
|
||||
#[pymethods]
|
||||
impl PyCharDelimiterSplit {
|
||||
/// Python-visible getter for the `delimiter` attribute.
///
/// Expands `getter!` to evaluate `delimiter.to_string()` on the wrapped
/// `PreTokenizerWrapper::Delimiter`, so the delimiter is returned as an owned
/// `String`. The macro ends in `unreachable!()` (a panic) when this object
/// does not hold a single wrapped `Delimiter`.
#[getter]
|
||||
fn get_delimiter(self_: PyRef<Self>) -> String {
|
||||
getter!(self_, Delimiter, delimiter.to_string())
|
||||
}
|
||||
|
||||
/// Python-visible setter for the `delimiter` attribute.
///
/// Expands `setter!` to assign `delimiter.0` (the value inside the `PyChar`
/// argument) to the `delimiter` field of the wrapped
/// `PreTokenizerWrapper::Delimiter`. The call does nothing when this object
/// does not hold a single wrapped `Delimiter`.
#[setter]
|
||||
fn set_delimiter(self_: PyRef<Self>, delimiter: PyChar) {
|
||||
setter!(self_, Delimiter, delimiter, delimiter.0);
|
||||
}
|
||||
|
||||
#[new]
|
||||
pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
Ok((
|
||||
@@ -384,6 +443,26 @@ impl PySequence {
|
||||
pub struct PyMetaspace {}
|
||||
#[pymethods]
|
||||
impl PyMetaspace {
|
||||
/// Python-visible getter for the `replacement` attribute.
///
/// Expands `getter!` to evaluate `get_replacement().to_string()` on the
/// wrapped `PreTokenizerWrapper::Metaspace`, returning the result as an owned
/// `String`. The macro ends in `unreachable!()` (a panic) when this object
/// does not hold a single wrapped `Metaspace`.
#[getter]
|
||||
fn get_replacement(self_: PyRef<Self>) -> String {
|
||||
getter!(self_, Metaspace, get_replacement().to_string())
|
||||
}
|
||||
|
||||
/// Python-visible setter for the `replacement` attribute.
///
/// Uses the `@` (method-call) form of `setter!`, which calls
/// `set_replacement(replacement.0)` on the wrapped
/// `PreTokenizerWrapper::Metaspace` instead of assigning a field. The call
/// does nothing when this object does not hold a single wrapped `Metaspace`.
#[setter]
|
||||
fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
|
||||
setter!(self_, Metaspace, @set_replacement, replacement.0);
|
||||
}
|
||||
|
||||
/// Python-visible getter for the `add_prefix_space` attribute.
///
/// Expands `getter!` to read the `add_prefix_space` field of the wrapped
/// `PreTokenizerWrapper::Metaspace`. The macro ends in `unreachable!()` (a
/// panic) when this object does not hold a single wrapped `Metaspace`.
#[getter]
|
||||
fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
|
||||
getter!(self_, Metaspace, add_prefix_space)
|
||||
}
|
||||
|
||||
/// Python-visible setter for the `add_prefix_space` attribute.
///
/// Expands `setter!` to assign the `add_prefix_space` field of the wrapped
/// `PreTokenizerWrapper::Metaspace`. The call does nothing when this object
/// does not hold a single wrapped `Metaspace`.
#[setter]
|
||||
fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
|
||||
setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[args(replacement = "PyChar('▁')", add_prefix_space = "true")]
|
||||
fn new(replacement: PyChar, add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
@@ -410,6 +489,16 @@ impl PyMetaspace {
|
||||
pub struct PyDigits {}
|
||||
#[pymethods]
|
||||
impl PyDigits {
|
||||
/// Python-visible getter for the `individual_digits` attribute.
///
/// Expands `getter!` to read the `individual_digits` field of the wrapped
/// `PreTokenizerWrapper::Digits`. The macro ends in `unreachable!()` (a
/// panic) when this object does not hold a single wrapped `Digits`.
#[getter]
|
||||
fn get_individual_digits(self_: PyRef<Self>) -> bool {
|
||||
getter!(self_, Digits, individual_digits)
|
||||
}
|
||||
|
||||
/// Python-visible setter for the `individual_digits` attribute.
///
/// Expands `setter!` to assign the `individual_digits` field of the wrapped
/// `PreTokenizerWrapper::Digits`. The call does nothing when this object does
/// not hold a single wrapped `Digits`.
#[setter]
|
||||
fn set_individual_digits(self_: PyRef<Self>, individual_digits: bool) {
|
||||
setter!(self_, Digits, individual_digits, individual_digits);
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[args(individual_digits = false)]
|
||||
fn new(individual_digits: bool) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
|
||||
@@ -30,6 +30,15 @@ class TestByteLevel:
|
||||
assert isinstance(ByteLevel.alphabet(), list)
|
||||
assert len(ByteLevel.alphabet()) == 256
|
||||
|
||||
def test_can_modify(self):
    """`add_prefix_space` can be read back and reassigned on a `ByteLevel`."""
    pretok = ByteLevel(add_prefix_space=False)

    # Identity checks instead of `== False` / `== True` (PEP 8, E712); they
    # also confirm the attribute comes back as a real bool.
    assert pretok.add_prefix_space is False

    # Modify these
    pretok.add_prefix_space = True
    assert pretok.add_prefix_space is True
|
||||
|
||||
|
||||
class TestSplit:
|
||||
def test_instantiate(self):
|
||||
@@ -82,6 +91,18 @@ class TestMetaspace:
|
||||
assert isinstance(Metaspace(), Metaspace)
|
||||
assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
|
||||
|
||||
def test_can_modify(self):
    """`replacement` and `add_prefix_space` can be read back and reassigned."""
    pretok = Metaspace(replacement="$", add_prefix_space=False)

    assert pretok.replacement == "$"
    # Identity checks instead of `== False` / `== True` (PEP 8, E712); they
    # also confirm the attribute comes back as a real bool.
    assert pretok.add_prefix_space is False

    # Modify these
    pretok.replacement = "%"
    assert pretok.replacement == "%"
    pretok.add_prefix_space = True
    assert pretok.add_prefix_space is True
|
||||
|
||||
|
||||
class TestCharDelimiterSplit:
|
||||
def test_instantiate(self):
|
||||
@@ -92,6 +113,14 @@ class TestCharDelimiterSplit:
|
||||
assert isinstance(CharDelimiterSplit(" "), CharDelimiterSplit)
|
||||
assert isinstance(pickle.loads(pickle.dumps(CharDelimiterSplit("-"))), CharDelimiterSplit)
|
||||
|
||||
def test_can_modify(self):
    """The `delimiter` attribute can be read back and reassigned."""
    splitter = CharDelimiterSplit("@")
    assert splitter.delimiter == "@"

    # Reassign the delimiter and read it back
    splitter.delimiter = "!"
    assert splitter.delimiter == "!"
|
||||
|
||||
|
||||
class TestPunctuation:
|
||||
def test_instantiate(self):
|
||||
@@ -138,6 +167,14 @@ class TestDigits:
|
||||
assert isinstance(Digits(False), Digits)
|
||||
assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
|
||||
|
||||
def test_can_modify(self):
    """`individual_digits` can be read back and reassigned on a `Digits`."""
    pretok = Digits(individual_digits=False)
    # Identity checks instead of `== False` / `== True` (PEP 8, E712); they
    # also confirm the attribute comes back as a real bool.
    assert pretok.individual_digits is False

    # Modify these
    pretok.individual_digits = True
    assert pretok.individual_digits is True
|
||||
|
||||
|
||||
class TestUnicodeScripts:
|
||||
def test_instantiate(self):
|
||||
|
||||
Reference in New Issue
Block a user