Python - PreTokenizers can get/set their attributes

Author: Anthony MOI
Date: 2020-11-24 13:55:59 -05:00
Committed by: Anthony MOI
parent 5c35fafc44
commit 3eb7ef6d0a
6 changed files with 134 additions and 5 deletions
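
In short, the Python pre-tokenizer wrappers now expose their configuration as read/write attributes. The sketch below mirrors the new tests included in this commit and assumes a build of the `tokenizers` Python package that contains this change:

```python
from tokenizers.pre_tokenizers import ByteLevel, CharDelimiterSplit, Digits, Metaspace

# Each attribute is now a read/write property on the Python object.
byte_level = ByteLevel(add_prefix_space=False)
byte_level.add_prefix_space = True

metaspace = Metaspace(replacement="$", add_prefix_space=False)
metaspace.replacement = "%"
metaspace.add_prefix_space = True

delimiter_split = CharDelimiterSplit("@")
delimiter_split.delimiter = "!"

digits = Digits(individual_digits=False)
digits.individual_digits = True
```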


@@ -115,7 +115,7 @@ macro_rules! getter {
        } else {
            unreachable!()
        }
    }}
    }};
}

macro_rules! setter {


@@ -179,6 +179,45 @@ impl PyPreTokenizer {
    }
}

macro_rules! getter {
    ($self: ident, $variant: ident, $($name: tt)+) => {{
        let super_ = $self.as_ref();
        if let PyPreTokenizerTypeWrapper::Single(ref single) = super_.pretok {
            if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref pretok)) =
                *single.read().unwrap() {
                pretok.$($name)+
            } else {
                unreachable!()
            }
        } else {
            unreachable!()
        }
    }};
}

macro_rules! setter {
    ($self: ident, $variant: ident, $name: ident, $value: expr) => {{
        let super_ = $self.as_ref();
        if let PyPreTokenizerTypeWrapper::Single(ref single) = super_.pretok {
            if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref mut pretok)) =
                *single.write().unwrap()
            {
                pretok.$name = $value;
            }
        }
    }};
    ($self: ident, $variant: ident, @$name: ident, $value: expr) => {{
        let super_ = $self.as_ref();
        if let PyPreTokenizerTypeWrapper::Single(ref single) = super_.pretok {
            if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref mut pretok)) =
                *single.write().unwrap()
            {
                pretok.$name($value);
            }
        }
    }};
}
/// ByteLevel PreTokenizer
///
/// This pre-tokenizer takes care of replacing all bytes of the given string
@@ -193,6 +232,16 @@ impl PyPreTokenizer {
pub struct PyByteLevel {}

#[pymethods]
impl PyByteLevel {
    #[getter]
    fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
        getter!(self_, ByteLevel, add_prefix_space)
    }

    #[setter]
    fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
        setter!(self_, ByteLevel, add_prefix_space, add_prefix_space);
    }

    #[new]
    #[args(add_prefix_space = "true")]
    fn new(add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
@@ -297,6 +346,16 @@ impl PySplit {
pub struct PyCharDelimiterSplit {}

#[pymethods]
impl PyCharDelimiterSplit {
    #[getter]
    fn get_delimiter(self_: PyRef<Self>) -> String {
        getter!(self_, Delimiter, delimiter.to_string())
    }

    #[setter]
    fn set_delimiter(self_: PyRef<Self>, delimiter: PyChar) {
        setter!(self_, Delimiter, delimiter, delimiter.0);
    }

    #[new]
    pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
        Ok((
@@ -384,6 +443,26 @@ impl PySequence {
pub struct PyMetaspace {}

#[pymethods]
impl PyMetaspace {
    #[getter]
    fn get_replacement(self_: PyRef<Self>) -> String {
        getter!(self_, Metaspace, get_replacement().to_string())
    }

    #[setter]
    fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
        setter!(self_, Metaspace, @set_replacement, replacement.0);
    }

    #[getter]
    fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
        getter!(self_, Metaspace, add_prefix_space)
    }

    #[setter]
    fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
        setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
    }

    #[new]
    #[args(replacement = "PyChar('▁')", add_prefix_space = "true")]
    fn new(replacement: PyChar, add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
@@ -410,6 +489,16 @@ impl PyMetaspace {
pub struct PyDigits {}

#[pymethods]
impl PyDigits {
    #[getter]
    fn get_individual_digits(self_: PyRef<Self>) -> bool {
        getter!(self_, Digits, individual_digits)
    }

    #[setter]
    fn set_individual_digits(self_: PyRef<Self>, individual_digits: bool) {
        setter!(self_, Digits, individual_digits, individual_digits);
    }

    #[new]
    #[args(individual_digits = false)]
    fn new(individual_digits: bool) -> PyResult<(Self, PyPreTokenizer)> {


@@ -30,6 +30,15 @@ class TestByteLevel:
        assert isinstance(ByteLevel.alphabet(), list)
        assert len(ByteLevel.alphabet()) == 256

    def test_can_modify(self):
        pretok = ByteLevel(add_prefix_space=False)
        assert pretok.add_prefix_space == False

        # Modify these
        pretok.add_prefix_space = True
        assert pretok.add_prefix_space == True


class TestSplit:
    def test_instantiate(self):
@@ -82,6 +91,18 @@ class TestMetaspace:
        assert isinstance(Metaspace(), Metaspace)
        assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)

    def test_can_modify(self):
        pretok = Metaspace(replacement="$", add_prefix_space=False)
        assert pretok.replacement == "$"
        assert pretok.add_prefix_space == False

        # Modify these
        pretok.replacement = "%"
        assert pretok.replacement == "%"
        pretok.add_prefix_space = True
        assert pretok.add_prefix_space == True


class TestCharDelimiterSplit:
    def test_instantiate(self):
@@ -92,6 +113,14 @@ class TestCharDelimiterSplit:
        assert isinstance(CharDelimiterSplit(" "), CharDelimiterSplit)
        assert isinstance(pickle.loads(pickle.dumps(CharDelimiterSplit("-"))), CharDelimiterSplit)

    def test_can_modify(self):
        pretok = CharDelimiterSplit("@")
        assert pretok.delimiter == "@"

        # Modify these
        pretok.delimiter = "!"
        assert pretok.delimiter == "!"


class TestPunctuation:
    def test_instantiate(self):
@@ -138,6 +167,14 @@ class TestDigits:
        assert isinstance(Digits(False), Digits)
        assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)

    def test_can_modify(self):
        pretok = Digits(individual_digits=False)
        assert pretok.individual_digits == False

        # Modify these
        pretok.individual_digits = True
        assert pretok.individual_digits == True


class TestUnicodeScripts:
    def test_instantiate(self):


@@ -45,12 +45,13 @@ lazy_static! {
/// of all the required processing steps to transform a UTF-8 string as needed before and after the
/// BPE model does its job.
#[serde(tag = "type")]
#[non_exhaustive]
pub struct ByteLevel {
    /// Whether to add a leading space to the first word. This allows to treat the leading word
    /// just as any other word.
    add_prefix_space: bool,
    pub add_prefix_space: bool,
    /// Whether the post processing step should trim offsets to avoid including whitespaces.
    trim_offsets: bool,
    pub trim_offsets: bool,
}

impl Default for ByteLevel {
    fn default() -> Self {
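
The `add_prefix_space` field made `pub` above is what the new Python property reads and writes. A minimal sketch of its visible effect, assuming the bindings' `pre_tokenize_str` helper (not part of this diff) is available:

```python
from tokenizers.pre_tokenizers import ByteLevel

byte_level = ByteLevel(add_prefix_space=True)
# With a prefix space, the first word carries the byte-level space marker ("Ġ").
print(byte_level.pre_tokenize_str("hello world"))

byte_level.add_prefix_space = False
# Without it, the first word is left as-is.
print(byte_level.pre_tokenize_str("hello world"))
```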


@@ -4,8 +4,9 @@ use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterB
#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type")]
#[non_exhaustive]
pub struct CharDelimiterSplit {
    delimiter: char,
    pub delimiter: char,
}

impl CharDelimiterSplit {


@@ -6,8 +6,9 @@ use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterB
/// Pre tokenizes the numbers into single tokens. If individual_digits is set
/// to true, then all digits are splitted into individual tokens.
#[serde(tag = "type")]
#[non_exhaustive]
pub struct Digits {
    individual_digits: bool,
    pub individual_digits: bool,
}

impl Digits {
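
The doc comment above describes the `individual_digits` switch; a short sketch of the difference on the Python side, again assuming `pre_tokenize_str` is available in the bindings:

```python
from tokenizers.pre_tokenizers import Digits

digits = Digits(individual_digits=False)
# Digits are split away from other characters but kept together ("911" stays one piece).
print(digits.pre_tokenize_str("Call 911"))

digits.individual_digits = True
# Now each digit should become its own piece ("9", "1", "1").
print(digits.pre_tokenize_str("Call 911"))
```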