mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-16 17:18:43 +00:00
Python - PreTokenizers can get/set their attributes
This commit is contained in:
@@ -115,7 +115,7 @@ macro_rules! getter {
|
||||
} else {
|
||||
unreachable!()
|
||||
}
|
||||
}}
|
||||
}};
|
||||
}
|
||||
|
||||
macro_rules! setter {
|
||||
|
||||
@@ -179,6 +179,45 @@ impl PyPreTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads a value out of the concrete pre-tokenizer wrapped by a Python subclass instance.
///
/// * `$self` - identifier of a value whose `as_ref()` yields the base object that owns the
///   `pretok` field.
/// * `$variant` - the `PreTokenizerWrapper` variant the instance is expected to wrap.
/// * `$($name)+` - the tokens placed after `pretok.`: either a field name or a method-call
///   chain such as `get_replacement().to_string()`.
///
/// The macro evaluates to whatever `pretok.$($name)+` evaluates to; the value is produced
/// while the read lock is held.
///
/// # Panics
///
/// Panics when the instance does not hold a `Single`, `Wrapped` pre-tokenizer of the
/// requested variant, or when the lock has been poisoned.
macro_rules! getter {
    ($self: ident, $variant: ident, $($name: tt)+) => {{
        let super_ = $self.as_ref();
        match super_.pretok {
            PyPreTokenizerTypeWrapper::Single(ref single) => {
                match *single.read().expect("pre-tokenizer lock poisoned") {
                    PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref pretok)) => {
                        pretok.$($name)+
                    }
                    // A typed Python subclass is expected to always wrap its own variant.
                    _ => unreachable!(
                        "getter!: instance does not wrap a `{}` pre-tokenizer",
                        stringify!($variant)
                    ),
                }
            }
            _ => unreachable!(
                "getter!: `{}` instance does not hold a single pre-tokenizer",
                stringify!($variant)
            ),
        }
    }};
}
|
||||
|
||||
macro_rules! setter {
|
||||
($self: ident, $variant: ident, $name: ident, $value: expr) => {{
|
||||
let super_ = $self.as_ref();
|
||||
if let PyPreTokenizerTypeWrapper::Single(ref single) = super_.pretok {
|
||||
if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref mut pretok)) =
|
||||
*single.write().unwrap()
|
||||
{
|
||||
pretok.$name = $value;
|
||||
}
|
||||
}
|
||||
}};
|
||||
($self: ident, $variant: ident, @$name: ident, $value: expr) => {{
|
||||
let super_ = $self.as_ref();
|
||||
if let PyPreTokenizerTypeWrapper::Single(ref single) = super_.pretok {
|
||||
if let PyPreTokenizerWrapper::Wrapped(PreTokenizerWrapper::$variant(ref mut pretok)) =
|
||||
*single.write().unwrap()
|
||||
{
|
||||
pretok.$name($value);
|
||||
}
|
||||
}
|
||||
}};
|
||||
}
|
||||
|
||||
/// ByteLevel PreTokenizer
|
||||
///
|
||||
/// This pre-tokenizer takes care of replacing all bytes of the given string
|
||||
@@ -193,6 +232,16 @@ impl PyPreTokenizer {
|
||||
pub struct PyByteLevel {}
|
||||
#[pymethods]
|
||||
impl PyByteLevel {
|
||||
#[getter]
|
||||
fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
|
||||
getter!(self_, ByteLevel, add_prefix_space)
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
|
||||
setter!(self_, ByteLevel, add_prefix_space, add_prefix_space);
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[args(add_prefix_space = "true")]
|
||||
fn new(add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
@@ -297,6 +346,16 @@ impl PySplit {
|
||||
pub struct PyCharDelimiterSplit {}
|
||||
#[pymethods]
|
||||
impl PyCharDelimiterSplit {
|
||||
#[getter]
|
||||
fn get_delimiter(self_: PyRef<Self>) -> String {
|
||||
getter!(self_, Delimiter, delimiter.to_string())
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_delimiter(self_: PyRef<Self>, delimiter: PyChar) {
|
||||
setter!(self_, Delimiter, delimiter, delimiter.0);
|
||||
}
|
||||
|
||||
#[new]
|
||||
pub fn new(delimiter: PyChar) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
Ok((
|
||||
@@ -384,6 +443,26 @@ impl PySequence {
|
||||
pub struct PyMetaspace {}
|
||||
#[pymethods]
|
||||
impl PyMetaspace {
|
||||
#[getter]
|
||||
fn get_replacement(self_: PyRef<Self>) -> String {
|
||||
getter!(self_, Metaspace, get_replacement().to_string())
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_replacement(self_: PyRef<Self>, replacement: PyChar) {
|
||||
setter!(self_, Metaspace, @set_replacement, replacement.0);
|
||||
}
|
||||
|
||||
#[getter]
|
||||
fn get_add_prefix_space(self_: PyRef<Self>) -> bool {
|
||||
getter!(self_, Metaspace, add_prefix_space)
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_add_prefix_space(self_: PyRef<Self>, add_prefix_space: bool) {
|
||||
setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[args(replacement = "PyChar('▁')", add_prefix_space = "true")]
|
||||
fn new(replacement: PyChar, add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
@@ -410,6 +489,16 @@ impl PyMetaspace {
|
||||
pub struct PyDigits {}
|
||||
#[pymethods]
|
||||
impl PyDigits {
|
||||
#[getter]
|
||||
fn get_individual_digits(self_: PyRef<Self>) -> bool {
|
||||
getter!(self_, Digits, individual_digits)
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_individual_digits(self_: PyRef<Self>, individual_digits: bool) {
|
||||
setter!(self_, Digits, individual_digits, individual_digits);
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[args(individual_digits = false)]
|
||||
fn new(individual_digits: bool) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
|
||||
@@ -30,6 +30,15 @@ class TestByteLevel:
|
||||
assert isinstance(ByteLevel.alphabet(), list)
|
||||
assert len(ByteLevel.alphabet()) == 256
|
||||
|
||||
def test_can_modify(self):
    """The `add_prefix_space` attribute can be read back and reassigned."""
    byte_level = ByteLevel(add_prefix_space=False)
    assert byte_level.add_prefix_space == False

    # Reassign the attribute and check that the new value is visible
    byte_level.add_prefix_space = True
    assert byte_level.add_prefix_space == True
|
||||
|
||||
|
||||
class TestSplit:
|
||||
def test_instantiate(self):
|
||||
@@ -82,6 +91,18 @@ class TestMetaspace:
|
||||
assert isinstance(Metaspace(), Metaspace)
|
||||
assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
|
||||
|
||||
def test_can_modify(self):
    """`replacement` and `add_prefix_space` can be read back and reassigned."""
    metaspace = Metaspace(replacement="$", add_prefix_space=False)

    # Values given to the constructor are exposed as attributes
    assert metaspace.replacement == "$"
    assert metaspace.add_prefix_space == False

    # Reassign each attribute and check that the new value is visible
    metaspace.replacement = "%"
    assert metaspace.replacement == "%"
    metaspace.add_prefix_space = True
    assert metaspace.add_prefix_space == True
|
||||
|
||||
|
||||
class TestCharDelimiterSplit:
|
||||
def test_instantiate(self):
|
||||
@@ -92,6 +113,14 @@ class TestCharDelimiterSplit:
|
||||
assert isinstance(CharDelimiterSplit(" "), CharDelimiterSplit)
|
||||
assert isinstance(pickle.loads(pickle.dumps(CharDelimiterSplit("-"))), CharDelimiterSplit)
|
||||
|
||||
def test_can_modify(self):
    """The `delimiter` attribute can be read back and reassigned."""
    splitter = CharDelimiterSplit("@")
    assert splitter.delimiter == "@"

    # Reassign the attribute and check that the new value is visible
    splitter.delimiter = "!"
    assert splitter.delimiter == "!"
|
||||
|
||||
|
||||
class TestPunctuation:
|
||||
def test_instantiate(self):
|
||||
@@ -138,6 +167,14 @@ class TestDigits:
|
||||
assert isinstance(Digits(False), Digits)
|
||||
assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
|
||||
|
||||
def test_can_modify(self):
    """The `individual_digits` attribute can be read back and reassigned."""
    digits = Digits(individual_digits=False)
    assert digits.individual_digits == False

    # Reassign the attribute and check that the new value is visible
    digits.individual_digits = True
    assert digits.individual_digits == True
|
||||
|
||||
|
||||
class TestUnicodeScripts:
|
||||
def test_instantiate(self):
|
||||
|
||||
@@ -45,12 +45,13 @@ lazy_static! {
|
||||
/// of all the required processing steps to transform a UTF-8 string as needed before and after the
|
||||
/// BPE model does its job.
|
||||
#[serde(tag = "type")]
|
||||
#[non_exhaustive]
|
||||
pub struct ByteLevel {
    /// Whether to add a leading space to the first word. This allows the leading word to be
    /// treated just like any other word.
    pub add_prefix_space: bool,
    /// Whether the post processing step should trim offsets to avoid including whitespaces.
    pub trim_offsets: bool,
}
|
||||
impl Default for ByteLevel {
|
||||
fn default() -> Self {
|
||||
|
||||
@@ -4,8 +4,9 @@ use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterB
|
||||
|
||||
#[derive(Copy, Clone, Debug, Deserialize, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
#[non_exhaustive]
|
||||
pub struct CharDelimiterSplit {
    /// The delimiter character configured for this pre-tokenizer.
    pub delimiter: char,
}
|
||||
|
||||
impl CharDelimiterSplit {
|
||||
|
||||
@@ -6,8 +6,9 @@ use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterB
|
||||
/// Pre tokenizes the numbers into single tokens. If individual_digits is set
|
||||
/// to true, then all digits are splitted into individual tokens.
|
||||
#[serde(tag = "type")]
|
||||
#[non_exhaustive]
|
||||
pub struct Digits {
    /// When `true`, all digits are split into individual tokens.
    pub individual_digits: bool,
}
|
||||
|
||||
impl Digits {
|
||||
|
||||
Reference in New Issue
Block a user