Implement __new__ for PreTokenizers
__new__ allows PreTokenizers to be instantiated through the Python constructor.
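In user code, this replaces the static `.new()` factories with ordinary constructors. A minimal before/after sketch (class and module names taken from the diff below):

    from tokenizers import pre_tokenizers

    # Before this commit: pre-tokenizers were built via a static factory
    #   pre_tok = pre_tokenizers.ByteLevel.new(add_prefix_space=True)

    # After this commit: #[new] lets the regular Python constructor do the job
    pre_tok = pre_tokenizers.ByteLevel(add_prefix_space=True)
    splitter = pre_tokenizers.Whitespace()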
@@ -26,13 +26,13 @@ impl PreTokenizer {
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct ByteLevel {}
 #[pymethods]
 impl ByteLevel {
-    #[staticmethod]
+    #[new]
     #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
+    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut add_prefix_space = true;
 
         if let Some(kwargs) = kwargs {
@@ -45,11 +45,11 @@ impl ByteLevel {
             }
         }
 
-        Ok(PreTokenizer {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::byte_level::ByteLevel::new(
                 add_prefix_space,
             ))),
-        })
+        }))
     }
 
     #[staticmethod]
@@ -61,66 +61,66 @@ impl ByteLevel {
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct Whitespace {}
 #[pymethods]
 impl Whitespace {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace)),
-        })
+        }))
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct WhitespaceSplit {}
 #[pymethods]
 impl WhitespaceSplit {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit)),
-        })
+        }))
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct CharDelimiterSplit {}
 #[pymethods]
 impl CharDelimiterSplit {
-    #[staticmethod]
-    pub fn new(delimiter: &str) -> PyResult<PreTokenizer> {
+    #[new]
+    pub fn new(obj: &PyRawObject, delimiter: &str) -> PyResult<()> {
         let chr_delimiter = delimiter.chars().nth(0).ok_or(exceptions::Exception::py_err(
             "delimiter must be a single character",
         ))?;
-        Ok(PreTokenizer{
+        Ok(obj.init(PreTokenizer{
             pretok:Container::Owned(Box::new(
                 tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(chr_delimiter)
             ))
-        })
+        }))
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct BertPreTokenizer {}
 #[pymethods]
 impl BertPreTokenizer {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer)),
-        })
+        }))
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct Metaspace {}
 #[pymethods]
 impl Metaspace {
-    #[staticmethod]
+    #[new]
     #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
+    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut replacement = '▁';
         let mut add_prefix_space = true;
 
@@ -140,12 +140,12 @@ impl Metaspace {
             }
         }
 
-        Ok(PreTokenizer {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::metaspace::Metaspace::new(
                 replacement,
                 add_prefix_space,
             ))),
-        })
+        }))
     }
 }
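Two details of the Rust side are worth noting: `extends=PreTokenizer` makes every wrapper a real Python subclass of `PreTokenizer`, and argument validation now raises inside the constructor itself. A small sketch of the resulting behaviour (the error message is the one from the `CharDelimiterSplit` hunk above):

    from tokenizers import pre_tokenizers

    bl = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # extends=PreTokenizer means isinstance checks now work as expected
    assert isinstance(bl, pre_tokenizers.PreTokenizer)

    try:
        pre_tokenizers.CharDelimiterSplit("")  # empty string has no first char
    except Exception as err:
        print(err)  # "delimiter must be a single character"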
@@ -37,7 +37,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
             strip_accents=strip_accents,
             lowercase=lowercase,
         )
-        tokenizer.pre_tokenizer = BertPreTokenizer.new()
+        tokenizer.pre_tokenizer = BertPreTokenizer()
 
         if add_special_tokens and vocab_file is not None:
             sep_token_id = tokenizer.token_to_id(sep_token)
@@ -45,13 +45,12 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]
 
-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
+            add_prefix_space=add_prefix_space
+        )
         tokenizer.decoder = decoders.ByteLevel()
 
-        parameters = {
-            "model": "ByteLevelBPE",
-            "add_prefix_space": add_prefix_space,
-        }
+        parameters = {"model": "ByteLevelBPE", "add_prefix_space": add_prefix_space}
 
         super().__init__(tokenizer, parameters)
@@ -51,7 +51,7 @@ class CharBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]
 
-        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
         tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
 
         parameters = {
@@ -28,9 +28,11 @@ class SentencePieceBPETokenizer(BaseTokenizer):
 
         tokenizer.add_special_tokens([ unk_token ])
 
         tokenizer.normalizer = NFKC()
-        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(replacement=replacement,
-                                                               add_prefix_space=add_prefix_space)
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space
+        )
         tokenizer.decoder = decoders.Metaspace(
             replacement=replacement, add_prefix_space=add_prefix_space
         )
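The implementation classes above now wire their components through plain constructor calls. A condensed, self-contained sketch of the SentencePiece-style setup (the `Tokenizer(BPE())` line is an assumption; the hunk starts after the model is built):

    from tokenizers import Tokenizer, decoders, pre_tokenizers
    from tokenizers.models import BPE
    from tokenizers.normalizers import NFKC

    tokenizer = Tokenizer(BPE())  # assumed setup, not shown in the hunk above
    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement="▁", add_prefix_space=True
    )
    tokenizer.decoder = decoders.Metaspace(replacement="▁", add_prefix_space=True)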
@@ -21,7 +21,7 @@ class ByteLevel(PreTokenizer):
     """
 
-    @staticmethod
-    def new(add_prefix_space: Optional[bool]=True) -> PreTokenizer:
+    def __init__(self, add_prefix_space: Optional[bool] = True) -> None:
         """ Instantiate a new ByteLevel PreTokenizer
 
         Args:
@@ -50,8 +50,7 @@ class Whitespace(PreTokenizer):
     This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
     """
 
-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new Whitespace PreTokenizer """
         pass
 
@@ -61,8 +60,7 @@ class WhitespaceSplit(PreTokenizer):
     This pre-tokenizer simply splits on the whitespace. Works like `.split()`
     """
 
-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new WhitespaceSplit PreTokenizer """
         pass
 
@@ -73,8 +71,7 @@ class BertPreTokenizer(PreTokenizer):
     Each occurrence of a punctuation character will be treated separately.
     """
 
-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new BertPreTokenizer """
         pass
 
@@ -85,9 +82,7 @@ class Metaspace(PreTokenizer):
     It then tries to split on these spaces.
     """
 
-    @staticmethod
-    def new(replacement: str="▁",
-            add_prefix_space: bool=True) -> PreTokenizer:
+    def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
         """ Instantiate a new Metaspace
 
         Args:
@@ -109,11 +104,11 @@ class CharDelimiterSplit(PreTokenizer):
     """
 
-    @staticmethod
-    def new(delimiter: str) -> PreTokenizer:
+    def __init__(self, delimiter: str) -> None:
         """ Instantiate a new CharDelimiterSplit PreTokenizer
 
         Args:
             delimiter: str:
                 The delimiter char that will be used to split input
         """
         pass
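Taken together, the updated stubs document one constructor per class, with the defaults shown in the signatures above. A quick smoke test against those signatures (defaults copied from the stubs, nothing new introduced):

    from tokenizers import pre_tokenizers

    pre_tokenizers.ByteLevel()              # add_prefix_space defaults to True
    pre_tokenizers.Whitespace()
    pre_tokenizers.WhitespaceSplit()
    pre_tokenizers.BertPreTokenizer()
    pre_tokenizers.Metaspace()              # replacement="▁", add_prefix_space=True
    pre_tokenizers.CharDelimiterSplit("-")  # delimiter is required, one character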