mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-07 13:18:31 +00:00
Implement __new__ on Decoders
Allow decoders to be initialized from Python using the class constructor.
This commit is contained in:
@@ -26,25 +26,25 @@ impl Decoder {
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
#[pyclass(extends=Decoder)]
|
||||
pub struct ByteLevel {}
|
||||
#[pymethods]
|
||||
impl ByteLevel {
|
||||
#[staticmethod]
|
||||
fn new() -> PyResult<Decoder> {
|
||||
Ok(Decoder {
|
||||
#[new]
|
||||
fn new(obj: &PyRawObject) -> PyResult<()> {
|
||||
Ok(obj.init(Decoder {
|
||||
decoder: Container::Owned(Box::new(tk::decoders::byte_level::ByteLevel::new(false))),
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
#[pyclass(extends=Decoder)]
|
||||
pub struct WordPiece {}
|
||||
#[pymethods]
|
||||
impl WordPiece {
|
||||
#[staticmethod]
|
||||
#[new]
|
||||
#[args(kwargs="**")]
|
||||
fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
|
||||
fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
|
||||
let mut prefix = String::from("##");
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
@@ -53,19 +53,19 @@ impl WordPiece {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Decoder {
|
||||
Ok(obj.init(Decoder {
|
||||
decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
#[pyclass(extends=Decoder)]
|
||||
pub struct Metaspace {}
|
||||
#[pymethods]
|
||||
impl Metaspace {
|
||||
#[staticmethod]
|
||||
#[new]
|
||||
#[args(kwargs = "**")]
|
||||
fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
|
||||
fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
|
||||
let mut replacement = '▁';
|
||||
let mut add_prefix_space = true;
|
||||
|
||||
@@ -85,22 +85,22 @@ impl Metaspace {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Decoder {
|
||||
Ok(obj.init(Decoder {
|
||||
decoder: Container::Owned(Box::new(tk::decoders::metaspace::Metaspace::new(
|
||||
replacement,
|
||||
add_prefix_space,
|
||||
))),
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
#[pyclass(extends=Decoder)]
|
||||
pub struct BPEDecoder {}
|
||||
#[pymethods]
|
||||
impl BPEDecoder {
|
||||
#[staticmethod]
|
||||
#[new]
|
||||
#[args(kwargs = "**")]
|
||||
fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
|
||||
fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
|
||||
let mut suffix = String::from("</w");
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
@@ -113,9 +113,9 @@ impl BPEDecoder {
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Decoder {
|
||||
Ok(obj.init(Decoder {
|
||||
decoder: Container::Owned(Box::new(tk::decoders::bpe::BPEDecoder::new(suffix))),
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -14,8 +14,7 @@ class Decoder:
|
||||
class ByteLevel:
|
||||
""" ByteLevel Decoder """
|
||||
|
||||
@staticmethod
|
||||
def new() -> Decoder:
|
||||
def __init__(self) -> None:
|
||||
""" Instantiate a new ByteLevel Decoder """
|
||||
pass
|
||||
|
||||
@@ -23,7 +22,7 @@ class WordPiece:
|
||||
""" WordPiece Decoder """
|
||||
|
||||
@staticmethod
|
||||
def new(prefix: str="##") -> Decoder:
|
||||
def __init__(self, prefix: str = "##") -> Decoder:
|
||||
""" Instantiate a new WordPiece Decoder
|
||||
|
||||
Args:
|
||||
@@ -35,9 +34,7 @@ class WordPiece:
|
||||
class Metaspace:
|
||||
""" Metaspace decoder """
|
||||
|
||||
@staticmethod
|
||||
def new(replacement: str="▁",
|
||||
add_prefix_space: bool=True) -> Decoder:
|
||||
def __init__(self, replacement: str = "▁", add_prefix_space: bool = True) -> None:
|
||||
""" Instantiate a new Metaspace
|
||||
|
||||
Args:
|
||||
@@ -54,8 +51,7 @@ class Metaspace:
|
||||
class BPEDecoder:
|
||||
""" BPEDecoder """
|
||||
|
||||
@staticmethod
|
||||
def new(suffix: str="</w>") -> Decoder:
|
||||
def __init__(self, suffix: str = "</w>") -> None:
|
||||
""" Instantiate a new BPEDecoder
|
||||
|
||||
Args:
|
||||
|
||||
@@ -48,7 +48,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
|
||||
(sep_token, sep_token_id),
|
||||
(cls_token, cls_token_id)
|
||||
)
|
||||
tokenizer.decoders = decoders.WordPiece.new(prefix=wordpieces_prefix)
|
||||
tokenizer.decoders = decoders.WordPiece(prefix=wordpieces_prefix)
|
||||
|
||||
parameters = {
|
||||
"model": "BertWordPiece",
|
||||
|
||||
@@ -46,7 +46,7 @@ class ByteLevelBPETokenizer(BaseTokenizer):
|
||||
tokenizer.normalizer = normalizers[0]
|
||||
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.ByteLevel.new()
|
||||
tokenizer.decoder = decoders.ByteLevel()
|
||||
|
||||
parameters = {
|
||||
"model": "ByteLevelBPE",
|
||||
|
||||
@@ -52,7 +52,7 @@ class CharBPETokenizer(BaseTokenizer):
|
||||
tokenizer.normalizer = normalizers[0]
|
||||
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
|
||||
tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)
|
||||
tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
|
||||
|
||||
parameters = {
|
||||
"model": "BPE",
|
||||
|
||||
@@ -31,8 +31,9 @@ class SentencePieceBPETokenizer(BaseTokenizer):
|
||||
tokenizer.normalizer = NFKC()
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(replacement=replacement,
|
||||
add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.Metaspace.new(replacement=replacement,
|
||||
add_prefix_space=add_prefix_space)
|
||||
tokenizer.decoder = decoders.Metaspace(
|
||||
replacement=replacement, add_prefix_space=add_prefix_space
|
||||
)
|
||||
|
||||
parameters = {
|
||||
"model": "SentencePieceBPE",
|
||||
|
||||
Reference in New Issue
Block a user