Python - Bindings for Wordpiece decoder's cleanup
```diff
@@ -43,18 +43,24 @@ pub struct WordPiece {}
 #[pymethods]
 impl WordPiece {
     #[new]
-    #[args(kwargs="**")]
+    #[args(kwargs = "**")]
     fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut prefix = String::from("##");
+        let mut cleanup = true;
 
         if let Some(kwargs) = kwargs {
             if let Some(p) = kwargs.get_item("prefix") {
                 prefix = p.extract()?;
             }
+            if let Some(c) = kwargs.get_item("cleanup") {
+                cleanup = c.extract()?;
+            }
         }
 
         Ok(obj.init(Decoder {
-            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
+            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+                prefix, cleanup,
+            ))),
         }))
     }
 }
```
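The new keyword argument is read from `kwargs` in the constructor above and forwarded to the core Rust decoder; the Python stub below documents it. As a quick sanity check, a minimal usage sketch (assuming the binding is exposed as `tokenizers.decoders.WordPiece`, as in the published `tokenizers` package):

```python
from tokenizers.decoders import WordPiece

# Both keyword arguments are optional; omitted ones fall back to the
# defaults set in the Rust constructor above ("##" and True).
decoder = WordPiece(prefix="##", cleanup=False)
```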
```diff
@@ -22,12 +22,15 @@ class WordPiece(Decoder):
     """ WordPiece Decoder """
 
     @staticmethod
-    def __init__(self, prefix: str = "##") -> Decoder:
+    def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
         """ Instantiate a new WordPiece Decoder
 
         Args:
             prefix: str:
                 The prefix to use for subwords that are not a beginning-of-word
+            cleanup: bool:
+                Whether to clean up some tokenization artifacts. Mainly spaces
+                before punctuation, and some abbreviated English forms.
         """
         pass
 
```
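For reference, a rough sketch of the kind of cleanup the flag enables once the WordPiece pieces are joined back into a string. This is illustrative Python only, patterned on the common BERT-style cleanup rules; it is not the actual Rust implementation, and `cleanup_artifacts` is a hypothetical name:

```python
def cleanup_artifacts(text: str) -> str:
    # Strip spaces before punctuation and re-attach common abbreviated
    # English forms (illustrative approximation, not the Rust code).
    replacements = [
        (" .", "."), (" ,", ","), (" !", "!"), (" ?", "?"),
        (" n't", "n't"), (" 'm", "'m"), (" 's", "'s"),
        (" 've", "'ve"), (" 're", "'re"),
    ]
    for before, after in replacements:
        text = text.replace(before, after)
    return text

print(cleanup_artifacts("I do n't know ."))  # -> I don't know.
```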