mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-02 23:39:14 +00:00
WordPiece decoder with customizable prefix
This commit is contained in:
@ -42,9 +42,17 @@ pub struct WordPiece {}
|
||||
#[pymethods]
|
||||
impl WordPiece {
|
||||
#[staticmethod]
|
||||
fn new() -> PyResult<Decoder> {
|
||||
fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
|
||||
let mut prefix = String::from("##");
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
if let Some(p) = kwargs.get_item("prefix") {
|
||||
prefix = p.extract()?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Decoder {
|
||||
decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece)),
|
||||
decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -24,6 +24,11 @@ class WordPiece:
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def new() -> Decoder:
|
||||
""" Instantiate a new WordPiece Decoder """
|
||||
def new(prefix: str="##") -> Decoder:
|
||||
""" Instantiate a new WordPiece Decoder
|
||||
|
||||
Args:
|
||||
prefix: str:
|
||||
The prefix to use for subwords that are not at the beginning of a word
|
||||
"""
|
||||
pass
|
||||
|
Reference in New Issue
Block a user