mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-02 23:39:14 +00:00
Python - add Metaspace decoder
This commit is contained in:
@ -57,6 +57,41 @@ impl WordPiece {
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct Metaspace {}
|
||||
#[pymethods]
|
||||
impl Metaspace {
|
||||
#[staticmethod]
|
||||
#[args(kwargs = "**")]
|
||||
fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
|
||||
let mut replacement = '▁';
|
||||
let mut add_prefix_space = true;
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
for (key, value) in kwargs {
|
||||
let key: &str = key.extract()?;
|
||||
match key {
|
||||
"replacement" => {
|
||||
let s: &str = value.extract()?;
|
||||
replacement = s.chars().nth(0).ok_or(exceptions::Exception::py_err(
|
||||
"replacement must be a character",
|
||||
))?;
|
||||
}
|
||||
"add_prefix_space" => add_prefix_space = value.extract()?,
|
||||
_ => println!("Ignored unknown kwarg option {}", key),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Decoder {
|
||||
decoder: Container::Owned(Box::new(tk::decoder::metaspace::Metaspace::new(
|
||||
replacement,
|
||||
add_prefix_space,
|
||||
))),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper holding a single Python object in its `class` field.
///
/// NOTE(review): presumably `class` is a decoder implemented on the Python
/// side — confirm against the impls for this type elsewhere in the file.
struct PyDecoder {
|
||||
class: PyObject,
|
||||
}
|
||||
|
@ -49,6 +49,7 @@ fn decoders(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<decoders::Decoder>()?;
|
||||
m.add_class::<decoders::ByteLevel>()?;
|
||||
m.add_class::<decoders::WordPiece>()?;
|
||||
m.add_class::<decoders::Metaspace>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
@ -3,3 +3,4 @@ from .. import decoders
|
||||
Decoder = decoders.Decoder
|
||||
ByteLevel = decoders.ByteLevel
|
||||
WordPiece = decoders.WordPiece
|
||||
Metaspace = decoders.Metaspace
|
||||
|
@ -31,3 +31,22 @@ class WordPiece:
|
||||
The prefix to use for subwords that are not a beginning-of-word
|
||||
"""
|
||||
pass
|
||||
|
||||
class Metaspace:
    """Metaspace decoder.

    Instances are obtained through the static :meth:`new` constructor.
    """

    @staticmethod
    def new(replacement: str = "▁", add_prefix_space: bool = True) -> Decoder:
        """Create a new Metaspace decoder.

        Args:
            replacement (str, defaults to ``"▁"``):
                The replacement character; it must be exactly one character
                long. The default is the `▁` (U+2581) meta symbol, the same
                one SentencePiece uses.

            add_prefix_space (bool, defaults to ``True``):
                Whether a space is added to the first word when it does not
                already start with one, so that `hello` is treated exactly
                like `say hello`.

        Returns:
            Decoder: the newly created decoder.
        """
        pass
|
||||
|
Reference in New Issue
Block a user