mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-03 11:18:29 +00:00
Add content to Strip decoder to allow decoding mid tokens. (#1199)
* Add `content` to Strip decoder to allow decoding mid tokens. * Stub. * Clippy.
This commit is contained in:
@@ -226,7 +226,7 @@ class Strip(Decoder):
|
||||
Strips up to `left` leading and up to `right` trailing occurrences of the `content` character from each token
|
||||
"""
|
||||
|
||||
def __init__(self, left=0, right=0):
|
||||
def __init__(self, content, left=0, right=0):
|
||||
pass
|
||||
def decode(self, tokens):
|
||||
"""
|
||||
|
||||
@@ -261,34 +261,44 @@ impl PyFuseDec {
|
||||
/// Strip decoder
|
||||
/// Strips up to `left` leading and up to `right` trailing occurrences of the `content` character from each token
|
||||
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Strip")]
|
||||
#[pyo3(text_signature = "(self, left=0, right=0)")]
|
||||
#[pyo3(text_signature = "(self, content, left=0, right=0)")]
|
||||
pub struct PyStrip {}
|
||||
#[pymethods]
|
||||
impl PyStrip {
|
||||
#[getter]
|
||||
fn get_left(self_: PyRef<Self>) -> usize {
|
||||
getter!(self_, Strip, left)
|
||||
fn get_start(self_: PyRef<Self>) -> usize {
|
||||
getter!(self_, Strip, start)
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_left(self_: PyRef<Self>, left: usize) {
|
||||
setter!(self_, Strip, left, left)
|
||||
fn set_start(self_: PyRef<Self>, start: usize) {
|
||||
setter!(self_, Strip, start, start)
|
||||
}
|
||||
|
||||
#[getter]
|
||||
fn get_right(self_: PyRef<Self>) -> usize {
|
||||
getter!(self_, Strip, right)
|
||||
fn get_stop(self_: PyRef<Self>) -> usize {
|
||||
getter!(self_, Strip, stop)
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_right(self_: PyRef<Self>, right: usize) {
|
||||
setter!(self_, Strip, right, right)
|
||||
fn set_stop(self_: PyRef<Self>, stop: usize) {
|
||||
setter!(self_, Strip, stop, stop)
|
||||
}
|
||||
|
||||
#[getter]
|
||||
fn get_content(self_: PyRef<Self>) -> char {
|
||||
getter!(self_, Strip, content)
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_content(self_: PyRef<Self>, content: char) {
|
||||
setter!(self_, Strip, content, content)
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[pyo3(signature = (left=0, right=0))]
|
||||
fn new(left: usize, right: usize) -> (Self, PyDecoder) {
|
||||
(PyStrip {}, Strip::new(left, right).into())
|
||||
#[pyo3(signature = (content=' ', left=0, right=0))]
|
||||
fn new(content: char, left: usize, right: usize) -> (Self, PyDecoder) {
|
||||
(PyStrip {}, Strip::new(content, left, right).into())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -111,13 +111,13 @@ class TestFuse:
|
||||
class TestStrip:
|
||||
def test_instantiate(self):
|
||||
assert Strip(left=0, right=0) is not None
|
||||
assert isinstance(Strip(left=0, right=0), Decoder)
|
||||
assert isinstance(Strip(left=0, right=0), Strip)
|
||||
assert isinstance(pickle.loads(pickle.dumps(Strip(left=0, right=0))), Strip)
|
||||
assert isinstance(Strip(content="_", left=0, right=0), Decoder)
|
||||
assert isinstance(Strip(content="_", left=0, right=0), Strip)
|
||||
assert isinstance(pickle.loads(pickle.dumps(Strip(content="_", left=0, right=0))), Strip)
|
||||
|
||||
def test_decoding(self):
|
||||
decoder = Strip(left=1, right=0)
|
||||
assert decoder.decode(["My", " na", "me"]) == "ynae"
|
||||
decoder = Strip(content="_", left=1, right=0)
|
||||
assert decoder.decode(["_My", " na", "me", " _-", "__-"]) == "My name _-_-"
|
||||
|
||||
|
||||
class TestMetaspace:
|
||||
|
||||
Reference in New Issue
Block a user