Add content to Strip decoder to allow decoding mid tokens. (#1199)

* Add `content` to Strip decoder to allow decoding mid tokens.

* Stub.

* Clippy.
This commit is contained in:
Nicolas Patry
2023-03-24 10:14:49 +01:00
committed by GitHub
parent 8a6a8dc9d5
commit 3aaf4946b3
7 changed files with 78 additions and 39 deletions

View File

@@ -226,7 +226,7 @@ class Strip(Decoder):
Strips up to `left` leading and up to `right` trailing `content` characters from each token
"""
def __init__(self, left=0, right=0):
def __init__(self, content, left=0, right=0):
pass
def decode(self, tokens):
"""

View File

@@ -261,34 +261,44 @@ impl PyFuseDec {
/// Strip decoder
/// Strips up to `left` leading and up to `right` trailing `content` characters from each token
// NOTE(review): the constructor's pyo3 `signature` gives `content` a default of ' ',
// but the text_signature below advertises it as a required argument — confirm which is intended.
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "Strip")]
#[pyo3(text_signature = "(self, left=0, right=0)")]
#[pyo3(text_signature = "(self, content, left=0, right=0)")]
pub struct PyStrip {}
// Python-facing properties and constructor for the `Strip` decoder.
// Every accessor delegates to the `getter!` / `setter!` macros, which are defined
// outside this view; the comments below describe only what is passed to them.
// NOTE(review): the properties are exposed as `start` / `stop`, while the constructor
// keywords are `left` / `right` — presumably the same two counts; confirm against `Strip::new`.
#[pymethods]
impl PyStrip {
// Property getter: reads the `start` field of the wrapped `Strip` (a `usize`).
#[getter]
fn get_left(self_: PyRef<Self>) -> usize {
getter!(self_, Strip, left)
fn get_start(self_: PyRef<Self>) -> usize {
getter!(self_, Strip, start)
}
// Property setter: writes a new `usize` into the `start` field of the wrapped `Strip`.
#[setter]
fn set_left(self_: PyRef<Self>, left: usize) {
setter!(self_, Strip, left, left)
fn set_start(self_: PyRef<Self>, start: usize) {
setter!(self_, Strip, start, start)
}
// Property getter: reads the `stop` field of the wrapped `Strip` (a `usize`).
#[getter]
fn get_right(self_: PyRef<Self>) -> usize {
getter!(self_, Strip, right)
fn get_stop(self_: PyRef<Self>) -> usize {
getter!(self_, Strip, stop)
}
// Property setter: writes a new `usize` into the `stop` field of the wrapped `Strip`.
#[setter]
fn set_right(self_: PyRef<Self>, right: usize) {
setter!(self_, Strip, right, right)
fn set_stop(self_: PyRef<Self>, stop: usize) {
setter!(self_, Strip, stop, stop)
}
// Property getter: reads the `content` field, the single character the decoder strips.
#[getter]
fn get_content(self_: PyRef<Self>) -> char {
getter!(self_, Strip, content)
}
// Property setter: replaces the `content` character; the parameter type is `char`,
// so the Python side must supply exactly one character.
#[setter]
fn set_content(self_: PyRef<Self>, content: char) {
setter!(self_, Strip, content, content)
}
// Constructor exposed to Python. Per the pyo3 signature, `content` defaults to a
// single space and `left` / `right` default to 0, so calls that pass only
// `left` / `right` remain valid. Returns the field-less `PyStrip` together with the
// `Strip` value converted (via `.into()`) into the `PyDecoder` base class.
#[new]
#[pyo3(signature = (left=0, right=0))]
fn new(left: usize, right: usize) -> (Self, PyDecoder) {
(PyStrip {}, Strip::new(left, right).into())
#[pyo3(signature = (content=' ', left=0, right=0))]
fn new(content: char, left: usize, right: usize) -> (Self, PyDecoder) {
(PyStrip {}, Strip::new(content, left, right).into())
}
}

View File

@@ -111,13 +111,13 @@ class TestFuse:
class TestStrip:
def test_instantiate(self):
assert Strip(left=0, right=0) is not None
assert isinstance(Strip(left=0, right=0), Decoder)
assert isinstance(Strip(left=0, right=0), Strip)
assert isinstance(pickle.loads(pickle.dumps(Strip(left=0, right=0))), Strip)
assert isinstance(Strip(content="_", left=0, right=0), Decoder)
assert isinstance(Strip(content="_", left=0, right=0), Strip)
assert isinstance(pickle.loads(pickle.dumps(Strip(content="_", left=0, right=0))), Strip)
def test_decoding(self):
decoder = Strip(left=1, right=0)
assert decoder.decode(["My", " na", "me"]) == "ynae"
decoder = Strip(content="_", left=1, right=0)
assert decoder.decode(["_My", " na", "me", " _-", "__-"]) == "My name _-_-"
class TestMetaspace: