mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-16 17:18:43 +00:00
Python - Add ByteLevel PostProcessor
This commit is contained in:
@@ -62,6 +62,7 @@ fn processors(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||||||
m.add_class::<processors::PostProcessor>()?;
|
m.add_class::<processors::PostProcessor>()?;
|
||||||
m.add_class::<processors::BertProcessing>()?;
|
m.add_class::<processors::BertProcessing>()?;
|
||||||
m.add_class::<processors::RobertaProcessing>()?;
|
m.add_class::<processors::RobertaProcessing>()?;
|
||||||
|
m.add_class::<processors::ByteLevel>()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -42,3 +42,17 @@ impl RobertaProcessing {
|
|||||||
}))
|
}))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyclass(extends=PostProcessor)]
|
||||||
|
pub struct ByteLevel {}
|
||||||
|
#[pymethods]
|
||||||
|
impl ByteLevel {
|
||||||
|
#[new]
|
||||||
|
fn new(obj: &PyRawObject) -> PyResult<()> {
|
||||||
|
Ok(obj.init(PostProcessor {
|
||||||
|
processor: Container::Owned(Box::new(tk::processors::byte_level::ByteLevel::new(
|
||||||
|
false,
|
||||||
|
))),
|
||||||
|
}))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -3,3 +3,4 @@ from .. import processors
|
|||||||
PostProcessor = processors.PostProcessor
|
PostProcessor = processors.PostProcessor
|
||||||
BertProcessing = processors.BertProcessing
|
BertProcessing = processors.BertProcessing
|
||||||
RobertaProcessing = processors.RobertaProcessing
|
RobertaProcessing = processors.RobertaProcessing
|
||||||
|
ByteLevel = processors.ByteLevel
|
||||||
|
|||||||
@@ -62,3 +62,15 @@ class RobertaProcessing(PostProcessor):
|
|||||||
PostProcessor
|
PostProcessor
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class ByteLevel(PostProcessor):
    """ ByteLevel Post processing

    This post-processor takes care of fixing the offsets after the BPE Model may have
    produced some bad offsets while merging. This happens for any unicode character that
    gets split up into many byte-level characters.
    """

    def __init__(self) -> None:
        """ Instantiate a new ByteLevel """
        pass
|
||||||
|
|||||||
Reference in New Issue
Block a user