Add BPEDecoder

This commit is contained in:
Anthony MOI
2020-01-07 19:56:49 -05:00
parent 5bc1e2ee05
commit 243a45af40
6 changed files with 67 additions and 0 deletions

View File

@@ -93,6 +93,31 @@ impl Metaspace {
}
}
// Python-visible class for the BPE decoder. The struct itself carries no
// state: it only hosts the static `new` constructor, which returns a
// generic `Decoder` wrapping the core BPE decoder.
#[pyclass]
pub struct BPEDecoder {}
#[pymethods]
impl BPEDecoder {
    /// Instantiate a new BPE decoder, returned as a generic `Decoder`.
    ///
    /// Supported keyword arguments:
    /// - `suffix` (str): the end-of-word marker handed to the core decoder.
    ///   Defaults to `"</w>"`.
    ///
    /// Unknown keyword arguments are not an error: a notice is printed to
    /// stdout and the argument is ignored. An error is returned when a key is
    /// not a string or when `suffix` cannot be extracted as a string.
    #[staticmethod]
    #[args(kwargs = "**")]
    fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
        // Same default as `tk::decoders::bpe::BPEDecoder::default()`; the
        // closing `>` is part of the marker and must not be dropped.
        let mut suffix = String::from("</w>");

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "suffix" => suffix = value.extract()?,
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        Ok(Decoder {
            decoder: Container::Owned(Box::new(tk::decoders::bpe::BPEDecoder::new(suffix))),
        })
    }
}
// Holds a reference to a Python object in `class`.
// NOTE(review): presumably this is a decoder implemented on the Python side
// that gets called back from Rust — confirm against the impl that follows.
struct PyDecoder {
class: PyObject,
}

View File

@@ -50,6 +50,7 @@ fn decoders(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<decoders::ByteLevel>()?;
m.add_class::<decoders::WordPiece>()?;
m.add_class::<decoders::Metaspace>()?;
m.add_class::<decoders::BPEDecoder>()?;
Ok(())
}

View File

@@ -4,3 +4,4 @@ Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder

View File

@@ -50,3 +50,17 @@ class Metaspace:
lets us treat `hello` exactly like `say hello`.
"""
pass
class BPEDecoder:
""" BPEDecoder

Decoder for the original BPE scheme. Instances are obtained through the
static `new` method, which returns a `Decoder`.
"""
@staticmethod
def new(suffix: str="</w>") -> Decoder:
""" Instantiate a new BPEDecoder

Args:
suffix: str:
The suffix that was used to characterize an end-of-word. This suffix
will be replaced by whitespace during decoding. Defaults to "</w>".

Returns:
A Decoder
"""
pass

View File

@@ -0,0 +1,25 @@
use crate::tokenizer::{Decoder, Result};
/// Allows decoding the original BPE by joining all the tokens and then
/// replacing the suffix used to identify end-of-words with whitespace.
pub struct BPEDecoder {
/// Marker that identifies the end of a word inside the tokens.
suffix: String,
}
impl BPEDecoder {
pub fn new(suffix: String) -> Self {
BPEDecoder { suffix }
}
}
impl Default for BPEDecoder {
fn default() -> Self {
BPEDecoder::new("</w>".into())
}
}
impl Decoder for BPEDecoder {
    /// Join `tokens` into a single string and turn every end-of-word marker
    /// into a single space.
    ///
    /// The marker that terminates the whole sequence is dropped rather than
    /// replaced, so the decoded text does not end with spurious whitespace.
    /// Always returns `Ok`.
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        let marker = self.suffix.as_str();
        let mut text = tokens.join("");

        // The last word has nothing after it to be separated from: remove its
        // marker before the remaining ones become separators.
        if text.ends_with(marker) {
            text.truncate(text.len() - marker.len());
        }

        Ok(text.replace(marker, " "))
    }
}

View File

@@ -1,3 +1,4 @@
pub mod bpe;
pub mod wordpiece;
// Re-export these as decoders