Merge pull request #147 from huggingface/wordpiece-cleanup

Wordpiece Decoder cleanup
This commit is contained in:
MOI Anthony
2020-02-14 13:12:15 -05:00
committed by GitHub
5 changed files with 48 additions and 9 deletions

View File

@@ -13,8 +13,10 @@ export function byteLevelDecoder(): Decoder;
/**
 * Instantiate a new WordPiece Decoder
 * @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
 * @param [cleanup=true] Whether to cleanup some tokenization artifacts.
 * Mainly spaces before punctuation, and some abbreviated english forms.
 */
export function wordPieceDecoder(prefix?: string, cleanup?: boolean): Decoder;
/**
* Instantiate a new Metaspace

View File

@@ -30,19 +30,25 @@ fn byte_level(mut cx: FunctionContext) -> JsResult<JsDecoder> {
Ok(decoder)
}
/// wordpiece(prefix: String = "##", cleanup: bool = true)
///
/// Neon binding: builds a WordPiece decoder from up to two optional JS
/// arguments and wraps it in a `JsDecoder`.
fn wordpiece(mut cx: FunctionContext) -> JsResult<JsDecoder> {
    // Optional arg 0: the continuing-subword prefix (defaults to "##").
    let mut prefix = String::from("##");
    if let Some(args) = cx.argument_opt(0) {
        prefix = args.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
    }
    // Optional arg 1: whether to clean up tokenization artifacts (defaults to true).
    let mut cleanup = true;
    if let Some(args) = cx.argument_opt(1) {
        cleanup = args.downcast::<JsBoolean>().or_throw(&mut cx)?.value();
    }
    let mut decoder = JsDecoder::new::<_, JsDecoder, _>(&mut cx, vec![])?;
    let guard = cx.lock();
    decoder
        .borrow_mut(&guard)
        .decoder
        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
            prefix, cleanup,
        )));
    Ok(decoder)
}

View File

@@ -43,18 +43,24 @@ pub struct WordPiece {}
#[pymethods]
impl WordPiece {
    /// Python constructor for the WordPiece decoder.
    ///
    /// Accepted kwargs:
    ///   - `prefix` (str, default "##"): marker for continuing subwords
    ///   - `cleanup` (bool, default true): whether to clean up tokenization artifacts
    #[new]
    #[args(kwargs = "**")]
    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
        let mut prefix = String::from("##");
        let mut cleanup = true;
        if let Some(kwargs) = kwargs {
            if let Some(p) = kwargs.get_item("prefix") {
                prefix = p.extract()?;
            }
            if let Some(c) = kwargs.get_item("cleanup") {
                cleanup = c.extract()?;
            }
        }
        Ok(obj.init(Decoder {
            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
                prefix, cleanup,
            ))),
        }))
    }
}

View File

@@ -22,12 +22,15 @@ class WordPiece(Decoder):
""" WordPiece Decoder """
@staticmethod
def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
    """ Instantiate a new WordPiece Decoder

    Args:
        prefix: str:
            The prefix to use for subwords that are not a beginning-of-word

        cleanup: bool:
            Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
            and some abbreviated english forms.
    """
    # Stub only: the real implementation lives in the native extension.
    pass

View File

@@ -1,12 +1,17 @@
use crate::tokenizer::{Decoder, Result};
/// The WordPiece decoder takes care of decoding a list of wordpiece tokens
/// back into a readable string.
pub struct WordPiece {
    /// The prefix to be used for continuing subwords (e.g. "##" for BERT-style
    /// tokenizers); `decode` strips " <prefix>" to rejoin subwords.
    prefix: String,
    /// Whether to cleanup some tokenization artifacts (spaces before punctuation,
    /// split English contractions, ...) during `decode`.
    cleanup: bool,
}
impl WordPiece {
pub fn new(prefix: String) -> Self {
Self { prefix }
pub fn new(prefix: String, cleanup: bool) -> Self {
Self { prefix, cleanup }
}
}
@@ -14,12 +19,29 @@ impl Default for WordPiece {
fn default() -> Self {
    // Defaults: BERT-style "##" continuation prefix, artifact cleanup enabled.
    let prefix = String::from("##");
    let cleanup = true;
    Self { prefix, cleanup }
}
}
impl Decoder for WordPiece {
    /// Decode wordpiece tokens back into a readable string: join with spaces,
    /// strip the " <prefix>" that marks continuing subwords and, when
    /// `cleanup` is set, undo common tokenization artifacts (spaces before
    /// punctuation, split English contractions).
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        let mut output = tokens.join(" ").replace(&format!(" {}", self.prefix), "");
        if self.cleanup {
            // NOTE(review): this fixed replacement list appears to mirror the
            // `clean_up_tokenization` helper from the original BERT/transformers
            // code (including the " do not" -> " don't" rule) — keep in sync
            // with upstream rather than editing ad hoc.
            output = output
                .replace(" .", ".")
                .replace(" ?", "?")
                .replace(" !", "!")
                .replace(" ,", ",")
                .replace(" ' ", "'")
                .replace(" n't", "n't")
                .replace(" 'm", "'m")
                .replace(" do not", " don't")
                .replace(" 's", "'s")
                .replace(" 've", "'ve")
                .replace(" 're", "'re");
        }
        Ok(output)
    }
}