Mirror of https://github.com/mii443/tokenizers.git, synced 2025-12-08 21:58:18 +00:00
Merge pull request #147 from huggingface/wordpiece-cleanup
Wordpiece Decoder cleanup
bindings/node/lib/bindings/decoders.d.ts (vendored): 4 changes
@@ -13,8 +13,10 @@ export function byteLevelDecoder(): Decoder;
 /**
  * Instantiate a new WordPiece Decoder
  * @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
+ * @param [cleanup=true] Whether to cleanup some tokenization artifacts.
+ * Mainly spaces before punctuation, and some abbreviated english forms.
  */
-export function wordPieceDecoder(prefix?: string): Decoder;
+export function wordPieceDecoder(prefix?: string, cleanup?: boolean): Decoder;
 
 /**
  * Instantiate a new Metaspace
@@ -30,19 +30,25 @@ fn byte_level(mut cx: FunctionContext) -> JsResult<JsDecoder> {
     Ok(decoder)
 }
 
-/// wordpiece(prefix: String = "##")
+/// wordpiece(prefix: String = "##", cleanup: bool)
 fn wordpiece(mut cx: FunctionContext) -> JsResult<JsDecoder> {
     let mut prefix = String::from("##");
     if let Some(args) = cx.argument_opt(0) {
         prefix = args.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
     }
+    let mut cleanup = true;
+    if let Some(args) = cx.argument_opt(1) {
+        cleanup = args.downcast::<JsBoolean>().or_throw(&mut cx)?.value();
+    }
 
     let mut decoder = JsDecoder::new::<_, JsDecoder, _>(&mut cx, vec![])?;
     let guard = cx.lock();
     decoder
         .borrow_mut(&guard)
         .decoder
-        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix)));
+        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+            prefix, cleanup,
+        )));
     Ok(decoder)
 }
 
@@ -43,18 +43,24 @@ pub struct WordPiece {}
 #[pymethods]
 impl WordPiece {
     #[new]
-    #[args(kwargs="**")]
+    #[args(kwargs = "**")]
     fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut prefix = String::from("##");
+        let mut cleanup = true;
 
         if let Some(kwargs) = kwargs {
             if let Some(p) = kwargs.get_item("prefix") {
                 prefix = p.extract()?;
             }
+            if let Some(c) = kwargs.get_item("cleanup") {
+                cleanup = c.extract()?;
+            }
         }
 
         Ok(obj.init(Decoder {
-            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
+            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+                prefix, cleanup,
+            ))),
         }))
     }
 }
@@ -22,12 +22,15 @@ class WordPiece(Decoder):
     """ WordPiece Decoder """
 
     @staticmethod
-    def __init__(self, prefix: str = "##") -> Decoder:
+    def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
         """ Instantiate a new WordPiece Decoder
 
         Args:
             prefix: str:
                 The prefix to use for subwords that are not a beginning-of-word
+            cleanup: bool:
+                Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
+                and some abbreviated english forms.
         """
         pass
 
@@ -1,12 +1,17 @@
 use crate::tokenizer::{Decoder, Result};
 
+/// The WordPiece decoder takes care of decoding a list of wordpiece tokens
+/// back into a readable string.
 pub struct WordPiece {
+    /// The prefix to be used for continuing subwords
     prefix: String,
+    /// Whether to cleanup some tokenization artifacts (spaces before punctuation, ...)
+    cleanup: bool,
 }
 
 impl WordPiece {
-    pub fn new(prefix: String) -> Self {
-        Self { prefix }
+    pub fn new(prefix: String, cleanup: bool) -> Self {
+        Self { prefix, cleanup }
     }
 }
 
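As a quick illustration of the new two-argument constructor, here is a minimal test-style sketch (not part of the PR; it assumes it lives in the same module as the decoder, so the private `prefix` and `cleanup` fields are visible, and the test name is made up):

#[cfg(test)]
mod construction_sketch {
    use super::*;

    #[test]
    fn new_stores_prefix_and_cleanup() {
        // `cleanup` is the new second argument; `false` disables the artifact cleanup.
        let decoder = WordPiece::new(String::from("##"), false);
        assert_eq!(decoder.prefix, "##");
        assert!(!decoder.cleanup);
    }
}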
@@ -14,12 +19,29 @@ impl Default for WordPiece {
     fn default() -> Self {
         Self {
             prefix: String::from("##"),
+            cleanup: true,
         }
     }
 }
 
 impl Decoder for WordPiece {
     fn decode(&self, tokens: Vec<String>) -> Result<String> {
-        Ok(tokens.join(" ").replace(&format!(" {}", self.prefix), ""))
+        let mut output = tokens.join(" ").replace(&format!(" {}", self.prefix), "");
+        if self.cleanup {
+            output = output
+                .replace(" .", ".")
+                .replace(" ?", "?")
+                .replace(" !", "!")
+                .replace(" ,", ",")
+                .replace(" ' ", "'")
+                .replace(" n't", "n't")
+                .replace(" 'm", "'m")
+                .replace(" do not", " don't")
+                .replace(" 's", "'s")
+                .replace(" 've", "'ve")
+                .replace(" 're", "'re");
+        }
+
+        Ok(output)
     }
 }
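To make the effect of the `cleanup` flag concrete, here is a small standalone sketch that mirrors the decode path above (a free-standing re-implementation for illustration only, not the crate's API; the token values are made up):

fn decode(tokens: &[&str], prefix: &str, cleanup: bool) -> String {
    // Join the wordpiece tokens and strip the continuing-subword prefix.
    let mut output = tokens.join(" ").replace(&format!(" {}", prefix), "");
    if cleanup {
        // Same artifact cleanup as in the diff: spaces before punctuation
        // and a few abbreviated English forms.
        output = output
            .replace(" .", ".")
            .replace(" ?", "?")
            .replace(" !", "!")
            .replace(" ,", ",")
            .replace(" ' ", "'")
            .replace(" n't", "n't")
            .replace(" 'm", "'m")
            .replace(" do not", " don't")
            .replace(" 's", "'s")
            .replace(" 've", "'ve")
            .replace(" 're", "'re");
    }
    output
}

fn main() {
    let tokens = ["I", "do", "n't", "like", "token", "##ization", "artifacts", "."];
    // Without cleanup: "I do n't like tokenization artifacts ."
    println!("{}", decode(&tokens, "##", false));
    // With cleanup:    "I don't like tokenization artifacts."
    println!("{}", decode(&tokens, "##", true));
}

Running it shows the only difference the flag makes on this input: the space before the final period disappears and "do n't" is merged back into "don't"; the prefix stripping itself is unchanged.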