Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-09 22:28:29 +00:00)
Merge pull request #147 from huggingface/wordpiece-cleanup
Wordpiece Decoder cleanup
bindings/node/lib/bindings/decoders.d.ts (vendored, 4 changed lines)
@@ -13,8 +13,10 @@ export function byteLevelDecoder(): Decoder;
 /**
  * Instantiate a new WordPiece Decoder
  * @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
+ * @param [cleanup=true] Whether to cleanup some tokenization artifacts.
+ *                       Mainly spaces before punctuation, and some abbreviated english forms.
  */
-export function wordPieceDecoder(prefix?: string): Decoder;
+export function wordPieceDecoder(prefix?: string, cleanup?: boolean): Decoder;

 /**
  * Instantiate a new Metaspace
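The "tokenization artifacts" mentioned in the new doc comment are easiest to see on a concrete string. The sketch below exercises the core Rust decoder that this binding wraps; it is illustrative only, and it assumes the `WordPiece` struct and the `Decoder` trait are reachable at the paths shown (the token sequence is invented, not taken from this PR's tests):

```rust
// Minimal sketch: what the documented cleanup does to spaces before punctuation.
// Assumed paths into the core crate; adjust if the module layout differs.
use tokenizers::decoders::wordpiece::WordPiece;
use tokenizers::tokenizer::Decoder;

fn main() {
    // WordPiece tokens for a short sentence; "##" marks continuing subwords.
    let tokens = vec![
        "Hey".to_string(),
        "friend".to_string(),
        "!".to_string(),
        "How".to_string(),
        "are".to_string(),
        "you".to_string(),
        "?".to_string(),
    ];

    // cleanup defaults to true in the bindings; it is passed explicitly here.
    let decoder = WordPiece::new(String::from("##"), true);

    // Plain joining would give "Hey friend ! How are you ?";
    // cleanup removes the spaces before '!' and '?'.
    let decoded = decoder.decode(tokens).unwrap();
    assert_eq!(decoded, "Hey friend! How are you?");
}
```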
@@ -30,19 +30,25 @@ fn byte_level(mut cx: FunctionContext) -> JsResult<JsDecoder> {
     Ok(decoder)
 }

-/// wordpiece(prefix: String = "##")
+/// wordpiece(prefix: String = "##", cleanup: bool)
 fn wordpiece(mut cx: FunctionContext) -> JsResult<JsDecoder> {
     let mut prefix = String::from("##");
     if let Some(args) = cx.argument_opt(0) {
         prefix = args.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
     }
+    let mut cleanup = true;
+    if let Some(args) = cx.argument_opt(1) {
+        cleanup = args.downcast::<JsBoolean>().or_throw(&mut cx)?.value();
+    }

     let mut decoder = JsDecoder::new::<_, JsDecoder, _>(&mut cx, vec![])?;
     let guard = cx.lock();
     decoder
         .borrow_mut(&guard)
         .decoder
-        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix)));
+        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+            prefix, cleanup,
+        )));
     Ok(decoder)
 }

@@ -43,18 +43,24 @@ pub struct WordPiece {}
 #[pymethods]
 impl WordPiece {
     #[new]
-    #[args(kwargs="**")]
+    #[args(kwargs = "**")]
     fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut prefix = String::from("##");
+        let mut cleanup = true;

         if let Some(kwargs) = kwargs {
             if let Some(p) = kwargs.get_item("prefix") {
                 prefix = p.extract()?;
             }
+            if let Some(c) = kwargs.get_item("cleanup") {
+                cleanup = c.extract()?;
+            }
         }

         Ok(obj.init(Decoder {
-            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
+            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+                prefix, cleanup,
+            ))),
         }))
     }
 }
@@ -22,12 +22,15 @@ class WordPiece(Decoder):
     """ WordPiece Decoder """

     @staticmethod
-    def __init__(self, prefix: str = "##") -> Decoder:
+    def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
         """ Instantiate a new WordPiece Decoder

         Args:
             prefix: str:
                 The prefix to use for subwords that are not a beginning-of-word
+            cleanup: bool:
+                Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
+                and some abbreviated english forms.
         """
         pass

@@ -1,12 +1,17 @@
 use crate::tokenizer::{Decoder, Result};

+/// The WordPiece decoder takes care of decoding a list of wordpiece tokens
+/// back into a readable string.
 pub struct WordPiece {
+    /// The prefix to be used for continuing subwords
     prefix: String,
+    /// Whether to cleanup some tokenization artifacts (spaces before punctuation, ...)
+    cleanup: bool,
 }

 impl WordPiece {
-    pub fn new(prefix: String) -> Self {
-        Self { prefix }
+    pub fn new(prefix: String, cleanup: bool) -> Self {
+        Self { prefix, cleanup }
     }
 }

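Since `new` now takes a second argument, here is a quick sketch of the equivalent ways to build the decoder after this change (same path assumptions as the earlier example; `Default`, updated in the next hunk, picks `prefix = "##"` and `cleanup = true`):

```rust
// Sketch of the two constructors; both yield prefix "##" with cleanup enabled.
use tokenizers::decoders::wordpiece::WordPiece;

fn main() {
    // Explicit two-argument constructor introduced by this change.
    let _explicit = WordPiece::new(String::from("##"), true);

    // The Default impl (see the following hunk) uses the same values.
    let _default = WordPiece::default();
}
```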
@@ -14,12 +19,29 @@ impl Default for WordPiece {
     fn default() -> Self {
         Self {
             prefix: String::from("##"),
+            cleanup: true,
         }
     }
 }

 impl Decoder for WordPiece {
     fn decode(&self, tokens: Vec<String>) -> Result<String> {
-        Ok(tokens.join(" ").replace(&format!(" {}", self.prefix), ""))
+        let mut output = tokens.join(" ").replace(&format!(" {}", self.prefix), "");
+        if self.cleanup {
+            output = output
+                .replace(" .", ".")
+                .replace(" ?", "?")
+                .replace(" !", "!")
+                .replace(" ,", ",")
+                .replace(" ' ", "'")
+                .replace(" n't", "n't")
+                .replace(" 'm", "'m")
+                .replace(" do not", " don't")
+                .replace(" 's", "'s")
+                .replace(" 've", "'ve")
+                .replace(" 're", "'re");
+        }
+
+        Ok(output)
     }
 }
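To make the flag's effect concrete, here is a hedged before/after sketch of `decode` with cleanup disabled and enabled; the token sequence is invented, and the expected strings simply follow the replace chain above:

```rust
// Sketch comparing cleanup = false and cleanup = true on the same tokens.
// Assumed paths into the core crate, as in the earlier examples.
use tokenizers::decoders::wordpiece::WordPiece;
use tokenizers::tokenizer::Decoder;

fn main() {
    let tokens = vec![
        "I".to_string(),
        "'m".to_string(),
        "un".to_string(),
        "##believ".to_string(),
        "##able".to_string(),
        ".".to_string(),
    ];

    // cleanup = false: only the " ##" continuation prefix is stripped.
    let raw = WordPiece::new(String::from("##"), false)
        .decode(tokens.clone())
        .unwrap();
    assert_eq!(raw, "I 'm unbelievable .");

    // cleanup = true: the " 'm" and " ." artifacts are collapsed as well.
    let clean = WordPiece::new(String::from("##"), true)
        .decode(tokens)
        .unwrap();
    assert_eq!(clean, "I'm unbelievable.");
}
```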