Mirror of https://github.com/mii443/tokenizers.git (synced 2025-12-09 22:28:29 +00:00)
Merge pull request #147 from huggingface/wordpiece-cleanup
Wordpiece Decoder cleanup
bindings/node/lib/bindings/decoders.d.ts (vendored, 4 changed lines)
@@ -13,8 +13,10 @@ export function byteLevelDecoder(): Decoder;
 /**
  * Instantiate a new WordPiece Decoder
  * @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
+ * @param [cleanup=true] Whether to cleanup some tokenization artifacts.
+ *                       Mainly spaces before punctuation, and some abbreviated english forms.
  */
-export function wordPieceDecoder(prefix?: string): Decoder;
+export function wordPieceDecoder(prefix?: string, cleanup?: boolean): Decoder;

 /**
  * Instantiate a new Metaspace
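The "tokenization artifacts" mentioned in the new doc comment are easiest to see on a concrete string. The sketch below exercises the core Rust decoder that this binding wraps; it is illustrative only, and it assumes the `WordPiece` struct and the `Decoder` trait are reachable at the paths shown (the token sequence is invented, not taken from this PR's tests):

```rust
// Minimal sketch: what the documented cleanup does to spaces before punctuation.
// Assumed paths into the core crate; adjust if the module layout differs.
use tokenizers::decoders::wordpiece::WordPiece;
use tokenizers::tokenizer::Decoder;

fn main() {
    // WordPiece tokens for a short sentence; "##" marks continuing subwords.
    let tokens = vec![
        "Hey".to_string(),
        "friend".to_string(),
        "!".to_string(),
        "How".to_string(),
        "are".to_string(),
        "you".to_string(),
        "?".to_string(),
    ];

    // cleanup defaults to true in the bindings; it is passed explicitly here.
    let decoder = WordPiece::new(String::from("##"), true);

    // Plain joining would give "Hey friend ! How are you ?";
    // cleanup removes the spaces before '!' and '?'.
    let decoded = decoder.decode(tokens).unwrap();
    assert_eq!(decoded, "Hey friend! How are you?");
}
```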
@@ -30,19 +30,25 @@ fn byte_level(mut cx: FunctionContext) -> JsResult<JsDecoder> {
     Ok(decoder)
 }

-/// wordpiece(prefix: String = "##")
+/// wordpiece(prefix: String = "##", cleanup: bool)
 fn wordpiece(mut cx: FunctionContext) -> JsResult<JsDecoder> {
     let mut prefix = String::from("##");
     if let Some(args) = cx.argument_opt(0) {
         prefix = args.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
     }
+    let mut cleanup = true;
+    if let Some(args) = cx.argument_opt(1) {
+        cleanup = args.downcast::<JsBoolean>().or_throw(&mut cx)?.value();
+    }

     let mut decoder = JsDecoder::new::<_, JsDecoder, _>(&mut cx, vec![])?;
     let guard = cx.lock();
     decoder
         .borrow_mut(&guard)
         .decoder
-        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix)));
+        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+            prefix, cleanup,
+        )));
     Ok(decoder)
 }

@@ -43,18 +43,24 @@ pub struct WordPiece {}
 #[pymethods]
 impl WordPiece {
     #[new]
-    #[args(kwargs="**")]
+    #[args(kwargs = "**")]
     fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut prefix = String::from("##");
+        let mut cleanup = true;

         if let Some(kwargs) = kwargs {
             if let Some(p) = kwargs.get_item("prefix") {
                 prefix = p.extract()?;
             }
+            if let Some(c) = kwargs.get_item("cleanup") {
+                cleanup = c.extract()?;
+            }
         }

         Ok(obj.init(Decoder {
-            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
+            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+                prefix, cleanup,
+            ))),
         }))
     }
 }
@@ -22,12 +22,15 @@ class WordPiece(Decoder):
     """ WordPiece Decoder """

     @staticmethod
-    def __init__(self, prefix: str = "##") -> Decoder:
+    def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
         """ Instantiate a new WordPiece Decoder

         Args:
             prefix: str:
                 The prefix to use for subwords that are not a beginning-of-word
+            cleanup: bool:
+                Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
+                and some abbreviated english forms.
         """
         pass

@@ -1,12 +1,17 @@
 use crate::tokenizer::{Decoder, Result};

+/// The WordPiece decoder takes care of decoding a list of wordpiece tokens
+/// back into a readable string.
 pub struct WordPiece {
+    /// The prefix to be used for continuing subwords
     prefix: String,
+    /// Whether to cleanup some tokenization artifacts (spaces before punctuation, ...)
+    cleanup: bool,
 }

 impl WordPiece {
-    pub fn new(prefix: String) -> Self {
-        Self { prefix }
+    pub fn new(prefix: String, cleanup: bool) -> Self {
+        Self { prefix, cleanup }
     }
 }

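Since `new` now takes a second argument, here is a quick sketch of the equivalent ways to build the decoder after this change (same path assumptions as the earlier example; `Default`, updated in the next hunk, picks `prefix = "##"` and `cleanup = true`):

```rust
// Sketch of the two constructors; both yield prefix "##" with cleanup enabled.
use tokenizers::decoders::wordpiece::WordPiece;

fn main() {
    // Explicit two-argument constructor introduced by this change.
    let _explicit = WordPiece::new(String::from("##"), true);

    // The Default impl (see the following hunk) uses the same values.
    let _default = WordPiece::default();
}
```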
@@ -14,12 +19,29 @@ impl Default for WordPiece {
     fn default() -> Self {
         Self {
             prefix: String::from("##"),
+            cleanup: true,
         }
     }
 }

 impl Decoder for WordPiece {
     fn decode(&self, tokens: Vec<String>) -> Result<String> {
-        Ok(tokens.join(" ").replace(&format!(" {}", self.prefix), ""))
+        let mut output = tokens.join(" ").replace(&format!(" {}", self.prefix), "");
+        if self.cleanup {
+            output = output
+                .replace(" .", ".")
+                .replace(" ?", "?")
+                .replace(" !", "!")
+                .replace(" ,", ",")
+                .replace(" ' ", "'")
+                .replace(" n't", "n't")
+                .replace(" 'm", "'m")
+                .replace(" do not", " don't")
+                .replace(" 's", "'s")
+                .replace(" 've", "'ve")
+                .replace(" 're", "'re");
+        }
+
+        Ok(output)
     }
 }
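To make the flag's effect concrete, here is a hedged before/after sketch of `decode` with cleanup disabled and enabled; the token sequence is invented, and the expected strings simply follow the replace chain above:

```rust
// Sketch comparing cleanup = false and cleanup = true on the same tokens.
// Assumed paths into the core crate, as in the earlier examples.
use tokenizers::decoders::wordpiece::WordPiece;
use tokenizers::tokenizer::Decoder;

fn main() {
    let tokens = vec![
        "I".to_string(),
        "'m".to_string(),
        "un".to_string(),
        "##believ".to_string(),
        "##able".to_string(),
        ".".to_string(),
    ];

    // cleanup = false: only the " ##" continuation prefix is stripped.
    let raw = WordPiece::new(String::from("##"), false)
        .decode(tokens.clone())
        .unwrap();
    assert_eq!(raw, "I 'm unbelievable .");

    // cleanup = true: the " 'm" and " ." artifacts are collapsed as well.
    let clean = WordPiece::new(String::from("##"), true)
        .decode(tokens)
        .unwrap();
    assert_eq!(clean, "I'm unbelievable.");
}
```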