Merge pull request #147 from huggingface/wordpiece-cleanup

Wordpiece Decoder cleanup
This commit is contained in:
MOI Anthony
2020-02-14 13:12:15 -05:00
committed by GitHub
5 changed files with 48 additions and 9 deletions

View File

@@ -13,8 +13,10 @@ export function byteLevelDecoder(): Decoder;
/**
 * Instantiate a new WordPiece Decoder
 * @param [prefix='##'] The prefix to use for subwords that are not a beginning-of-word
 * @param [cleanup=true] Whether to cleanup some tokenization artifacts.
 * Mainly spaces before punctuation, and some abbreviated english forms.
 */
export function wordPieceDecoder(prefix?: string, cleanup?: boolean): Decoder;
/**
* Instantiate a new Metaspace

View File

@@ -30,19 +30,25 @@ fn byte_level(mut cx: FunctionContext) -> JsResult<JsDecoder> {
Ok(decoder)
}
/// wordpiece(prefix: String = "##", cleanup: bool = true)
///
/// Neon binding: builds a WordPiece decoder from up to two optional JS
/// arguments and wraps it in a `JsDecoder`.
fn wordpiece(mut cx: FunctionContext) -> JsResult<JsDecoder> {
    // Optional arg 0: the continuing-subword prefix (defaults to "##").
    let mut prefix = String::from("##");
    if let Some(args) = cx.argument_opt(0) {
        prefix = args.downcast::<JsString>().or_throw(&mut cx)?.value() as String;
    }
    // Optional arg 1: whether to clean up tokenization artifacts (defaults to true).
    let mut cleanup = true;
    if let Some(args) = cx.argument_opt(1) {
        cleanup = args.downcast::<JsBoolean>().or_throw(&mut cx)?.value();
    }
    let mut decoder = JsDecoder::new::<_, JsDecoder, _>(&mut cx, vec![])?;
    let guard = cx.lock();
    decoder
        .borrow_mut(&guard)
        .decoder
        .to_owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
            prefix, cleanup,
        )));
    Ok(decoder)
}

View File

@@ -43,18 +43,24 @@ pub struct WordPiece {}
#[pymethods]
impl WordPiece {
    /// Python constructor for the WordPiece decoder.
    ///
    /// Accepted kwargs:
    ///   - `prefix` (str, default "##"): marker for continuing subwords
    ///   - `cleanup` (bool, default true): whether to clean up tokenization artifacts
    #[new]
    #[args(kwargs = "**")]
    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
        let mut prefix = String::from("##");
        let mut cleanup = true;
        if let Some(kwargs) = kwargs {
            if let Some(p) = kwargs.get_item("prefix") {
                prefix = p.extract()?;
            }
            if let Some(c) = kwargs.get_item("cleanup") {
                cleanup = c.extract()?;
            }
        }
        Ok(obj.init(Decoder {
            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
                prefix, cleanup,
            ))),
        }))
    }
}

View File

@@ -22,12 +22,15 @@ class WordPiece(Decoder):
""" WordPiece Decoder """
@staticmethod
def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
    """ Instantiate a new WordPiece Decoder

    Args:
        prefix: str:
            The prefix to use for subwords that are not a beginning-of-word

        cleanup: bool:
            Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation,
            and some abbreviated english forms.
    """
    # Stub only: the real implementation lives in the native extension.
    pass

View File

@@ -1,12 +1,17 @@
use crate::tokenizer::{Decoder, Result};
/// The WordPiece decoder takes care of decoding a list of wordpiece tokens
/// back into a readable string.
pub struct WordPiece {
    /// The prefix to be used for continuing subwords (e.g. "##" for BERT-style
    /// tokenizers); `decode` strips " <prefix>" to rejoin subwords.
    prefix: String,
    /// Whether to cleanup some tokenization artifacts (spaces before punctuation,
    /// split English contractions, ...) during `decode`.
    cleanup: bool,
}
impl WordPiece {
pub fn new(prefix: String) -> Self {
Self { prefix }
pub fn new(prefix: String, cleanup: bool) -> Self {
Self { prefix, cleanup }
}
}
@@ -14,12 +19,29 @@ impl Default for WordPiece {
fn default() -> Self {
    // Defaults: BERT-style "##" continuation prefix, artifact cleanup enabled.
    let prefix = String::from("##");
    let cleanup = true;
    Self { prefix, cleanup }
}
}
impl Decoder for WordPiece {
    /// Decode wordpiece tokens back into a readable string: join with spaces,
    /// strip the " <prefix>" that marks continuing subwords and, when
    /// `cleanup` is set, undo common tokenization artifacts (spaces before
    /// punctuation, split English contractions).
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        let mut output = tokens.join(" ").replace(&format!(" {}", self.prefix), "");
        if self.cleanup {
            // NOTE(review): this fixed replacement list appears to mirror the
            // `clean_up_tokenization` helper from the original BERT/transformers
            // code (including the " do not" -> " don't" rule) — keep in sync
            // with upstream rather than editing ad hoc.
            output = output
                .replace(" .", ".")
                .replace(" ?", "?")
                .replace(" !", "!")
                .replace(" ,", ",")
                .replace(" ' ", "'")
                .replace(" n't", "n't")
                .replace(" 'm", "'m")
                .replace(" do not", " don't")
                .replace(" 's", "'s")
                .replace(" 've", "'ve")
                .replace(" 're", "'re");
        }
        Ok(output)
    }
}