Merge pull request #147 from huggingface/wordpiece-cleanup

Wordpiece Decoder cleanup
2025-12-07 21:28:19 +00:00 · 2020-02-14 13:12:15 -05:00
parent c4bac6aeeb 2aa8366a14
commit 3cac26cdb2
5 changed files with 48 additions and 9 deletions
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -43,18 +43,24 @@ pub struct WordPiece {}
 #[pymethods]
 impl WordPiece {
    #[new]
-    #[args(kwargs="**")]
+    #[args(kwargs = "**")]
    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
        let mut prefix = String::from("##");
+        let mut cleanup = true;

        if let Some(kwargs) = kwargs {
            if let Some(p) = kwargs.get_item("prefix") {
                prefix = p.extract()?;
            }
+            if let Some(c) = kwargs.get_item("cleanup") {
+                cleanup = c.extract()?;
+            }
        }

        Ok(obj.init(Decoder {
-            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
+            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+                prefix, cleanup,
+            ))),
        }))
    }
 }
--- a/bindings/python/tokenizers/decoders/init.pyi
+++ b/bindings/python/tokenizers/decoders/init.pyi
@@ -22,12 +22,15 @@ class WordPiece(Decoder):
    """ WordPiece Decoder """

    @staticmethod
-    def __init__(self, prefix: str = "##") -> Decoder:
+    def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
        """ Instantiate a new WordPiece Decoder

        Args:
            prefix: str:
                The prefix to use for subwords that are not a beginning-of-word
+            cleanup: bool:
+                Whether to clean up some tokenization artifacts, mainly spaces before
+                punctuation and some abbreviated English forms.
        """
        pass