Python - Bindings for Wordpiece decoder's cleanup
```diff
@@ -43,18 +43,24 @@ pub struct WordPiece {}
 #[pymethods]
 impl WordPiece {
     #[new]
-    #[args(kwargs="**")]
+    #[args(kwargs = "**")]
     fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut prefix = String::from("##");
+        let mut cleanup = true;
 
         if let Some(kwargs) = kwargs {
             if let Some(p) = kwargs.get_item("prefix") {
                 prefix = p.extract()?;
             }
+            if let Some(c) = kwargs.get_item("cleanup") {
+                cleanup = c.extract()?;
+            }
         }
 
         Ok(obj.init(Decoder {
-            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(prefix))),
+            decoder: Container::Owned(Box::new(tk::decoders::wordpiece::WordPiece::new(
+                prefix, cleanup,
+            ))),
         }))
     }
 }
```
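The new keyword argument is read from `kwargs` in the constructor above and forwarded to the core Rust decoder; the Python stub below documents it. As a quick sanity check, a minimal usage sketch (assuming the binding is exposed as `tokenizers.decoders.WordPiece`, as in the published `tokenizers` package):

```python
from tokenizers.decoders import WordPiece

# Both keyword arguments are optional; omitted ones fall back to the
# defaults set in the Rust constructor above ("##" and True).
decoder = WordPiece(prefix="##", cleanup=False)
```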
```diff
@@ -22,12 +22,15 @@ class WordPiece(Decoder):
     """ WordPiece Decoder """
 
     @staticmethod
-    def __init__(self, prefix: str = "##") -> Decoder:
+    def __init__(self, prefix: str = "##", cleanup: bool = True) -> Decoder:
         """ Instantiate a new WordPiece Decoder
 
         Args:
             prefix: str:
                 The prefix to use for subwords that are not a beginning-of-word
+            cleanup: bool:
+                Whether to clean up some tokenization artifacts. Mainly spaces
+                before punctuation, and some abbreviated English forms.
         """
         pass
 
```
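For reference, a rough sketch of the kind of cleanup the flag enables once the WordPiece pieces are joined back into a string. This is illustrative Python only, patterned on the common BERT-style cleanup rules; it is not the actual Rust implementation, and `cleanup_artifacts` is a hypothetical name:

```python
def cleanup_artifacts(text: str) -> str:
    # Strip spaces before punctuation and re-attach common abbreviated
    # English forms (illustrative approximation, not the Rust code).
    replacements = [
        (" .", "."), (" ,", ","), (" !", "!"), (" ?", "?"),
        (" n't", "n't"), (" 'm", "'m"), (" 's", "'s"),
        (" 've", "'ve"), (" 're", "'re"),
    ]
    for before, after in replacements:
        text = text.replace(before, after)
    return text

print(cleanup_artifacts("I do n't know ."))  # -> I don't know.
```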