Mirror of https://github.com/mii443/tokenizers.git, synced 2025-12-07 05:08:24 +00:00
Add BPEDecoder
@@ -93,6 +93,31 @@ impl Metaspace {
    }
}

#[pyclass]
pub struct BPEDecoder {}
#[pymethods]
impl BPEDecoder {
    #[staticmethod]
    #[args(kwargs = "**")]
    fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
        let mut suffix = String::from("</w>");

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "suffix" => suffix = value.extract()?,
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        Ok(Decoder {
            decoder: Container::Owned(Box::new(tk::decoders::bpe::BPEDecoder::new(suffix))),
        })
    }
}

struct PyDecoder {
    class: PyObject,
}
@@ -50,6 +50,7 @@ fn decoders(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<decoders::ByteLevel>()?;
    m.add_class::<decoders::WordPiece>()?;
    m.add_class::<decoders::Metaspace>()?;
    m.add_class::<decoders::BPEDecoder>()?;
    Ok(())
}
@@ -4,3 +4,4 @@ Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
@@ -50,3 +50,17 @@ class Metaspace:
        lets us treat `hello` exactly like `say hello`.
        """
        pass

class BPEDecoder:
    """ BPEDecoder """

    @staticmethod
    def new(suffix: str="</w>") -> Decoder:
        """ Instantiate a new BPEDecoder

        Args:
            suffix: str:
                The suffix that was used to characterize an end-of-word. This suffix will
                be replaced by whitespace during the decoding.
        """
        pass
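For illustration only, not part of the commit: a hedged usage sketch of the new Python API, based on the stub signature above and the kwargs handling in the bindings. The `tokenizers` import path is assumed from the `__init__` re-export earlier in this diff.

from tokenizers import decoders  # import path assumed from the __init__ re-export

# Default end-of-word suffix is "</w>", as in the stub signature.
decoder = decoders.BPEDecoder.new()

# "suffix" is the only recognized kwarg and overrides the default.
decoder = decoders.BPEDecoder.new(suffix="@@")

# Any other kwarg is not an error: per the match arm in the bindings, it is
# reported as "Ignored unknown kwarg option <name>" and the default suffix is kept.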
tokenizers/src/decoders/bpe.rs (new file, 25 lines)
@@ -0,0 +1,25 @@
use crate::tokenizer::{Decoder, Result};

/// Allows decoding Original BPE by joining all the tokens and then replacing
/// the suffix used to identify end-of-words by whitespaces
pub struct BPEDecoder {
    suffix: String,
}

impl BPEDecoder {
    pub fn new(suffix: String) -> Self {
        BPEDecoder { suffix }
    }
}

impl Default for BPEDecoder {
    fn default() -> Self {
        BPEDecoder::new("</w>".into())
    }
}

impl Decoder for BPEDecoder {
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        Ok(tokens.join("").replace(&self.suffix, " "))
    }
}
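For illustration only, not part of the commit: a standalone sketch of the decode rule implemented above, written in plain Python rather than against the crate; the helper name `decode_bpe` is made up for the example.

def decode_bpe(tokens, suffix="</w>"):
    # Join all tokens, then replace each end-of-word suffix with a space.
    return "".join(tokens).replace(suffix, " ")

# "Hel" + "lo</w>" + "wor" + "ld</w>" -> "Hello</w>world</w>"
# -> "Hello world " (note the trailing space left by the final suffix).
print(decode_bpe(["Hel", "lo</w>", "wor", "ld</w>"]))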
@@ -1,3 +1,4 @@
pub mod bpe;
pub mod wordpiece;

// Re-export these as decoders