Mirror of https://github.com/mii443/tokenizers.git, synced 2025-12-07 05:08:24 +00:00
Add BPEDecoder
@@ -93,6 +93,31 @@ impl Metaspace {
    }
}

#[pyclass]
pub struct BPEDecoder {}
#[pymethods]
impl BPEDecoder {
    #[staticmethod]
    #[args(kwargs = "**")]
    fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
        let mut suffix = String::from("</w>");

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "suffix" => suffix = value.extract()?,
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        Ok(Decoder {
            decoder: Container::Owned(Box::new(tk::decoders::bpe::BPEDecoder::new(suffix))),
        })
    }
}

struct PyDecoder {
    class: PyObject,
}
@@ -50,6 +50,7 @@ fn decoders(_py: Python, m: &PyModule) -> PyResult<()> {
    m.add_class::<decoders::ByteLevel>()?;
    m.add_class::<decoders::WordPiece>()?;
    m.add_class::<decoders::Metaspace>()?;
    m.add_class::<decoders::BPEDecoder>()?;
    Ok(())
}
@@ -4,3 +4,4 @@ Decoder = decoders.Decoder
ByteLevel = decoders.ByteLevel
WordPiece = decoders.WordPiece
Metaspace = decoders.Metaspace
BPEDecoder = decoders.BPEDecoder
@@ -50,3 +50,17 @@ class Metaspace:
        lets us treat `hello` exactly like `say hello`.
        """
        pass

class BPEDecoder:
    """ BPEDecoder """

    @staticmethod
    def new(suffix: str="</w>") -> Decoder:
        """ Instantiate a new BPEDecoder

        Args:
            suffix: str:
                The suffix that was used to characterize an end-of-word. This suffix will
                be replaced by whitespace during the decoding.
        """
        pass
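For illustration only, not part of the commit: a hedged usage sketch of the new Python API, based on the stub signature above and the kwargs handling in the bindings. The `tokenizers` import path is assumed from the `__init__` re-export earlier in this diff.

from tokenizers import decoders  # import path assumed from the __init__ re-export

# Default end-of-word suffix is "</w>", as in the stub signature.
decoder = decoders.BPEDecoder.new()

# "suffix" is the only recognized kwarg and overrides the default.
decoder = decoders.BPEDecoder.new(suffix="@@")

# Any other kwarg is not an error: per the match arm in the bindings, it is
# reported as "Ignored unknown kwarg option <name>" and the default suffix is kept.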
tokenizers/src/decoders/bpe.rs (new file, 25 lines)
@@ -0,0 +1,25 @@
use crate::tokenizer::{Decoder, Result};

/// Allows decoding Original BPE by joining all the tokens and then replacing
/// the suffix used to identify end-of-words by whitespaces
pub struct BPEDecoder {
    suffix: String,
}

impl BPEDecoder {
    pub fn new(suffix: String) -> Self {
        BPEDecoder { suffix }
    }
}

impl Default for BPEDecoder {
    fn default() -> Self {
        BPEDecoder::new("</w>".into())
    }
}

impl Decoder for BPEDecoder {
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        Ok(tokens.join("").replace(&self.suffix, " "))
    }
}
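For illustration only, not part of the commit: a standalone sketch of the decode rule implemented above, written in plain Python rather than against the crate; the helper name `decode_bpe` is made up for the example.

def decode_bpe(tokens, suffix="</w>"):
    # Join all tokens, then replace each end-of-word suffix with a space.
    return "".join(tokens).replace(suffix, " ")

# "Hel" + "lo</w>" + "wor" + "ld</w>" -> "Hello</w>world</w>"
# -> "Hello world " (note the trailing space left by the final suffix).
print(decode_bpe(["Hel", "lo</w>", "wor", "ld</w>"]))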
@@ -1,3 +1,4 @@
pub mod bpe;
pub mod wordpiece;

// Re-export these as decoders