mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-07 21:28:19 +00:00
Add BPEDecoder
This commit is contained in:
@@ -93,6 +93,31 @@ impl Metaspace {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
pub struct BPEDecoder {}
|
||||||
|
#[pymethods]
|
||||||
|
impl BPEDecoder {
|
||||||
|
#[staticmethod]
|
||||||
|
#[args(kwargs = "**")]
|
||||||
|
fn new(kwargs: Option<&PyDict>) -> PyResult<Decoder> {
|
||||||
|
let mut suffix = String::from("</w");
|
||||||
|
|
||||||
|
if let Some(kwargs) = kwargs {
|
||||||
|
for (key, value) in kwargs {
|
||||||
|
let key: &str = key.extract()?;
|
||||||
|
match key {
|
||||||
|
"suffix" => suffix = value.extract()?,
|
||||||
|
_ => println!("Ignored unknown kwarg option {}", key),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(Decoder {
|
||||||
|
decoder: Container::Owned(Box::new(tk::decoders::bpe::BPEDecoder::new(suffix))),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct PyDecoder {
|
struct PyDecoder {
|
||||||
class: PyObject,
|
class: PyObject,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ fn decoders(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||||||
m.add_class::<decoders::ByteLevel>()?;
|
m.add_class::<decoders::ByteLevel>()?;
|
||||||
m.add_class::<decoders::WordPiece>()?;
|
m.add_class::<decoders::WordPiece>()?;
|
||||||
m.add_class::<decoders::Metaspace>()?;
|
m.add_class::<decoders::Metaspace>()?;
|
||||||
|
m.add_class::<decoders::BPEDecoder>()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -4,3 +4,4 @@ Decoder = decoders.Decoder
|
|||||||
ByteLevel = decoders.ByteLevel
|
ByteLevel = decoders.ByteLevel
|
||||||
WordPiece = decoders.WordPiece
|
WordPiece = decoders.WordPiece
|
||||||
Metaspace = decoders.Metaspace
|
Metaspace = decoders.Metaspace
|
||||||
|
BPEDecoder = decoders.BPEDecoder
|
||||||
|
|||||||
@@ -50,3 +50,17 @@ class Metaspace:
|
|||||||
lets us treat `hello` exactly like `say hello`.
|
lets us treat `hello` exactly like `say hello`.
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class BPEDecoder:
    """ BPEDecoder

    Decoder for the original BPE: all the tokens are joined together, then the
    suffix used to identify end-of-words is replaced by whitespaces.
    """

    @staticmethod
    def new(suffix: str="</w>") -> Decoder:
        """ Instantiate a new BPEDecoder

        Args:
            suffix: str:
                The suffix that was used to characterize an end-of-word. This suffix will
                be replaced by whitespaces during the decoding

        Returns:
            Decoder
        """
        pass
|
||||||
|
|||||||
25
tokenizers/src/decoders/bpe.rs
Normal file
25
tokenizers/src/decoders/bpe.rs
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
use crate::tokenizer::{Decoder, Result};
|
||||||
|
|
||||||
|
/// Decoder for the original BPE scheme: every token is joined together and the
/// suffix that marks an end-of-word is then replaced by a whitespace.
pub struct BPEDecoder {
    /// Marker identifying the end of a word inside the tokens.
    suffix: String,
}

impl BPEDecoder {
    /// Creates a decoder that treats `suffix` as the end-of-word marker.
    pub fn new(suffix: String) -> Self {
        Self { suffix }
    }
}

impl Default for BPEDecoder {
    /// Creates a decoder that uses `"</w>"` as the end-of-word marker.
    fn default() -> Self {
        Self::new(String::from("</w>"))
    }
}
|
||||||
|
|
||||||
|
impl Decoder for BPEDecoder {
    /// Concatenates `tokens` without any separator, then replaces every
    /// occurrence of the configured suffix with a single space.
    ///
    /// This implementation always returns `Ok`.
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        let joined = tokens.concat();
        Ok(joined.replace(self.suffix.as_str(), " "))
    }
}
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
|
pub mod bpe;
|
||||||
pub mod wordpiece;
|
pub mod wordpiece;
|
||||||
|
|
||||||
// Re-export these as decoders
|
// Re-export these as decoders
|
||||||
|
|||||||
Reference in New Issue
Block a user