mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
Python - Add WordPiece model
This commit is contained in:
@ -5,8 +5,8 @@ use super::utils::Container;
|
|||||||
use pyo3::exceptions;
|
use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
|
|
||||||
/// Represents any Model to be used with a Tokenizer
|
/// A Model represents some tokenization algorithm like BPE or WordPiece.
|
||||||
/// This class is to be constructed from specific models
|
/// This class cannot be constructed directly. Please use one of the concrete models.
|
||||||
#[pyclass]
|
#[pyclass]
|
||||||
pub struct Model {
|
pub struct Model {
|
||||||
pub model: Container<dyn tk::tokenizer::Model + Sync>,
|
pub model: Container<dyn tk::tokenizer::Model + Sync>,
|
||||||
@ -15,9 +15,9 @@ pub struct Model {
|
|||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl Model {
|
impl Model {
|
||||||
#[new]
|
#[new]
|
||||||
fn new(_obj: &PyRawObject) -> PyResult<Self> {
|
fn new(obj: &PyRawObject) -> PyResult<()> {
|
||||||
Err(exceptions::Exception::py_err(
|
Err(exceptions::Exception::py_err(
|
||||||
"Cannot create a Model directly",
|
"Cannot create a Model directly. Use a concrete subclass",
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -29,6 +29,10 @@ pub struct BPE {}
|
|||||||
|
|
||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl BPE {
|
impl BPE {
|
||||||
|
/// from_files(vocab, merges, /)
|
||||||
|
/// --
|
||||||
|
///
|
||||||
|
/// Instantiate a new BPE model using the provided vocab and merges files
|
||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
fn from_files(vocab: &str, merges: &str) -> PyResult<Model> {
|
fn from_files(vocab: &str, merges: &str) -> PyResult<Model> {
|
||||||
match tk::models::bpe::BPE::from_files(vocab, merges) {
|
match tk::models::bpe::BPE::from_files(vocab, merges) {
|
||||||
@ -44,6 +48,10 @@ impl BPE {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// empty()
|
||||||
|
/// --
|
||||||
|
///
|
||||||
|
/// Instantiate a new BPE model with empty vocab and merges
|
||||||
#[staticmethod]
|
#[staticmethod]
|
||||||
fn empty() -> Model {
|
fn empty() -> Model {
|
||||||
Model {
|
Model {
|
||||||
@ -51,3 +59,37 @@ impl BPE {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// WordPiece Model
|
||||||
|
#[pyclass]
|
||||||
|
pub struct WordPiece {}
|
||||||
|
|
||||||
|
#[pymethods]
|
||||||
|
impl WordPiece {
|
||||||
|
/// from_files(vocab, /)
|
||||||
|
/// --
|
||||||
|
///
|
||||||
|
/// Instantiate a new WordPiece model using the provided vocabulary file
|
||||||
|
#[staticmethod]
|
||||||
|
fn from_files(vocab: &str) -> PyResult<Model> {
|
||||||
|
// TODO: Parse kwargs for these
|
||||||
|
let unk_token = String::from("[UNK]");
|
||||||
|
let max_input_chars_per_word = Some(100);
|
||||||
|
|
||||||
|
match tk::models::wordpiece::WordPiece::from_files(
|
||||||
|
vocab,
|
||||||
|
unk_token,
|
||||||
|
max_input_chars_per_word,
|
||||||
|
) {
|
||||||
|
Err(e) => {
|
||||||
|
println!("Errors: {:?}", e);
|
||||||
|
Err(exceptions::Exception::py_err(
|
||||||
|
"Error while initializing WordPiece",
|
||||||
|
))
|
||||||
|
}
|
||||||
|
Ok(wordpiece) => Ok(Model {
|
||||||
|
model: Container::Owned(Box::new(wordpiece)),
|
||||||
|
}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Reference in New Issue
Block a user