Introduce WordLevel model for TransformerXL (#125)
* Added lookup table model mapping string to id present in a vocab map.
* RustFmt
* Formatting.
* Fix invalid void return on Rust side.
* Python binding for LookupTable model
* Enable loading from Python's side.
* Renamed LookupTable to WordLevel
* RustFmt happy now.
* clippy happy now.
* Addressing mismatching names.
* Addressing mismatching names (one missing).

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
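For orientation before the diff, a minimal sketch of how the new model is meant to be used from the Python side once this change lands. The vocab path and the unk_token value are placeholders, not part of this commit:

from tokenizers import models

# Hypothetical usage: load a word-level vocabulary file and build the model.
# "vocab.json" is a placeholder path; the Rust binding falls back to "<unk>"
# when no unk_token keyword is given.
word_level = models.WordLevel.from_files("vocab.json", unk_token="<unk>")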
@@ -28,6 +28,7 @@ fn models(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<models::Model>()?;
     m.add_class::<models::BPE>()?;
     m.add_class::<models::WordPiece>()?;
+    m.add_class::<models::WordLevel>()?;
     Ok(())
 }
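After this registration, the new class is exposed next to the existing models in the Python extension. A quick, illustrative sanity check:

from tokenizers import models

# The class registered via m.add_class::<models::WordLevel>() above shows up
# alongside Model, BPE and WordPiece in the models submodule.
assert hasattr(models, "WordLevel")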
@@ -153,3 +153,40 @@ impl WordPiece {
         }
     }
 }
+
+#[pyclass]
+pub struct WordLevel {}
+
+#[pymethods]
+impl WordLevel {
+    #[staticmethod]
+    #[args(kwargs = "**")]
+    fn from_files(vocab: &str, kwargs: Option<&PyDict>) -> PyResult<Model> {
+        let mut unk_token = String::from("<unk>");
+
+        if let Some(kwargs) = kwargs {
+            for (key, val) in kwargs {
+                let key: &str = key.extract()?;
+                match key {
+                    "unk_token" => unk_token = val.extract()?,
+                    _ => println!("Ignored unknown kwargs option {}", key),
+                }
+            }
+        }
+
+        match tk::models::wordlevel::WordLevel::from_files(
+            vocab,
+            unk_token,
+        ) {
+            Err(e) => {
+                println!("Errors: {:?}", e);
+                Err(exceptions::Exception::py_err(
+                    "Error while initializing WordLevel",
+                ))
+            }
+            Ok(model) => Ok(Model {
+                model: Container::Owned(Box::new(model)),
+            }),
+        }
+    }
+}
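The binding above takes unk_token as an optional keyword and falls back to "<unk>"; any other keyword is not an error but is ignored with a printed warning. A small sketch of the three paths, with placeholder file names:

from tokenizers import models

# Explicit unknown token.
model = models.WordLevel.from_files("vocab.json", unk_token="[UNK]")

# No kwargs: the Rust side falls back to "<unk>".
model_default = models.WordLevel.from_files("vocab.json")

# An unrecognized option is ignored; the binding prints
# "Ignored unknown kwargs option ..." and continues.
model_noisy = models.WordLevel.from_files("vocab.json", lowercase=True)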
@@ -3,3 +3,4 @@ from .. import models
 Model = models.Model
 BPE = models.BPE
 WordPiece = models.WordPiece
+WordLevel = models.WordLevel
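With the re-export in place, and assuming this __init__ belongs to the tokenizers.models package (as the relative import suggests), the class can be imported directly:

# Both spellings resolve to the same class after the re-export above.
from tokenizers.models import WordLevel
from tokenizers import models

assert WordLevel is models.WordLevel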
@@ -17,7 +17,7 @@ class Model:
         pass


-class BPE:
+class BPE(Model):
     """ BytePairEncoding model class """

     @staticmethod
@@ -62,7 +62,7 @@ class BPE:
         pass


-class WordPiece:
+class WordPiece(Model):
     """ WordPiece model class """

     @staticmethod
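These two hunks only touch the type stubs: BPE and WordPiece are now declared as subclasses of Model, matching the new WordLevel stub below. A small illustration of what that buys for annotations (not part of the commit):

from tokenizers import models

# With BPE, WordPiece and WordLevel all declared as subclasses of Model in the
# stubs, a helper annotated against the base accepts any concrete model.
def describe(model: models.Model) -> str:
    return type(model).__name__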
@@ -87,3 +87,22 @@ class WordPiece:
     def empty() -> Model:
         """ Instantiate an empty WordPiece Model. """
         pass
+
+
+class WordLevel(Model):
+    """
+    The simplest tokenizer model: it maps each token from a vocab file to its corresponding id.
+    """
+
+    @staticmethod
+    def from_files(vocab: str, unk_token: str) -> Model:
+        """ Instantiate a WordLevel Model from the given vocab file.
+
+        Args:
+            vocab: str:
+                Path to a vocabulary file.
+
+            unk_token: str:
+                The unknown token to be used by the model.
+        """
+        pass
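The stub above only requires a vocabulary file path. A rough end-to-end sketch, assuming the file is a JSON object mapping tokens to ids; the exact on-disk format is defined by the Rust WordLevel implementation, not by this stub:

import json
from tokenizers import models

# Assumed vocab format: a JSON mapping of token -> id, including the unknown token.
vocab = {"<unk>": 0, "hello": 1, "world": 2}
with open("word_level_vocab.json", "w") as f:
    json.dump(vocab, f)

model = models.WordLevel.from_files("word_level_vocab.json", unk_token="<unk>")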