Introduce WordLevel model for TransformerXL (#125)

* Added a lookup-table model that maps each string to its id in a vocab map.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* RustFmt

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Formatting.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix invalid void return on Rust side.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Python binding for LookupTable model

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Enable loading from Python's side.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Renamed LookupTable to WordLevel

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* RustFmt happy now.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* clippy happy now.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing mismatching names.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing mismatching names (one missing).

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
This commit is contained in:
Funtowicz Morgan
2020-02-05 16:51:35 +00:00
committed by GitHub
parent 9770be5661
commit 8200112e9b
6 changed files with 244 additions and 2 deletions

View File

@@ -28,6 +28,7 @@ fn models(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<models::Model>()?;
m.add_class::<models::BPE>()?;
m.add_class::<models::WordPiece>()?;
m.add_class::<models::WordLevel>()?;
Ok(())
}

View File

@@ -153,3 +153,40 @@ impl WordPiece {
}
}
}
/// Python-facing entry point for the `WordLevel` model.
///
/// The struct holds no state of its own; it only exposes the static
/// constructor below, which hands back a generic `Model`.
#[pyclass]
pub struct WordLevel {}

#[pymethods]
impl WordLevel {
    /// Instantiate a `WordLevel` model from the vocabulary file at `vocab`.
    ///
    /// Recognised keyword arguments:
    ///
    /// * `unk_token` - the unknown token used by the model; defaults to `"<unk>"`.
    ///
    /// Any other keyword argument is reported on stdout and otherwise ignored.
    ///
    /// # Errors
    ///
    /// Propagates the extraction error when a keyword name or the `unk_token`
    /// value cannot be converted, and raises a Python `Exception` carrying the
    /// underlying failure when the model cannot be initialized.
    #[staticmethod]
    #[args(kwargs = "**")]
    fn from_files(vocab: &str, kwargs: Option<&PyDict>) -> PyResult<Model> {
        let mut unk_token = String::from("<unk>");

        if let Some(kwargs) = kwargs {
            for (key, val) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "unk_token" => unk_token = val.extract()?,
                    // Deliberately best-effort: unknown options are not fatal.
                    _ => println!("Ignored unknown kwargs option {}", key),
                }
            }
        }

        tk::models::wordlevel::WordLevel::from_files(vocab, unk_token)
            .map(|model| Model {
                model: Container::Owned(Box::new(model)),
            })
            // Surface the root cause in the exception itself instead of printing
            // it to stdout, where the Python caller cannot catch or log it.
            .map_err(|e| {
                exceptions::Exception::py_err(format!(
                    "Error while initializing WordLevel: {:?}",
                    e
                ))
            })
    }
}

View File

@@ -3,3 +3,4 @@ from .. import models
# Re-export the model classes from the `models` module under this package's namespace.
Model = models.Model
BPE = models.BPE
WordPiece = models.WordPiece
WordLevel = models.WordLevel

View File

@@ -17,7 +17,7 @@ class Model:
pass
class BPE:
class BPE(Model):
""" BytePairEncoding model class """
@staticmethod
@@ -62,7 +62,7 @@ class BPE:
pass
class WordPiece:
class WordPiece(Model):
""" WordPiece model class """
@staticmethod
@@ -87,3 +87,22 @@ class WordPiece:
def empty() -> Model:
""" Instantiate an empty WordPiece Model. """
pass
class WordLevel(Model):
    """ WordLevel model class

    The simplest tokenizer model: each token is looked up in a vocabulary and
    mapped to its corresponding id.
    """

    @staticmethod
    def from_files(vocab: str, unk_token: str = "<unk>") -> Model:
        """ Instantiate a WordLevel Model from the given vocab file.

        Args:
            vocab: str:
                Path to a vocabulary file.

            unk_token: str (defaults to "<unk>"):
                The unknown token to be used by the model. The binding reads
                this option from keyword arguments, so pass it by name.
        """
        pass