Add the Metaspace PreTokenizer

This commit is contained in:
Anthony MOI
2020-01-07 12:59:59 -05:00
parent 49a67824ce
commit eaa23ac8e6
6 changed files with 147 additions and 2 deletions

View File

@@ -39,6 +39,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<pre_tokenizers::ByteLevel>()?;
m.add_class::<pre_tokenizers::Whitespace>()?;
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
m.add_class::<pre_tokenizers::Metaspace>()?;
Ok(())
}

View File

@@ -2,6 +2,7 @@ extern crate tokenizers as tk;
use super::error::{PyError, ToPyResult};
use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use tk::tokenizer::{Offsets, Result};
@@ -84,6 +85,41 @@ impl BertPreTokenizer {
}
}
/// Python-facing entry point for the `Metaspace` pre-tokenizer.
///
/// The struct itself carries no state; it only exposes the static `new`
/// constructor, which returns a `PreTokenizer`.
#[pyclass]
pub struct Metaspace {}

#[pymethods]
impl Metaspace {
    /// Build a `PreTokenizer` wrapping `tk::pre_tokenizers::metaspace::Metaspace`.
    ///
    /// Recognized keyword arguments:
    /// - `replacement`: a string made of exactly one character (defaults to `▁`).
    /// - `add_prefix_space`: a boolean (defaults to `true`).
    ///
    /// Any other keyword is reported on stdout and ignored.
    ///
    /// # Errors
    /// Returns a Python exception when `replacement` is not exactly one
    /// character long, or when a key or value cannot be extracted to the
    /// expected type.
    #[staticmethod]
    #[args(kwargs = "**")]
    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
        let mut replacement = '▁';
        let mut add_prefix_space = true;

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "replacement" => {
                        let s: &str = value.extract()?;
                        let mut chars = s.chars();
                        // Accept the value only when it holds one char and nothing
                        // after it: an empty string has no usable char, and a longer
                        // string would otherwise be silently truncated to its first char.
                        replacement = match (chars.next(), chars.next()) {
                            (Some(c), None) => c,
                            _ => {
                                return Err(exceptions::Exception::py_err(
                                    "replacement must be a character",
                                ))
                            }
                        };
                    }
                    "add_prefix_space" => add_prefix_space = value.extract()?,
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        Ok(PreTokenizer {
            pretok: Container::Owned(Box::new(tk::pre_tokenizers::metaspace::Metaspace::new(
                replacement,
                add_prefix_space,
            ))),
        })
    }
}
/// Attempt at providing Python the ability to give its own PreTokenizer
struct PyPreTokenizer {
class: PyObject,

View File

@@ -4,3 +4,4 @@ PreTokenizer = pre_tokenizers.PreTokenizer
# Module-level aliases so each pre-tokenizer class can be referenced directly by name.
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace

View File

@@ -26,8 +26,8 @@ class ByteLevel:
Args:
add_prefix_space: (`optional`) boolean:
Whether a space should be added at the very beginning of the sequence
if there isn't one already.
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
Returns:
PreTokenizer
@@ -66,3 +66,26 @@ class BertPreTokenizer:
def new() -> PreTokenizer:
    """ Instantiate a new BertPreTokenizer

    Takes no arguments.

    Returns:
        PreTokenizer
    """
    pass
class Metaspace:
    """ Metaspace pre-tokenizer

    This pre-tokenizer replaces any whitespace by the provided replacement character.
    It then tries to split on these spaces.
    """

    @staticmethod
    def new(replacement: str="▁",
            add_prefix_space: bool=True) -> PreTokenizer:
        """ Instantiate a new Metaspace

        Args:
            replacement: str:
                The replacement character. Must be exactly one character. By default we
                use the `▁` (U+2581) meta symbol (Same as in SentencePiece).

            add_prefix_space: boolean:
                Whether to add a space to the first word if there isn't already one. This
                lets us treat `hello` exactly like `say hello`.

        Returns:
            PreTokenizer
        """
        pass