Add the Metaspace PreTokenizer

This commit is contained in:
Anthony MOI
2020-01-07 12:59:59 -05:00
parent 49a67824ce
commit eaa23ac8e6
6 changed files with 147 additions and 2 deletions

View File

@@ -39,6 +39,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<pre_tokenizers::ByteLevel>()?;
m.add_class::<pre_tokenizers::Whitespace>()?;
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
m.add_class::<pre_tokenizers::Metaspace>()?;
Ok(())
}

View File

@@ -2,6 +2,7 @@ extern crate tokenizers as tk;
use super::error::{PyError, ToPyResult};
use super::utils::Container;
use pyo3::exceptions;
use pyo3::prelude::*;
use pyo3::types::*;
use tk::tokenizer::{Offsets, Result};
@@ -84,6 +85,41 @@ impl BertPreTokenizer {
}
}
/// Python-facing entry point for the Metaspace pre-tokenizer.
#[pyclass]
pub struct Metaspace {}

#[pymethods]
impl Metaspace {
    /// Build a `PreTokenizer` backed by `tk::pre_tokenizers::metaspace::Metaspace`.
    ///
    /// Recognised keyword arguments:
    /// * `replacement` - string whose first character becomes the replacement
    ///   character (defaults to `▁`); an empty string raises an exception.
    /// * `add_prefix_space` - boolean forwarded to the pre-tokenizer
    ///   (defaults to `true`).
    ///
    /// Any other keyword is reported on stdout and otherwise ignored.
    #[staticmethod]
    #[args(kwargs = "**")]
    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
        let mut replacement = '▁';
        let mut add_prefix_space = true;

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "replacement" => {
                        let s: &str = value.extract()?;
                        // Build the exception lazily: it is only needed when
                        // the provided string is empty.
                        replacement = s.chars().next().ok_or_else(|| {
                            exceptions::Exception::py_err("replacement must be a character")
                        })?;
                    }
                    "add_prefix_space" => add_prefix_space = value.extract()?,
                    _ => println!("Ignored unknown kwarg option {}", key),
                }
            }
        }

        Ok(PreTokenizer {
            pretok: Container::Owned(Box::new(tk::pre_tokenizers::metaspace::Metaspace::new(
                replacement,
                add_prefix_space,
            ))),
        })
    }
}
/// Attempt at providing Python the ability to give its own PreTokenizer
struct PyPreTokenizer {
class: PyObject,

View File

@@ -4,3 +4,4 @@ PreTokenizer = pre_tokenizers.PreTokenizer
# Short aliases for the pre-tokenizer classes exposed by `pre_tokenizers`.
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace

View File

@@ -26,8 +26,8 @@ class ByteLevel:
Args:
add_prefix_space: (`optional`) boolean:
Whether a space should be added at the very beginning of the sequence
if there isn't one already.
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
Returns:
PreTokenizer
@@ -66,3 +66,26 @@ class BertPreTokenizer:
def new() -> PreTokenizer:
""" Instantiate a new BertPreTokenizer """
pass
class Metaspace:
    """ Metaspace pre-tokenizer

    This pre-tokenizer replaces any whitespace by the provided replacement character.
    It then tries to split on these spaces.
    """

    @staticmethod
    def new(replacement: str="▁",
            add_prefix_space: bool=True) -> PreTokenizer:
        """ Instantiate a new Metaspace

        Args:
            replacement: str:
                The replacement character. Must be exactly one character. By default we
                use the `▁` (U+2581) meta symbol (Same as in SentencePiece).

            add_prefix_space: boolean:
                Whether to add a space to the first word if there isn't already one. This
                lets us treat `hello` exactly like `say hello`.

        Returns:
            PreTokenizer
        """
        pass

View File

@@ -0,0 +1,83 @@
use crate::tokenizer::{Offsets, PreTokenizer, Result};
/// Pre-tokenizer that writes a visible `replacement` marker in place of each
/// whitespace character and splits the input at those markers.
pub struct Metaspace {
    /// Character emitted in place of every whitespace character.
    replacement: char,
    /// When `true`, inputs that do not start with a space get one prepended
    /// before splitting.
    add_prefix_space: bool,
}

impl Metaspace {
    /// Creates a `Metaspace` using the given replacement character and
    /// prefix-space setting.
    pub fn new(replacement: char, add_prefix_space: bool) -> Self {
        Metaspace {
            add_prefix_space,
            replacement,
        }
    }
}

impl Default for Metaspace {
    /// Uses `▁` (U+2581) as the replacement and enables the prefix space.
    fn default() -> Metaspace {
        Metaspace {
            replacement: '▁',
            add_prefix_space: true,
        }
    }
}
impl PreTokenizer for Metaspace {
    /// Splits `s` into words that each begin with the replacement character.
    ///
    /// Every whitespace character starts a new word and is itself rewritten
    /// to `self.replacement`, so a run of whitespace yields one
    /// single-character word per extra whitespace character. When
    /// `add_prefix_space` is set and `s` does not start with `' '`, a space
    /// is processed ahead of the input.
    ///
    /// Offsets are `(start, end)` positions counted in characters (not
    /// bytes) over the processed sequence, i.e. including the prepended
    /// space when one is added. This implementation always returns `Ok`.
    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
        // Feed the optional prefix through the same loop instead of copying
        // the whole input into a fresh `String` just to prepend one char.
        let prefix = if self.add_prefix_space && !s.starts_with(' ') {
            Some(' ')
        } else {
            None
        };

        let mut words = vec![];
        let mut word = String::new();
        let mut word_start: usize = 0;
        let mut offset: usize = 0;

        for c in prefix.into_iter().chain(s.chars()) {
            let is_boundary = c.is_whitespace();
            if is_boundary && !word.is_empty() {
                // A whitespace character closes the word collected so far.
                let finished = std::mem::replace(&mut word, String::new());
                words.push((finished, (word_start, offset)));
            }
            if word.is_empty() {
                // First character of a new word: remember where it begins.
                word_start = offset;
            }
            word.push(if is_boundary { self.replacement } else { c });
            offset += 1;
        }

        if !word.is_empty() {
            words.push((word, (word_start, offset)));
        }

        Ok(words)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A single space between words: the prefix space and the inner space
    /// both become `▁` and start a word.
    #[test]
    fn basic() {
        let pretok = Metaspace::default();
        let res = pretok.pre_tokenize("Hey friend!").unwrap();
        assert_eq!(
            &res,
            &[("▁Hey".into(), (0, 4)), ("▁friend!".into(), (4, 12)),]
        );
    }

    /// Three consecutive spaces: the first two each produce a standalone `▁`
    /// word, the third one is attached to the following word.
    #[test]
    fn multiple_spaces() {
        let pretok = Metaspace::default();
        let res = pretok.pre_tokenize("Hey   friend!").unwrap();
        assert_eq!(
            &res,
            &[
                ("▁Hey".into(), (0, 4)),
                ("▁".into(), (4, 5)),
                ("▁".into(), (5, 6)),
                ("▁friend!".into(), (6, 14)),
            ]
        );
    }
}

View File

@@ -1,3 +1,4 @@
pub mod bert;
pub mod byte_level;
pub mod metaspace;
pub mod whitespace;