mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 12:48:18 +00:00
Add the Metaspace PreTokenizer
This commit is contained in:
@@ -39,6 +39,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<pre_tokenizers::ByteLevel>()?;
|
||||
m.add_class::<pre_tokenizers::Whitespace>()?;
|
||||
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
|
||||
m.add_class::<pre_tokenizers::Metaspace>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ extern crate tokenizers as tk;
|
||||
|
||||
use super::error::{PyError, ToPyResult};
|
||||
use super::utils::Container;
|
||||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::*;
|
||||
use tk::tokenizer::{Offsets, Result};
|
||||
@@ -84,6 +85,41 @@ impl BertPreTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Python-facing `Metaspace` class.
///
/// Holds no state of its own: its `new` static method returns a generic
/// `PreTokenizer` wrapping `tk::pre_tokenizers::metaspace::Metaspace`.
#[pyclass]
pub struct Metaspace {}
|
||||
#[pymethods]
|
||||
impl Metaspace {
|
||||
#[staticmethod]
|
||||
#[args(kwargs = "**")]
|
||||
fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
|
||||
let mut replacement = '▁';
|
||||
let mut add_prefix_space = true;
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
for (key, value) in kwargs {
|
||||
let key: &str = key.extract()?;
|
||||
match key {
|
||||
"replacement" => {
|
||||
let s: &str = value.extract()?;
|
||||
replacement = s.chars().nth(0).ok_or(exceptions::Exception::py_err(
|
||||
"replacement must be a character",
|
||||
))?;
|
||||
}
|
||||
"add_prefix_space" => add_prefix_space = value.extract()?,
|
||||
_ => println!("Ignored unknown kwarg option {}", key),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(PreTokenizer {
|
||||
pretok: Container::Owned(Box::new(tk::pre_tokenizers::metaspace::Metaspace::new(
|
||||
replacement,
|
||||
add_prefix_space,
|
||||
))),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt at providing Python the ability to give its own PreTokenizer
|
||||
struct PyPreTokenizer {
|
||||
class: PyObject,
|
||||
|
||||
@@ -4,3 +4,4 @@ PreTokenizer = pre_tokenizers.PreTokenizer
|
||||
# Re-export the native pre-tokenizer classes at package level so users can
# write e.g. `pre_tokenizers.Metaspace` instead of reaching into the extension.
ByteLevel = pre_tokenizers.ByteLevel
Whitespace = pre_tokenizers.Whitespace
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
Metaspace = pre_tokenizers.Metaspace
|
||||
|
||||
@@ -26,8 +26,8 @@ class ByteLevel:
|
||||
|
||||
Args:
|
||||
add_prefix_space: (`optional`) boolean:
|
||||
Whether a space should be added at the very beginning of the sequence
|
||||
if there isn't one already.
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
|
||||
Returns:
|
||||
PreTokenizer
|
||||
@@ -66,3 +66,26 @@ class BertPreTokenizer:
|
||||
def new() -> PreTokenizer:
|
||||
""" Instantiate a new BertPreTokenizer """
|
||||
pass
|
||||
|
||||
class Metaspace:
    """ Metaspace pre-tokenizer

    This pre-tokenizer replaces any whitespace by the provided replacement character.
    It then tries to split on these spaces.
    """

    @staticmethod
    def new(replacement: str="▁",
            add_prefix_space: bool=True) -> PreTokenizer:
        """ Instantiate a new Metaspace

        Args:
            replacement: str:
                The replacement character. Must be exactly one character. By default we
                use the `▁` (U+2581) meta symbol (Same as in SentencePiece).

            add_prefix_space: boolean:
                Whether to add a space to the first word if there isn't already one. This
                lets us treat `hello` exactly like `say hello`.

        Returns:
            PreTokenizer: a pre-tokenizer backed by the native Metaspace implementation.
        """
        pass
|
||||
83
tokenizers/src/pre_tokenizers/metaspace.rs
Normal file
83
tokenizers/src/pre_tokenizers/metaspace.rs
Normal file
@@ -0,0 +1,83 @@
|
||||
use crate::tokenizer::{Offsets, PreTokenizer, Result};
|
||||
|
||||
/// Pre-tokenizer that replaces every whitespace character with `replacement`
/// and splits the input into words, each word carrying the marker at its start
/// (same scheme as SentencePiece's `▁`).
pub struct Metaspace {
    // Character substituted for each whitespace (conventionally `▁`, U+2581).
    replacement: char,
    // When true, a space is prepended to inputs that do not start with one so
    // the very first word also receives the marker.
    add_prefix_space: bool,
}
|
||||
|
||||
impl Metaspace {
|
||||
pub fn new(replacement: char, add_prefix_space: bool) -> Self {
|
||||
Self {
|
||||
replacement,
|
||||
add_prefix_space,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Metaspace {
|
||||
fn default() -> Self {
|
||||
Self::new('▁', true)
|
||||
}
|
||||
}
|
||||
|
||||
impl PreTokenizer for Metaspace {
|
||||
fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
|
||||
let s = if self.add_prefix_space && !s.starts_with(' ') {
|
||||
format!(" {}", s)
|
||||
} else {
|
||||
s.to_owned()
|
||||
};
|
||||
|
||||
let mut words = vec![];
|
||||
let mut word = Vec::with_capacity(1000);
|
||||
let mut offset = 0;
|
||||
s.chars().for_each(|c| {
|
||||
if c.is_whitespace() {
|
||||
if !word.is_empty() {
|
||||
let offsets = (offset - word.len(), offset);
|
||||
words.push((word.drain(0..).collect::<String>(), offsets));
|
||||
}
|
||||
word.push(self.replacement)
|
||||
} else {
|
||||
word.push(c);
|
||||
}
|
||||
offset += 1;
|
||||
});
|
||||
if !word.is_empty() {
|
||||
let offsets = (offset - word.len(), offset);
|
||||
words.push((word.drain(0..).collect::<String>(), offsets));
|
||||
}
|
||||
|
||||
Ok(words)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Single spaces: each word gets the marker, offsets count chars of " Hey friend!".
    #[test]
    fn basic() {
        let pretok = Metaspace::default();
        let output = pretok.pre_tokenize("Hey friend!").unwrap();
        let expected = [("▁Hey".into(), (0, 4)), ("▁friend!".into(), (4, 12))];
        assert_eq!(output, expected);
    }

    // Consecutive spaces: every extra space becomes a standalone marker token.
    #[test]
    fn multiple_spaces() {
        let pretok = Metaspace::default();
        let output = pretok.pre_tokenize("Hey  friend!").unwrap();
        let expected = [
            ("▁Hey".into(), (0, 4)),
            ("▁".into(), (4, 5)),
            ("▁".into(), (5, 6)),
            ("▁friend!".into(), (6, 14)),
        ];
        assert_eq!(output, expected);
    }
}
|
||||
@@ -1,3 +1,4 @@
|
||||
// Built-in pre-tokenizer implementations, one module per strategy.
pub mod bert;
pub mod byte_level;
pub mod metaspace;
pub mod whitespace;
|
||||
|
||||
Reference in New Issue
Block a user