mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-07 21:28:19 +00:00
Add the Metaspace PreTokenizer
This commit is contained in:
@@ -39,6 +39,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
|||||||
m.add_class::<pre_tokenizers::ByteLevel>()?;
|
m.add_class::<pre_tokenizers::ByteLevel>()?;
|
||||||
m.add_class::<pre_tokenizers::Whitespace>()?;
|
m.add_class::<pre_tokenizers::Whitespace>()?;
|
||||||
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
|
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
|
||||||
|
m.add_class::<pre_tokenizers::Metaspace>()?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ extern crate tokenizers as tk;
|
|||||||
|
|
||||||
use super::error::{PyError, ToPyResult};
|
use super::error::{PyError, ToPyResult};
|
||||||
use super::utils::Container;
|
use super::utils::Container;
|
||||||
|
use pyo3::exceptions;
|
||||||
use pyo3::prelude::*;
|
use pyo3::prelude::*;
|
||||||
use pyo3::types::*;
|
use pyo3::types::*;
|
||||||
use tk::tokenizer::{Offsets, Result};
|
use tk::tokenizer::{Offsets, Result};
|
||||||
@@ -84,6 +85,41 @@ impl BertPreTokenizer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[pyclass]
|
||||||
|
pub struct Metaspace {}
|
||||||
|
#[pymethods]
|
||||||
|
impl Metaspace {
|
||||||
|
#[staticmethod]
|
||||||
|
#[args(kwargs = "**")]
|
||||||
|
fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
|
||||||
|
let mut replacement = '▁';
|
||||||
|
let mut add_prefix_space = true;
|
||||||
|
|
||||||
|
if let Some(kwargs) = kwargs {
|
||||||
|
for (key, value) in kwargs {
|
||||||
|
let key: &str = key.extract()?;
|
||||||
|
match key {
|
||||||
|
"replacement" => {
|
||||||
|
let s: &str = value.extract()?;
|
||||||
|
replacement = s.chars().nth(0).ok_or(exceptions::Exception::py_err(
|
||||||
|
"replacement must be a character",
|
||||||
|
))?;
|
||||||
|
}
|
||||||
|
"add_prefix_space" => add_prefix_space = value.extract()?,
|
||||||
|
_ => println!("Ignored unknown kwarg option {}", key),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(PreTokenizer {
|
||||||
|
pretok: Container::Owned(Box::new(tk::pre_tokenizers::metaspace::Metaspace::new(
|
||||||
|
replacement,
|
||||||
|
add_prefix_space,
|
||||||
|
))),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Attempt at providing Python the ability to give its own PreTokenizer
|
/// Attempt at providing Python the ability to give its own PreTokenizer
|
||||||
struct PyPreTokenizer {
|
struct PyPreTokenizer {
|
||||||
class: PyObject,
|
class: PyObject,
|
||||||
|
|||||||
@@ -4,3 +4,4 @@ PreTokenizer = pre_tokenizers.PreTokenizer
|
|||||||
ByteLevel = pre_tokenizers.ByteLevel
|
ByteLevel = pre_tokenizers.ByteLevel
|
||||||
Whitespace = pre_tokenizers.Whitespace
|
Whitespace = pre_tokenizers.Whitespace
|
||||||
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
||||||
|
Metaspace = pre_tokenizers.Metaspace
|
||||||
|
|||||||
@@ -26,8 +26,8 @@ class ByteLevel:
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
add_prefix_space: (`optional`) boolean:
|
add_prefix_space: (`optional`) boolean:
|
||||||
Whether a space should be added at the very beginning of the sequence
|
Whether to add a space to the first word if there isn't already one. This
|
||||||
if there isn't one already.
|
lets us treat `hello` exactly like `say hello`.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
PreTokenizer
|
PreTokenizer
|
||||||
@@ -66,3 +66,26 @@ class BertPreTokenizer:
|
|||||||
def new() -> PreTokenizer:
|
def new() -> PreTokenizer:
|
||||||
""" Instantiate a new BertPreTokenizer """
|
""" Instantiate a new BertPreTokenizer """
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class Metaspace:
    """ Metaspace pre-tokenizer

    Every whitespace character of the input is swapped for the replacement
    character, and the text is split so that each word begins with it.
    """

    @staticmethod
    def new(replacement: str = "▁", add_prefix_space: bool = True) -> PreTokenizer:
        """ Build a new Metaspace pre-tokenizer

        Args:
            replacement: str:
                The character written in place of whitespace. It must be exactly
                one character long. Defaults to the `▁` (U+2581) meta symbol, the
                one SentencePiece uses.

            add_prefix_space: boolean:
                When set, a space is added in front of the first word unless one
                is already there, so that `hello` is handled exactly like
                `say hello`.

        Returns:
            PreTokenizer
        """
        pass
|
||||||
|
|||||||
83
tokenizers/src/pre_tokenizers/metaspace.rs
Normal file
83
tokenizers/src/pre_tokenizers/metaspace.rs
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
use crate::tokenizer::{Offsets, PreTokenizer, Result};
|
||||||
|
|
||||||
|
/// Pre-tokenizer that rewrites whitespace as a visible replacement character
/// and splits the input at each of them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Metaspace {
    /// Character written in place of every whitespace character.
    replacement: char,
    /// Whether a leading space is prepended when the input does not start with one.
    add_prefix_space: bool,
}

impl Metaspace {
    /// Creates a pre-tokenizer using `replacement` as the whitespace stand-in.
    ///
    /// `add_prefix_space` controls whether a space is prepended to inputs that
    /// do not already start with one.
    pub fn new(replacement: char, add_prefix_space: bool) -> Self {
        Self {
            replacement,
            add_prefix_space,
        }
    }
}
|
||||||
|
|
||||||
|
impl Default for Metaspace {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new('▁', true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PreTokenizer for Metaspace {
|
||||||
|
fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
|
||||||
|
let s = if self.add_prefix_space && !s.starts_with(' ') {
|
||||||
|
format!(" {}", s)
|
||||||
|
} else {
|
||||||
|
s.to_owned()
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut words = vec![];
|
||||||
|
let mut word = Vec::with_capacity(1000);
|
||||||
|
let mut offset = 0;
|
||||||
|
s.chars().for_each(|c| {
|
||||||
|
if c.is_whitespace() {
|
||||||
|
if !word.is_empty() {
|
||||||
|
let offsets = (offset - word.len(), offset);
|
||||||
|
words.push((word.drain(0..).collect::<String>(), offsets));
|
||||||
|
}
|
||||||
|
word.push(self.replacement)
|
||||||
|
} else {
|
||||||
|
word.push(c);
|
||||||
|
}
|
||||||
|
offset += 1;
|
||||||
|
});
|
||||||
|
if !word.is_empty() {
|
||||||
|
let offsets = (offset - word.len(), offset);
|
||||||
|
words.push((word.drain(0..).collect::<String>(), offsets));
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(words)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A single space between two words: each word starts with the replacement
    /// character, and offsets account for the prepended prefix space.
    #[test]
    fn basic() {
        let pretok = Metaspace::default();
        let res = pretok.pre_tokenize("Hey friend!").unwrap();
        assert_eq!(
            &res,
            &[("▁Hey".into(), (0, 4)), ("▁friend!".into(), (4, 12)),]
        );
    }

    /// Consecutive spaces: every space opens its own word, so the two extra
    /// spaces each produce a word made of the replacement character alone.
    #[test]
    fn multiple_spaces() {
        let pretok = Metaspace::default();
        // Three spaces between the words, as required by the expected offsets below.
        let res = pretok.pre_tokenize("Hey   friend!").unwrap();
        assert_eq!(
            &res,
            &[
                ("▁Hey".into(), (0, 4)),
                ("▁".into(), (4, 5)),
                ("▁".into(), (5, 6)),
                ("▁friend!".into(), (6, 14)),
            ]
        );
    }
}
|
||||||
@@ -1,3 +1,4 @@
|
|||||||
pub mod bert;
|
pub mod bert;
|
||||||
pub mod byte_level;
|
pub mod byte_level;
|
||||||
|
pub mod metaspace;
|
||||||
pub mod whitespace;
|
pub mod whitespace;
|
||||||
|
|||||||
Reference in New Issue
Block a user