mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-16 17:18:43 +00:00
Add the Metaspace PreTokenizer
This commit is contained in:
@@ -39,6 +39,7 @@ fn pre_tokenizers(_py: Python, m: &PyModule) -> PyResult<()> {
|
||||
m.add_class::<pre_tokenizers::ByteLevel>()?;
|
||||
m.add_class::<pre_tokenizers::Whitespace>()?;
|
||||
m.add_class::<pre_tokenizers::BertPreTokenizer>()?;
|
||||
m.add_class::<pre_tokenizers::Metaspace>()?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ extern crate tokenizers as tk;
|
||||
|
||||
use super::error::{PyError, ToPyResult};
|
||||
use super::utils::Container;
|
||||
use pyo3::exceptions;
|
||||
use pyo3::prelude::*;
|
||||
use pyo3::types::*;
|
||||
use tk::tokenizer::{Offsets, Result};
|
||||
@@ -84,6 +85,41 @@ impl BertPreTokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
#[pyclass]
|
||||
pub struct Metaspace {}
|
||||
#[pymethods]
|
||||
impl Metaspace {
|
||||
#[staticmethod]
|
||||
#[args(kwargs = "**")]
|
||||
fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
|
||||
let mut replacement = '▁';
|
||||
let mut add_prefix_space = true;
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
for (key, value) in kwargs {
|
||||
let key: &str = key.extract()?;
|
||||
match key {
|
||||
"replacement" => {
|
||||
let s: &str = value.extract()?;
|
||||
replacement = s.chars().nth(0).ok_or(exceptions::Exception::py_err(
|
||||
"replacement must be a character",
|
||||
))?;
|
||||
}
|
||||
"add_prefix_space" => add_prefix_space = value.extract()?,
|
||||
_ => println!("Ignored unknown kwarg option {}", key),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(PreTokenizer {
|
||||
pretok: Container::Owned(Box::new(tk::pre_tokenizers::metaspace::Metaspace::new(
|
||||
replacement,
|
||||
add_prefix_space,
|
||||
))),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Attempt at providing Python the ability to give its own PreTokenizer
|
||||
struct PyPreTokenizer {
|
||||
class: PyObject,
|
||||
|
||||
@@ -4,3 +4,4 @@ PreTokenizer = pre_tokenizers.PreTokenizer
|
||||
ByteLevel = pre_tokenizers.ByteLevel
|
||||
Whitespace = pre_tokenizers.Whitespace
|
||||
BertPreTokenizer = pre_tokenizers.BertPreTokenizer
|
||||
Metaspace = pre_tokenizers.Metaspace
|
||||
|
||||
@@ -26,8 +26,8 @@ class ByteLevel:
|
||||
|
||||
Args:
|
||||
add_prefix_space: (`optional`) boolean:
|
||||
Whether a space should be added at the very beginning of the sequence
|
||||
if there isn't one already.
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
|
||||
Returns:
|
||||
PreTokenizer
|
||||
@@ -66,3 +66,26 @@ class BertPreTokenizer:
|
||||
def new() -> PreTokenizer:
|
||||
""" Instantiate a new BertPreTokenizer """
|
||||
pass
|
||||
|
||||
class Metaspace:
    """ Metaspace pre-tokenizer

    Replaces every whitespace character with the configured replacement
    character, then attempts to split on those replacement characters.
    """

    @staticmethod
    def new(replacement: str = "▁", add_prefix_space: bool = True) -> PreTokenizer:
        """ Instantiate a new Metaspace

        Args:
            replacement: str:
                The replacement character. Must be exactly one character. By default we
                use the `▁` (U+2581) meta symbol (Same as in SentencePiece).

            add_prefix_space: boolean:
                Whether to add a space to the first word if there isn't already one. This
                lets us treat `hello` exactly like `say hello`.
        """
        pass
|
||||
|
||||
Reference in New Issue
Block a user