mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
[pre_tokenizers
] Fix sentencepiece based Metaspace (#1357)
* nits * allow for legacy beahaviour without making any breaking changes * add a todo * set to legacy by default * skip legacy serialization * push correct update * lint * add deserialization test * add a python test as well * updates * fix serialization tests * nits * python stylijng of the tests * better tests * fix offsets * fix imports * fmt * update metaspace * remove TODO * use enm * fix some tses * nits * use enum * update tests * syling * remove impl from for PrependScheme * use simple getters and setters * lint * update tests * add test new == new_with_prepend_scheme * revert a change * use setters and getterts * Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> * nits * use copy rather than ref * nits format * more nits * allow option string * enforce First Never Always camel cased * nits * refactor * update test as well * fmt * nits * properly error out * Update bindings/python/src/pre_tokenizers.rs Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com> * suggestion changes --------- Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
@ -11,7 +11,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
|
||||
use tk::pre_tokenizers::byte_level::ByteLevel;
|
||||
use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
|
||||
use tk::pre_tokenizers::digits::Digits;
|
||||
use tk::pre_tokenizers::metaspace::Metaspace;
|
||||
use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
|
||||
use tk::pre_tokenizers::punctuation::Punctuation;
|
||||
use tk::pre_tokenizers::split::Split;
|
||||
use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
|
||||
@ -452,6 +452,21 @@ impl PySequence {
|
||||
}
|
||||
}
|
||||
|
||||
fn from_string(string: String) -> Result<PrependScheme, PyErr> {
|
||||
let scheme = match string.as_str() {
|
||||
"first" => PrependScheme::First,
|
||||
"never" => PrependScheme::Never,
|
||||
"always" => PrependScheme::Always,
|
||||
_ => {
|
||||
return Err(exceptions::PyValueError::new_err(format!(
|
||||
"{} is an unknown variant, should be one of ['first', 'never', 'always']",
|
||||
string
|
||||
)));
|
||||
}
|
||||
};
|
||||
Ok(scheme)
|
||||
}
|
||||
|
||||
/// Metaspace pre-tokenizer
|
||||
///
|
||||
/// This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
@ -489,17 +504,44 @@ impl PyMetaspace {
|
||||
setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
|
||||
}
|
||||
|
||||
#[getter]
|
||||
fn get_prepend_scheme(self_: PyRef<Self>) -> String {
|
||||
// Assuming Metaspace has a method to get the prepend_scheme as a string
|
||||
let scheme: PrependScheme = getter!(self_, Metaspace, get_prepend_scheme());
|
||||
match scheme {
|
||||
PrependScheme::First => "first",
|
||||
PrependScheme::Never => "never",
|
||||
PrependScheme::Always => "always",
|
||||
}
|
||||
.to_string()
|
||||
}
|
||||
|
||||
#[setter]
|
||||
fn set_prepend_scheme(self_: PyRef<Self>, prepend_scheme: String) -> PyResult<()> {
|
||||
let scheme = from_string(prepend_scheme)?;
|
||||
setter!(self_, Metaspace, @set_prepend_scheme, scheme);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[new]
|
||||
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
|
||||
#[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, prepend_scheme=None, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
|
||||
fn new(
|
||||
replacement: PyChar,
|
||||
add_prefix_space: bool,
|
||||
prepend_scheme: Option<String>,
|
||||
_kwargs: Option<&PyDict>,
|
||||
) -> (Self, PyPreTokenizer) {
|
||||
(
|
||||
PyMetaspace {},
|
||||
Metaspace::new(replacement.0, add_prefix_space).into(),
|
||||
)
|
||||
) -> PyResult<(Self, PyPreTokenizer)> {
|
||||
// Create a new Metaspace instance
|
||||
let mut new_instance: Metaspace = Metaspace::new(replacement.0, add_prefix_space);
|
||||
|
||||
// If a prepend scheme is provided, set it
|
||||
if let Some(prepend_scheme) = prepend_scheme {
|
||||
match from_string(prepend_scheme) {
|
||||
Ok(prepend_scheme_enum) => new_instance.set_prepend_scheme(prepend_scheme_enum),
|
||||
Err(err) => return Err(err),
|
||||
}
|
||||
}
|
||||
Ok((PyMetaspace {}, new_instance.into()))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -110,6 +110,8 @@ class TestMetaspace:
|
||||
assert pretok.replacement == "%"
|
||||
pretok.add_prefix_space = True
|
||||
assert pretok.add_prefix_space == True
|
||||
pretok.prepend_scheme = "never"
|
||||
assert pretok.prepend_scheme == "never"
|
||||
|
||||
|
||||
class TestCharDelimiterSplit:
|
||||
|
Reference in New Issue
Block a user