[pre_tokenizers] Fix sentencepiece based Metaspace (#1357)

* nits

* allow for legacy behaviour without making any breaking changes

* add a todo

* set to legacy by default

* skip legacy serialization

* push correct update

* lint

* add deserialization test

* add a python test as well

* updates

* fix serialization tests

* nits

* python styling of the tests

* better tests

* fix offsets

* fix imports

* fmt

* update metaspace

* remove TODO

* use enum

* fix some tests

* nits

* use enum

* update tests

* styling

* remove impl From for PrependScheme

* use simple getters and setters

* lint

* update tests

* add test new == new_with_prepend_scheme

* revert a change

* use setters and getters

* Update bindings/python/src/pre_tokenizers.rs

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

* nits

* use copy rather than ref

* nits format

* more nits

* allow option string

* enforce First, Never, Always camel casing

* nits

* refactor

* update test as well

* fmt

* nits

* properly error out

* Update bindings/python/src/pre_tokenizers.rs

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

* apply suggested changes

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Author: Arthur
Date: 2023-11-14 18:05:07 +01:00
Committed by: GitHub
Parent: ee2af9e99a
Commit: f55822baea
5 changed files with 257 additions and 17 deletions
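For orientation, a minimal sketch of how the new option is used from the Python bindings. The parameter names follow the diff below; the `tokenizers.pre_tokenizers` import path comes from the tokenizers Python package in general, and the defaults shown are assumptions to check against a released version:

    from tokenizers.pre_tokenizers import Metaspace

    # `prepend_scheme` is the new keyword argument introduced by this commit.
    pretok = Metaspace(replacement="▁", add_prefix_space=True, prepend_scheme="first")

    # It is also exposed as a read/write property; only "first", "never" and
    # "always" are accepted, anything else raises a ValueError.
    pretok.prepend_scheme = "never"
    print(pretok.prepend_scheme)  # "never"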

bindings/python/src/pre_tokenizers.rs

@@ -11,7 +11,7 @@ use tk::pre_tokenizers::bert::BertPreTokenizer;
 use tk::pre_tokenizers::byte_level::ByteLevel;
 use tk::pre_tokenizers::delimiter::CharDelimiterSplit;
 use tk::pre_tokenizers::digits::Digits;
-use tk::pre_tokenizers::metaspace::Metaspace;
+use tk::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
 use tk::pre_tokenizers::punctuation::Punctuation;
 use tk::pre_tokenizers::split::Split;
 use tk::pre_tokenizers::unicode_scripts::UnicodeScripts;
@@ -452,6 +452,21 @@ impl PySequence {
     }
 }
 
+fn from_string(string: String) -> Result<PrependScheme, PyErr> {
+    let scheme = match string.as_str() {
+        "first" => PrependScheme::First,
+        "never" => PrependScheme::Never,
+        "always" => PrependScheme::Always,
+        _ => {
+            return Err(exceptions::PyValueError::new_err(format!(
+                "{} is an unknown variant, should be one of ['first', 'never', 'always']",
+                string
+            )));
+        }
+    };
+    Ok(scheme)
+}
+
 /// Metaspace pre-tokenizer
 ///
 /// This pre-tokenizer replaces any whitespace by the provided replacement character.
@@ -489,17 +504,44 @@ impl PyMetaspace {
         setter!(self_, Metaspace, add_prefix_space, add_prefix_space);
     }
 
+    #[getter]
+    fn get_prepend_scheme(self_: PyRef<Self>) -> String {
+        // Assuming Metaspace has a method to get the prepend_scheme as a string
+        let scheme: PrependScheme = getter!(self_, Metaspace, get_prepend_scheme());
+        match scheme {
+            PrependScheme::First => "first",
+            PrependScheme::Never => "never",
+            PrependScheme::Always => "always",
+        }
+        .to_string()
+    }
+
+    #[setter]
+    fn set_prepend_scheme(self_: PyRef<Self>, prepend_scheme: String) -> PyResult<()> {
+        let scheme = from_string(prepend_scheme)?;
+        setter!(self_, Metaspace, @set_prepend_scheme, scheme);
+        Ok(())
+    }
+
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
+    #[pyo3(signature = (replacement = PyChar('▁'), add_prefix_space = true, prepend_scheme=None, **_kwargs), text_signature = "(self, replacement=\"_\", add_prefix_space=True)")]
     fn new(
         replacement: PyChar,
         add_prefix_space: bool,
+        prepend_scheme: Option<String>,
         _kwargs: Option<&PyDict>,
-    ) -> (Self, PyPreTokenizer) {
-        (
-            PyMetaspace {},
-            Metaspace::new(replacement.0, add_prefix_space).into(),
-        )
+    ) -> PyResult<(Self, PyPreTokenizer)> {
+        // Create a new Metaspace instance
+        let mut new_instance: Metaspace = Metaspace::new(replacement.0, add_prefix_space);
+
+        // If a prepend scheme is provided, set it
+        if let Some(prepend_scheme) = prepend_scheme {
+            match from_string(prepend_scheme) {
+                Ok(prepend_scheme_enum) => new_instance.set_prepend_scheme(prepend_scheme_enum),
+                Err(err) => return Err(err),
+            }
+        }
+        Ok((PyMetaspace {}, new_instance.into()))
     }
 }

Python bindings tests

@@ -110,6 +110,8 @@ class TestMetaspace:
         assert pretok.replacement == "%"
         pretok.add_prefix_space = True
        assert pretok.add_prefix_space == True
+        pretok.prepend_scheme = "never"
+        assert pretok.prepend_scheme == "never"
 
 
 class TestCharDelimiterSplit:
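As a usage note, a hedged sketch of what the three schemes are expected to do on a plain string; the outputs are illustrative and not taken from this commit's tests. "first" and "always" only diverge when the input is already split into several pieces (e.g. around special tokens inside a full Tokenizer), so only "never" looks different here:

    from tokenizers.pre_tokenizers import Metaspace

    for scheme in ("always", "first", "never"):
        pretok = Metaspace(replacement="▁", add_prefix_space=True, prepend_scheme=scheme)
        # pre_tokenize_str returns a list of (piece, (start, end)) tuples.
        print(scheme, [piece for piece, _ in pretok.pre_tokenize_str("Hey friend")])

    # Illustrative expectation:
    #   always ['▁Hey', '▁friend']
    #   first  ['▁Hey', '▁friend']
    #   never  ['Hey', '▁friend']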