Making the regex in ByteLevel optional. (#939)

* Making the regex in ByteLevel optional. * Changed the stub. * Better stub. * Typo fix. * Remove bad comments.
2025-12-03 11:18:29 +00:00 · 2022-03-18 09:03:20 +01:00
parent cdabef14c4
commit daa4dd2288
4 changed files with 84 additions and 7 deletions
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/init.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/init.pyi
@@ -102,7 +102,7 @@ class ByteLevel(PreTokenizer):
            lets us treat `hello` exactly like `say hello`.
    """

-    def __init__(self, add_prefix_space=True):
+    def __init__(self, add_prefix_space=True, use_regex=True):
        pass
    @staticmethod
    def alphabet():
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -229,7 +229,7 @@ macro_rules! setter {
 ///         Whether to add a space to the first word if there isn't already one. This
 ///         lets us treat `hello` exactly like `say hello`.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name=ByteLevel)]
-#[text_signature = "(self, add_prefix_space=True)"]
+#[text_signature = "(self, add_prefix_space=True, use_regex=True)"]
 pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
@@ -243,13 +243,28 @@ impl PyByteLevel {
        setter!(self_, ByteLevel, add_prefix_space, add_prefix_space);
    }

+    #[getter]
+    fn get_use_regex(self_: PyRef<Self>) -> bool {
+        getter!(self_, ByteLevel, use_regex)
+    }
+
+    #[setter]
+    fn set_use_regex(self_: PyRef<Self>, use_regex: bool) {
+        setter!(self_, ByteLevel, use_regex, use_regex);
+    }
+
    #[new]
-    #[args(add_prefix_space = "true", _kwargs = "**")]
-    fn new(add_prefix_space: bool, _kwargs: Option<&PyDict>) -> (Self, PyPreTokenizer) {
+    #[args(add_prefix_space = "true", use_regex = "true", _kwargs = "**")]
+    fn new(
+        add_prefix_space: bool,
+        use_regex: bool,
+        _kwargs: Option<&PyDict>,
+    ) -> (Self, PyPreTokenizer) {
        (
            PyByteLevel {},
            ByteLevel::default()
                .add_prefix_space(add_prefix_space)
+                .use_regex(use_regex)
                .into(),
        )
    }