AddedVocabulary - Add tests, update bindings + various tweaks

2025-12-05 20:28:22 +00:00 · 2020-06-16 22:34:59 -04:00
parent c6f633eb1c
commit fc63d56eab
12 changed files with 326 additions and 91 deletions
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -29,7 +29,7 @@ impl AddedToken {
    #[new]
    #[args(kwargs = "**")]
    fn new(content: &str, is_special_token: bool, kwargs: Option<&PyDict>) -> PyResult<Self> {
-        let mut token = tk::tokenizer::AddedToken::from(content.to_owned(), is_special_token);
+        let mut token = tk::tokenizer::AddedToken::from(content, is_special_token);

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
--- a/bindings/python/tokenizers/init.pyi
+++ b/bindings/python/tokenizers/init.pyi
@@ -200,7 +200,13 @@ class AddedToken:
    """

    def __new__(
-        cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
+        cls,
+        content: str,
+        is_special_token: bool,
+        single_word: bool = False,
+        lstrip: bool = False,
+        rstrip: bool = False,
+        normalized: bool = True,
    ) -> AddedToken:
        """ Instantiate a new AddedToken

@@ -208,19 +214,30 @@ class AddedToken:
            content: str:
                The content of the token

+            is_special_token: bool:
+                Whether this token is a special token. This has an impact on the default value for
+                `normalized` which is False for special tokens, but True for others.
+
            single_word: bool
-                Whether this token should only match against single word. If True,
-                this token will never match inside of a word.
+                Whether this token should only match against single words. If True,
+                this token will never match inside of a word. For example the token `ing` would
+                match on `tokenizing` if this option is False, but not if this option is True.

            lstrip: bool
                Whether this token should strip all potential whitespaces on the left side.
-                If True, this token will greedily match any whitespace on the left and then strip
-                them out.
+                If True, this token will greedily match any whitespace on the left. For example,
+                if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
+                we will match on ` [MASK]`.

            rstrip: bool
                Whether this token should strip all potential whitespaces on the right side.
-                If True, this token will greedily match any whitespace on the right and then strip
-                them out.
+                If True, this token will greedily match any whitespace on the right. It works just
+                like lstrip, but on the right.
+
+            normalized: bool:
+                Whether this token should match the normalized version of the input text. For
+                example, with the added token `yesterday` and a normalizer in charge of lowercasing
+                the text, the token could be extracted from the input `I saw a lion Yesterday`.
        """
        pass