mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-07 13:18:31 +00:00
Expose num_added_tokens on Python side (#146)
* Expose num_added_tokens on the Python side without requiring an Encoding to be passed to added_tokens. This allows computing the max sentence length for single/pair inputs without actually needing an Encoding structure. As the number of added tokens is fixed and static during compilation, this allows more flexible usage of the method. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co> * Renamed num_added_tokens to num_special_tokens_to_add. Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
This commit is contained in:
@@ -8,6 +8,13 @@ pub struct PostProcessor {
|
||||
pub processor: Container<dyn tk::tokenizer::PostProcessor + Sync>,
|
||||
}
|
||||
|
||||
#[pymethods]
|
||||
impl PostProcessor {
|
||||
fn num_special_tokens_to_add(&self, is_pair: bool) -> usize {
|
||||
self.processor.execute(|p| p.added_tokens(is_pair))
|
||||
}
|
||||
}
|
||||
|
||||
/// Python-exposed BERT post-processing wrapper; `extends` makes it a
/// subclass of `PostProcessor` on the Python side. The struct itself
/// holds no state — the underlying processor lives in the base class.
#[pyclass(extends=PostProcessor)]
pub struct BertProcessing {}
|
||||
#[pymethods]
|
||||
|
||||
@@ -38,6 +38,13 @@ impl Tokenizer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Number of special tokens that would be added for a single sentence
/// (`is_pair == false`) or a sentence pair (`is_pair == true`).
fn num_special_tokens_to_add(&self, is_pair: bool) -> PyResult<usize> {
    // With no post-processor configured, no special tokens are added.
    let added = match self.tokenizer.get_post_processor() {
        Some(processor) => processor.as_ref().added_tokens(is_pair),
        None => 0,
    };
    Ok(added)
}
|
||||
|
||||
#[args(kwargs = "**")]
|
||||
fn get_vocab_size(&self, kwargs: Option<&PyDict>) -> PyResult<usize> {
|
||||
let mut with_added_tokens = true;
|
||||
|
||||
@@ -201,6 +201,14 @@ class Tokenizer:
|
||||
def normalizer(self, normalizer: normalizers.Normalizer):
|
||||
""" Change the normalizer to use with this Tokenizer """
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
    """Return how many special tokens would be added to the input.

    :param is_pair: True when the input is a sentence pair, False for a
        single sentence.
    :return: The number of special tokens the post-processor would add.
    """
    pass
|
||||
|
||||
|
||||
def get_vocab_size(self, with_added_tokens: Optional[bool]) -> int:
|
||||
""" Returns the size of the vocabulary
|
||||
|
||||
@@ -13,6 +13,14 @@ class BaseTokenizer:
|
||||
self._tokenizer.get_vocab_size(),
|
||||
', '.join(k + '=' + str(v) for k, v in self._parameters.items()))
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
    """Return the number of special tokens that would be added for
    single/pair sentences.

    :param is_pair: Boolean indicating if the input would be a single
        sentence or a pair.
    :return: The number of special tokens the post-processor would add.
    """
    # Fix: the binding was renamed from `num_added_tokens` to
    # `num_special_tokens_to_add` (see the Rust-side rename in this same
    # change); calling the old name would raise AttributeError.
    return self._tokenizer.num_special_tokens_to_add(is_pair)
|
||||
|
||||
def get_vocab_size(self, with_added_tokens: bool = True):
|
||||
""" Return the size of vocabulary, with or without added tokens.
|
||||
|
||||
|
||||
@@ -7,6 +7,14 @@ class PostProcessor:
|
||||
a PostProcessor will return an instance of this class when instantiated.
|
||||
"""
|
||||
|
||||
def num_special_tokens_to_add(self, is_pair: bool) -> int:
    """Return how many special tokens this post-processor would add.

    :param is_pair: True when the input is a sentence pair, False for a
        single sentence.
    :return: The number of special tokens that would be inserted.
    """
    pass
|
||||
|
||||
class BertProcessing(PostProcessor):
|
||||
""" BertProcessing
|
||||
|
||||
|
||||
Reference in New Issue
Block a user