Python - encode & encode_batch with add_special_tokens

Anthony MOI
2020-03-10 16:21:10 -04:00
parent 9e3d69389d
commit 257360acec
4 changed files with 50 additions and 14 deletions

View File

@@ -7,6 +7,8 @@ a high number of files as it avoids having too many progress bars on screen.
   avoids the unintuitive inclusion of the whitespaces in the produced offsets, even if these
   whitespaces are part of the actual token.
   It has been added to `ByteLevelBPETokenizer` but it is off by default (`trim_offsets=False`).
+- `encode` and `encode_batch` now take a new optional argument, specifying whether we should add the
+  special tokens. This is enabled by default.
 
 ## Fixes:
 - Fix some issues with the offsets being wrong with the `ByteLevel` BPE:
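For illustration, a minimal sketch of the new argument from Python (assuming a `BertWordPieceTokenizer` loaded from a placeholder `vocab.txt`; the exact tokens depend on the vocabulary):

from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer("vocab.txt")  # placeholder vocab path

# Default: special tokens are added around the sequence.
print(tokenizer.encode("Hello world").tokens)
# e.g. ['[CLS]', 'hello', 'world', '[SEP]']

# Opt out with the new argument.
print(tokenizer.encode("Hello world", add_special_tokens=False).tokens)
# e.g. ['hello', 'world']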

View File

@@ -159,20 +159,34 @@ impl Tokenizer {
         self.tokenizer.with_padding(None);
     }
 
-    fn encode(&self, sentence: &str, pair: Option<&str>) -> PyResult<Encoding> {
+    #[args(add_special_tokens = true)]
+    fn encode(
+        &self,
+        sentence: &str,
+        pair: Option<&str>,
+        add_special_tokens: bool,
+    ) -> PyResult<Encoding> {
         ToPyResult(
             self.tokenizer
-                .encode(if let Some(pair) = pair {
-                    tk::tokenizer::EncodeInput::Dual(sentence.to_owned(), pair.to_owned())
-                } else {
-                    tk::tokenizer::EncodeInput::Single(sentence.to_owned())
-                })
+                .encode(
+                    if let Some(pair) = pair {
+                        tk::tokenizer::EncodeInput::Dual(sentence.to_owned(), pair.to_owned())
+                    } else {
+                        tk::tokenizer::EncodeInput::Single(sentence.to_owned())
+                    },
+                    add_special_tokens,
+                )
                 .map(Encoding::new),
         )
         .into()
     }
 
-    fn encode_batch(&self, sentences: &PyList) -> PyResult<Vec<Encoding>> {
+    #[args(add_special_tokens = true)]
+    fn encode_batch(
+        &self,
+        sentences: &PyList,
+        add_special_tokens: bool,
+    ) -> PyResult<Vec<Encoding>> {
         let inputs = sentences
             .into_iter()
             .map(|item| {
@@ -190,7 +204,7 @@ impl Tokenizer {
         ToPyResult(
             self.tokenizer
-                .encode_batch(inputs)
+                .encode_batch(inputs, add_special_tokens)
                 .map(|encodings| encodings.into_iter().map(Encoding::new).collect()),
         )
         .into()
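From Python, the pyo3 `#[args(add_special_tokens = true)]` attribute keeps the new parameter optional with a default of `True`, so existing calls are unaffected. A hedged sketch of the three call shapes (assuming `tokenizer` is an already-loaded `tokenizers.Tokenizer`):

single = tokenizer.encode("Hello world")                    # EncodeInput::Single
dual = tokenizer.encode("Hello world", "How are you?")      # EncodeInput::Dual
plain = tokenizer.encode("Hello world", add_special_tokens=False)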

View File

@@ -244,7 +244,9 @@ class Tokenizer:
     def no_padding(self):
         """ Disable padding """
         pass
-    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
+    def encode(
+        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
+    ) -> Encoding:
         """ Encode the given sequence
 
         Args:
@@ -254,11 +256,16 @@ class Tokenizer:
             pair: (`optional`) Optional[str]:
                 The optional pair sequence
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding
+
         Returns:
             An Encoding
         """
         pass
-    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
+    def encode_batch(
+        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+    ) -> List[Encoding]:
         """ Encode the given sequences or pair of sequences
 
         Args:
@@ -266,6 +273,9 @@ class Tokenizer:
             A list of sequences or pair of sequences. The list can contain both
                 at the same time.
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding
+
         Returns:
             A list of Encoding
         """

View File

@@ -125,7 +125,9 @@ class BaseTokenizer:
         """
         return self._tokenizer.add_special_tokens(special_tokens)
 
-    def encode(self, sequence: str, pair: Optional[str] = None) -> Encoding:
+    def encode(
+        self, sequence: str, pair: Optional[str] = None, add_special_tokens: bool = True
+    ) -> Encoding:
         """ Encode the given sequence
 
         Args:
@@ -135,12 +137,17 @@ class BaseTokenizer:
            pair: (`optional`) Optional[str]:
                The optional pair sequence
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding.
+
         Returns:
             An Encoding
         """
-        return self._tokenizer.encode(sequence, pair)
+        return self._tokenizer.encode(sequence, pair, add_special_tokens)
 
-    def encode_batch(self, sequences: List[Union[str, Tuple[str, str]]]) -> List[Encoding]:
+    def encode_batch(
+        self, sequences: List[Union[str, Tuple[str, str]]], add_special_tokens: bool = True
+    ) -> List[Encoding]:
         """ Encode the given sequences or pair of sequences
 
         Args:
@@ -148,10 +155,13 @@ class BaseTokenizer:
             A list of sequences or pair of sequences. The list can contain both
                 at the same time.
 
+            add_special_tokens: bool:
+                Whether to add the special tokens while encoding.
+
         Returns:
             A list of Encoding
         """
-        return self._tokenizer.encode_batch(sequences)
+        return self._tokenizer.encode_batch(sequences, add_special_tokens)
 
     def decode(self, ids: List[int], skip_special_tokens: Optional[bool] = True) -> str:
         """ Decode the given list of ids to a string sequence