[BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder (#1513)
* [BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder. Passing them through causes issues with `ByteLevel`, which messes up `AddedTokens` containing characters in the utf-8 range used by the byte-level mapping. This commit tests the extent of the damage of ignoring the decoder for those tokens.
* Format.
* Installing cargo audit.
* Minor fix.
* Fixing "bug" in node/python.
* Autoformat.
* Clippy.
* Only prefix space when there's no decoder.
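For context, a minimal standalone sketch of the failure mode described in the first bullet (a toy function, not the actual `ByteLevel` decoder, and a hypothetical token content): byte-level decoding remaps certain code points back to raw bytes, e.g. `'Ġ'` back to a space, so an `AddedToken` whose content happens to contain one of those code points was corrupted whenever it was routed through the decoder.

```rust
// Toy stand-in for one entry of the byte-level decode mapping ('Ġ' -> ' ').
// The real ByteLevel decoder remaps a whole table of code points; one entry
// is enough to show the corruption.
fn byte_level_ish_decode(s: &str) -> String {
    s.replace('Ġ', " ")
}

fn main() {
    let added_token = "<ĠSPECIAL>"; // hypothetical AddedToken content
    // Before this change, added tokens went through the decoder and broke:
    assert_eq!(byte_level_ish_decode(added_token), "< SPECIAL>");
    // After this change, decode() splices added tokens into the output verbatim.
    println!("corrupted form: {}", byte_level_ish_decode(added_token));
}
```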
.github/workflows/python.yml (vendored, +6)

@@ -63,6 +63,12 @@ jobs:
           toolchain: stable
           components: rustfmt, clippy
 
+      - name: Install audit
+        uses: actions-rs/cargo@v1
+        with:
+          command: install
+          args: cargo-audit
+
       - name: Install Python
         uses: actions/setup-python@v4
         with:
.github/workflows/rust.yml (vendored, +6)

@@ -36,6 +36,12 @@ jobs:
           command: install
           args: cargo-readme
 
+      - name: Install audit
+        uses: actions-rs/cargo@v1
+        with:
+          command: install
+          args: cargo-audit
+
       - name: Build
         uses: actions-rs/cargo@v1
         with:
tokenizers/src/tokenizer/added_vocabulary.rs

@@ -216,6 +216,10 @@ impl AddedVocabulary {
     }
 
     /// Get the token matching the given id if it exists
+    #[deprecated(
+        since = "0.19.0",
+        note = "please use `added_vocabulary.simple_id_to_token(id).or_else(|| model.id_to_token(id)` instead"
+    )]
     pub fn id_to_token(&self, id: u32, model: &impl Model) -> Option<String> {
         self.added_tokens_map_r
             .get(&id)

@@ -223,6 +227,10 @@ impl AddedVocabulary {
             .or_else(|| model.id_to_token(id))
     }
 
+    pub fn simple_id_to_token(&self, id: u32) -> Option<String> {
+        self.added_tokens_map_r.get(&id).map(|t| t.content.clone())
+    }
+
     //
     pub fn set_encode_special_tokens(&mut self, value: bool) {
         self.encode_special_tokens = value;
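The deprecation note spells out the replacement pattern: look the id up in the added vocabulary first, then fall back to the model. A standalone sketch of that lookup order, using plain `HashMap`s in place of the real `AddedVocabulary` and `Model` types (ids and contents are made up):

```rust
use std::collections::HashMap;

fn main() {
    // Stand-ins for added_tokens_map_r and the model vocabulary.
    let added: HashMap<u32, String> = HashMap::from([(50257, "<my_added>".into())]);
    let model: HashMap<u32, String> = HashMap::from([(42, "hello".into())]);

    // Analogue of simple_id_to_token(id).or_else(|| model.id_to_token(id)):
    let id_to_token = |id: u32| -> Option<String> {
        added
            .get(&id)
            .cloned() // the added vocabulary wins...
            .or_else(|| model.get(&id).cloned()) // ...then the model is consulted
    };

    assert_eq!(id_to_token(50257).as_deref(), Some("<my_added>"));
    assert_eq!(id_to_token(42).as_deref(), Some("hello"));
    assert_eq!(id_to_token(7), None); // unknown ids stay None
}
```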
tokenizers/src/tokenizer/mod.rs

@@ -699,7 +699,9 @@ where
 
     /// Converts an id to the corresponding token.
     pub fn id_to_token(&self, id: u32) -> Option<String> {
-        self.added_vocabulary.id_to_token(id, &self.model)
+        self.added_vocabulary
+            .simple_id_to_token(id)
+            .or_else(|| self.model.id_to_token(id))
     }
 
     /// set the added bocab's splitting scheme
@@ -845,22 +847,35 @@ where
 
     /// Decode the given ids, back to a String
    pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
-        let tokens = ids
-            .iter()
-            .filter_map(|id| {
-                self.added_vocabulary
-                    .id_to_token(*id, &self.model)
-                    .filter(|token| {
-                        !skip_special_tokens || !self.added_vocabulary.is_special_token(token)
-                    })
-            })
-            .collect::<Vec<_>>();
-
-        if let Some(decoder) = &self.decoder {
-            decoder.decode(tokens)
-        } else {
-            Ok(tokens.join(" "))
-        }
+        let mut result = String::with_capacity(ids.len());
+        let mut chunks = Vec::with_capacity(ids.len());
+        for id in ids {
+            if let Some(added_token) = self.added_vocabulary.simple_id_to_token(*id) {
+                if skip_special_tokens && self.added_vocabulary.is_special_token(&added_token) {
+                    continue;
+                }
+                let text_chunk = if let Some(decoder) = &self.decoder {
+                    decoder.decode(chunks.clone())?
+                } else {
+                    chunks.join(" ")
+                };
+                result.push_str(&text_chunk);
+                if !result.is_empty() && self.decoder.is_none() {
+                    result.push(' ');
+                }
+                result.push_str(&added_token);
+                chunks.clear();
+            } else if let Some(token) = self.model.id_to_token(*id) {
+                chunks.push(token);
+            }
+        }
+        let text_chunk = if let Some(decoder) = &self.decoder {
+            decoder.decode(chunks.clone())?
+        } else {
+            chunks.join(" ")
+        };
+        result.push_str(&text_chunk);
+        Ok(result)
     }
 }
 
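To make the new control flow above easier to follow, here is a self-contained sketch of the same chunking scheme with toy vocabularies and a toy decoder (none of these are the tokenizers types): model tokens are buffered in `chunks` and flushed through the decoder as a group, while added tokens bypass the decoder entirely and are spliced into the result verbatim.

```rust
fn decode(ids: &[u32], added: &[(u32, &str)], model: &[(u32, &str)]) -> String {
    // Toy decoder: joins a chunk of model tokens, ByteLevel-ish ('Ġ' -> ' ').
    let decoder = |chunks: Vec<String>| chunks.concat().replace('Ġ', " ");
    let mut result = String::new();
    let mut chunks: Vec<String> = Vec::new();
    for id in ids {
        if let Some((_, tok)) = added.iter().find(|(i, _)| i == id) {
            // Flush the model tokens buffered so far through the decoder...
            result.push_str(&decoder(std::mem::take(&mut chunks)));
            // ...then splice the added token in untouched.
            result.push_str(tok);
        } else if let Some((_, tok)) = model.iter().find(|(i, _)| i == id) {
            chunks.push(tok.to_string());
        }
    }
    result.push_str(&decoder(chunks)); // flush the trailing chunk
    result
}

fn main() {
    let model = [(0, "Hello"), (1, "Ġworld")];
    let added = [(2, "<eos>")];
    // "<eos>" is never run through the decoder, so it survives intact.
    assert_eq!(decode(&[0, 1, 2], &added, &model), "Hello world<eos>");
}
```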