From f2ec3b239b0a7a9866b01ec5cbd4d44243a40a16 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:53:47 +0200 Subject: [PATCH] remove enforcement of non special when adding tokens (#1521) * remove enforcement of non special when adding tokens * mut no longer needed * add a small test * nit * style * audit * ignore cargo audit's own vulnerability * update * revert * remove CVE --- .github/workflows/python.yml | 3 +++ .github/workflows/rust.yml | 3 +++ bindings/python/src/tokenizer.rs | 3 +-- bindings/python/tests/bindings/test_tokenizer.py | 12 ++++++++++++ 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 675bfeff..529d892d 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -95,6 +95,9 @@ jobs: command: clippy args: --manifest-path ./bindings/python/Cargo.toml --all-targets --all-features -- -D warnings + - name: Install cargo-audit + run: cargo install cargo-audit + - name: Run Audit uses: actions-rs/cargo@v1 with: diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 8640c91d..1cc3ef7a 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -81,6 +81,9 @@ jobs: command: test args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc + - name: Install cargo-audit + run: cargo install cargo-audit + - name: Run Audit uses: actions-rs/cargo@v1 with: diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index c32619b5..1c6bc9cc 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -1151,8 +1151,7 @@ impl PyTokenizer { .map(|token| { if let Ok(content) = token.extract::<String>() { Ok(PyAddedToken::from(content, Some(false)).get_token()) - } else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() { - token.special = false; + } else if let Ok(token) = token.extract::<PyRefMut<PyAddedToken>>() { Ok(token.get_token()) } else { 
Err(exceptions::PyTypeError::new_err( diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py index 3d04960e..3ac50e00 100644 --- a/bindings/python/tests/bindings/test_tokenizer.py +++ b/bindings/python/tests/bindings/test_tokenizer.py @@ -535,3 +535,15 @@ class TestTokenizer: "▁▁▁▁▁▁", "▁.", ] + + def test_decode_special(self): + tokenizer = Tokenizer(BPE()) + tokenizer.add_tokens([AddedToken("my", special=True), AddedToken("name", special=False), "is", "john", "pair"]) + + # Can decode single sequences + output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False) + assert output == "my name is john" + + output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True) + assert output == "name is john" + assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)