Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
remove enforcement of non special when adding tokens (#1521)
* remove enforcement of non special when adding tokens
* mut no longer needed
* add a small test
* nit
* style
* audit
* ignore cargo audit's own vulnerability
* update
* revert
* remove CVE
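In short, `Tokenizer.add_tokens` in the Python bindings no longer forces `special=False` on `AddedToken` objects passed to it; the flag set by the caller is preserved. A minimal sketch of the user-facing behaviour, mirroring the `test_decode_special` test added below (it uses only the public `Tokenizer`, `BPE` and `AddedToken` API):

from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())

# Before this change, add_tokens overrode the flag and stored every token
# with special=False; now the caller's choice is kept.
tokenizer.add_tokens([
    AddedToken("my", special=True),
    AddedToken("name", special=False),
    "is",
    "john",
])

# The special flag controls whether a token is dropped during decoding.
print(tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False))  # my name is john
print(tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True))   # name is john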
.github/workflows/python.yml (3 additions)

@@ -95,6 +95,9 @@ jobs:
           command: clippy
           args: --manifest-path ./bindings/python/Cargo.toml --all-targets --all-features -- -D warnings
 
+      - name: Install cargo-audit
+        run: cargo install cargo-audit
+
       - name: Run Audit
         uses: actions-rs/cargo@v1
         with:
.github/workflows/rust.yml (3 additions)

@@ -81,6 +81,9 @@ jobs:
           command: test
           args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc
 
+      - name: Install cargo-audit
+        run: cargo install cargo-audit
+
       - name: Run Audit
         uses: actions-rs/cargo@v1
         with:
@@ -1151,8 +1151,7 @@ impl PyTokenizer {
             .map(|token| {
                 if let Ok(content) = token.extract::<String>() {
                     Ok(PyAddedToken::from(content, Some(false)).get_token())
-                } else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
-                    token.special = false;
+                } else if let Ok(token) = token.extract::<PyRefMut<PyAddedToken>>() {
                     Ok(token.get_token())
                 } else {
                     Err(exceptions::PyTypeError::new_err(
@@ -535,3 +535,15 @@ class TestTokenizer:
             "▁▁▁▁▁▁",
             "▁.",
         ]
+
+    def test_decode_special(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_tokens([AddedToken("my", special=True), AddedToken("name", special=False), "is", "john", "pair"])
+
+        # Can decode single sequences
+        output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False)
+        assert output == "my name is john"
+
+        output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True)
+        assert output == "name is john"
+        assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)