mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
remove enforcement of non special when adding tokens (#1521)
* remove enforcement of non special when adding tokens
* mut no longer needed
* add a small test
* nit
* style
* audit
* ignore cargo audit's own vulnerability
* update
* revert
* remove CVE
This commit is contained in:
3
.github/workflows/python.yml
vendored
3
.github/workflows/python.yml
vendored
@ -95,6 +95,9 @@ jobs:
|
|||||||
command: clippy
|
command: clippy
|
||||||
args: --manifest-path ./bindings/python/Cargo.toml --all-targets --all-features -- -D warnings
|
args: --manifest-path ./bindings/python/Cargo.toml --all-targets --all-features -- -D warnings
|
||||||
|
|
||||||
|
- name: Install cargo-audit
|
||||||
|
run: cargo install cargo-audit
|
||||||
|
|
||||||
- name: Run Audit
|
- name: Run Audit
|
||||||
uses: actions-rs/cargo@v1
|
uses: actions-rs/cargo@v1
|
||||||
with:
|
with:
|
||||||
|
3
.github/workflows/rust.yml
vendored
3
.github/workflows/rust.yml
vendored
@ -81,6 +81,9 @@ jobs:
|
|||||||
command: test
|
command: test
|
||||||
args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc
|
args: --verbose --manifest-path ./tokenizers/Cargo.toml --doc
|
||||||
|
|
||||||
|
- name: Install cargo-audit
|
||||||
|
run: cargo install cargo-audit
|
||||||
|
|
||||||
- name: Run Audit
|
- name: Run Audit
|
||||||
uses: actions-rs/cargo@v1
|
uses: actions-rs/cargo@v1
|
||||||
with:
|
with:
|
||||||
|
@ -1151,8 +1151,7 @@ impl PyTokenizer {
|
|||||||
.map(|token| {
|
.map(|token| {
|
||||||
if let Ok(content) = token.extract::<String>() {
|
if let Ok(content) = token.extract::<String>() {
|
||||||
Ok(PyAddedToken::from(content, Some(false)).get_token())
|
Ok(PyAddedToken::from(content, Some(false)).get_token())
|
||||||
} else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
|
} else if let Ok(token) = token.extract::<PyRefMut<PyAddedToken>>() {
|
||||||
token.special = false;
|
|
||||||
Ok(token.get_token())
|
Ok(token.get_token())
|
||||||
} else {
|
} else {
|
||||||
Err(exceptions::PyTypeError::new_err(
|
Err(exceptions::PyTypeError::new_err(
|
||||||
|
@ -535,3 +535,15 @@ class TestTokenizer:
|
|||||||
"▁▁▁▁▁▁",
|
"▁▁▁▁▁▁",
|
||||||
"▁.",
|
"▁.",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def test_decode_special(self):
|
||||||
|
tokenizer = Tokenizer(BPE())
|
||||||
|
tokenizer.add_tokens([AddedToken("my", special=True), AddedToken("name", special=False), "is", "john", "pair"])
|
||||||
|
|
||||||
|
# Can decode single sequences
|
||||||
|
output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False)
|
||||||
|
assert output == "my name is john"
|
||||||
|
|
||||||
|
output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True)
|
||||||
|
assert output == "name is john"
|
||||||
|
assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)
|
||||||
|
Reference in New Issue
Block a user