remove enforcement of non special when adding tokens (#1521)

* remove enforcement of non special when adding tokens

* mut no longer needed

* add a small test

* nit

* style

* audit

* ignore cargo audit's own vulnerability

* update

* revert

* remove CVE
This commit is contained in:
Arthur
2024-04-30 15:53:47 +02:00
committed by GitHub
parent 71c2a8d01a
commit f2ec3b239b
4 changed files with 19 additions and 2 deletions

View File

@ -1151,8 +1151,7 @@ impl PyTokenizer {
.map(|token| {
if let Ok(content) = token.extract::<String>() {
Ok(PyAddedToken::from(content, Some(false)).get_token())
} else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
token.special = false;
} else if let Ok(token) = token.extract::<PyRefMut<PyAddedToken>>() {
Ok(token.get_token())
} else {
Err(exceptions::PyTypeError::new_err(

View File

@ -535,3 +535,15 @@ class TestTokenizer:
"▁▁▁▁▁▁",
"▁.",
]
def test_decode_special(self):
    """Check that `skip_special_tokens` is honored by `decode` for added tokens.

    Tokens added via `add_tokens` with `AddedToken(..., special=True)` should be
    dropped from the decoded string when `skip_special_tokens=True`, and kept
    when it is False. Also verifies the `special` flag round-trips through
    `get_added_tokens_decoder` (this is what #1521 stopped clobbering).
    """
    tokenizer = Tokenizer(BPE())
    # ids 0..4 in insertion order: "my" (special), "name", "is", "john", "pair".
    tokenizer.add_tokens([AddedToken("my", special=True), AddedToken("name", special=False), "is", "john", "pair"])
    # Can decode single sequences
    output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False)
    assert output == "my name is john"
    # The special token "my" (id 0) must be omitted here.
    output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True)
    assert output == "name is john"
    # The added token must retain special=True — add_tokens must not force it to False.
    assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)