mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
remove enforcement of non special when adding tokens (#1521)
* remove enforcement of non special when adding tokens
* mut no longer needed
* add a small test
* nit
* style
* audit
* ignore cargo audit's own vulnerability
* update
* revert
* remove CVE
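In practice, an `AddedToken`'s `special` flag now survives `add_tokens()` instead of being silently overwritten to `False`. A minimal before/after sketch, assuming the Python `tokenizers` API exercised by the test added below:

from tokenizers import Tokenizer, AddedToken
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
# Before this change, add_tokens() reset `special` to False on every
# AddedToken it received; now the caller's flag is preserved.
tokenizer.add_tokens([AddedToken("my", special=True), "name"])

# The preserved flag is what decode() consults:
print(tokenizer.decode([0, 1], skip_special_tokens=False))  # "my name"
print(tokenizer.decode([0, 1], skip_special_tokens=True))   # "name"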
@@ -1151,8 +1151,7 @@ impl PyTokenizer {
             .map(|token| {
                 if let Ok(content) = token.extract::<String>() {
                     Ok(PyAddedToken::from(content, Some(false)).get_token())
-                } else if let Ok(mut token) = token.extract::<PyRefMut<PyAddedToken>>() {
-                    token.special = false;
+                } else if let Ok(token) = token.extract::<PyRefMut<PyAddedToken>>() {
                     Ok(token.get_token())
                 } else {
                     Err(exceptions::PyTypeError::new_err(
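With the `token.special = false;` assignment gone, the extracted `PyRefMut<PyAddedToken>` is never mutated, so the `mut` binding is dropped as well (the "mut no longer needed" item in the commit message).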
@@ -535,3 +535,15 @@ class TestTokenizer:
             "▁▁▁▁▁▁",
             "▁.",
         ]
+
+    def test_decode_special(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.add_tokens([AddedToken("my", special=True), AddedToken("name", special=False), "is", "john", "pair"])
+
+        # Can decode single sequences
+        output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=False)
+        assert output == "my name is john"
+
+        output = tokenizer.decode([0, 1, 2, 3], skip_special_tokens=True)
+        assert output == "name is john"
+        assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)