Encode special tokens (#1437)

* add doc in the code

* add option to skip special tokens

* nits

* add api dummy for now

* Fmt.

* Fix fmt.

* Fix the stub.

* add a test

* add a test in python

* style it

* nits

* add getter and setters

* stub

* update python test

* fmt

* last nit

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
This commit is contained in:
Arthur
2024-01-19 12:43:43 +01:00
committed by GitHub
parent 888dd4bc65
commit 6a77d4859b
5 changed files with 173 additions and 0 deletions

View File

@ -836,6 +836,18 @@ class Tokenizer:
Returns:
A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch
"""
pass
@property
def encode_special_tokens(self):
"""
Modifies the tokenizer in order to use or not the special tokens
during encoding.
Args:
value (:obj:`bool`):
Whether to use the special tokens or not
"""
pass
@staticmethod

View File

@ -1109,6 +1109,25 @@ impl PyTokenizer {
self.tokenizer.id_to_token(id)
}
/// Modifies the tokenizer in order to use or not the special tokens
/// during encoding.
///
/// Args:
/// value (:obj:`bool`):
/// Whether to use the special tokens or not
///
#[setter]
fn set_encode_special_tokens(&mut self, value: bool) {
self.tokenizer.set_encode_special_tokens(value);
}
/// Get the value of the `encode_special_tokens` attribute
///
/// Returns:
/// :obj:`bool`: the tokenizer's encode_special_tokens attribute
#[getter]
fn get_encode_special_tokens(&self) -> bool {
self.tokenizer.get_encode_special_tokens()
}
/// Add the given tokens to the vocabulary
///
/// The given tokens are added only if they don't already exist in the vocabulary.

View File

@ -457,3 +457,34 @@ class TestTokenizer:
output = tokenizer.encode("A sentence 🤗")
assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]
def test_encode_special_tokens(self):
tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_tokens(["<eot>"])
tokenizer.add_special_tokens(["<end_of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]
tokenizer.encode_special_tokens = True
assert tokenizer.encode_special_tokens == True
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == [
"▁Hey",
"▁there",
"<",
"end",
"_",
"of",
"_",
"text",
">",
"▁dear",
"<eot>",
"▁friend",
"!",
]
tokenizer.add_tokens(["of_text>"])
output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]