Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Encode special tokens (#1437)
* add doc in the code
* add option to skip special tokens
* nits
* add api dummy for now
* Fmt.
* Fix fmt.
* Fix the stub.
* add a test
* add a test in python
* style it
* nits
* add getter and setters
* stub
* update python test
* fmt
* last nit

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
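A minimal usage sketch of the option this commit adds, mirroring the test at the bottom of this diff (assumes the tokenizers Python package is installed and that Tokenizer.from_pretrained can fetch "t5-base"):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens(["<end_of_text>"])

# Default behaviour: the added special token is matched as a whole during encoding.
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # expected (per the test below): ["▁Hey", "▁there", "<end_of_text>"]

# With the new option enabled, special tokens are split and encoded like regular text.
tokenizer.encode_special_tokens = True
output = tokenizer.encode("Hey there<end_of_text>", add_special_tokens=False)
print(output.tokens)  # expected: ["▁Hey", "▁there", "<", "end", "_", "of", "_", "text", ">"]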
@@ -836,6 +836,18 @@ class Tokenizer:
        Returns:
            A :obj:`List` of :class:`~tokenizers.Encoding`: The encoded batch

        """
        pass

    @property
    def encode_special_tokens(self):
        """
        Modifies the tokenizer in order to use or not the special tokens
        during encoding.

        Args:
            value (:obj:`bool`):
                Whether to use the special tokens or not

        """
        pass

    @staticmethod
@@ -1109,6 +1109,25 @@ impl PyTokenizer {
        self.tokenizer.id_to_token(id)
    }

    /// Modifies the tokenizer in order to use or not the special tokens
    /// during encoding.
    ///
    /// Args:
    ///     value (:obj:`bool`):
    ///         Whether to use the special tokens or not
    ///
    #[setter]
    fn set_encode_special_tokens(&mut self, value: bool) {
        self.tokenizer.set_encode_special_tokens(value);
    }

    /// Get the value of the `encode_special_tokens` attribute
    ///
    /// Returns:
    ///     :obj:`bool`: the tokenizer's encode_special_tokens attribute
    #[getter]
    fn get_encode_special_tokens(&self) -> bool {
        self.tokenizer.get_encode_special_tokens()
    }

    /// Add the given tokens to the vocabulary
    ///
    /// The given tokens are added only if they don't already exist in the vocabulary.
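For reference, a hedged sketch of how the new setter/getter pair above surfaces on the Python side; the attribute is assumed to default to False, which this diff does not show:

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("t5-base")
print(tokenizer.encode_special_tokens)   # assumed default: False (not shown in this diff)
tokenizer.encode_special_tokens = True   # dispatches to set_encode_special_tokens
print(tokenizer.encode_special_tokens)   # True, read back through get_encode_special_tokens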
@@ -457,3 +457,34 @@ class TestTokenizer:
        output = tokenizer.encode("A sentence 🤗")
        assert output.ids == [1, 10, 2, 3, 4, 5, 10, 6, 7, 8, 9]
        assert output.tokens == ["A", " ", "sen", "te", "n", "ce", " ", "<0xF0>", "<0x9F>", "<0xA4>", "<0x97>"]

    def test_encode_special_tokens(self):
        tokenizer = Tokenizer.from_pretrained("t5-base")
        tokenizer.add_tokens(["<eot>"])
        tokenizer.add_special_tokens(["<end_of_text>"])
        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
        assert output.tokens == ["▁Hey", "▁there", "<end_of_text>", "▁dear", "<eot>", "▁friend", "!"]

        tokenizer.encode_special_tokens = True
        assert tokenizer.encode_special_tokens == True

        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
        assert output.tokens == [
            "▁Hey",
            "▁there",
            "<",
            "end",
            "_",
            "of",
            "_",
            "text",
            ">",
            "▁dear",
            "<eot>",
            "▁friend",
            "!",
        ]

        tokenizer.add_tokens(["of_text>"])
        output = tokenizer.encode("Hey there<end_of_text> dear<eot>friend!", add_special_tokens=False)
        assert output.tokens == ["▁Hey", "▁there", "<", "end", "_", "of_text>", "▁dear", "<eot>", "▁friend", "!"]