diff --git a/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py b/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
index 02e61706..4528dceb 100644
--- a/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
+++ b/bindings/python/py_src/tokenizers/implementations/base_tokenizer.py
@@ -42,6 +42,14 @@ class BaseTokenizer:
         """
         return self._tokenizer.get_vocab(with_added_tokens=with_added_tokens)
 
+    def get_added_tokens_decoder(self) -> Dict[int, AddedToken]:
+        """Returns the added reverse vocabulary
+
+        Returns:
+            The added vocabulary mapping ints to AddedTokens
+        """
+        return self._tokenizer.get_added_tokens_decoder()
+
     def get_vocab_size(self, with_added_tokens: bool = True) -> int:
         """Return the size of vocabulary, with or without added tokens.
 
diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs
index a8ae60c7..a491a036 100644
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@@ -662,6 +662,17 @@ impl PyTokenizer {
         self.tokenizer.get_vocab(with_added_tokens)
     }
 
+    /// Get the added tokens decoder
+    ///
+    /// Returns:
+    ///     :obj:`Dict[int, AddedToken]`: The added vocabulary mapping ints to AddedTokens
+    #[pyo3(signature = ())]
+    #[pyo3(text_signature = "(self)")]
+    fn get_added_tokens_decoder(&self) -> HashMap<u32, PyAddedToken> {
+        self.tokenizer.get_added_tokens_decoder().into_iter().map(|(key, value)| (key, value.into())).collect()
+    }
+
+
     /// Get the size of the underlying vocabulary
     ///
     /// Args:
diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index 5a60c29c..abfa8f90 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -373,6 +373,10 @@ class TestTokenizer:
         # Can retrieve vocab without added tokens
         vocab = tokenizer.get_vocab(with_added_tokens=False)
         assert vocab == {}
+
+        # Can retrieve added token decoder
+        vocab = tokenizer.get_added_tokens_decoder()
+        assert vocab == {0: AddedToken("my", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False), 1: AddedToken("name", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False), 2: AddedToken("is", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False), 3: AddedToken("john", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False), 4: AddedToken("pair", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False)}
 
     def test_get_vocab_size(self):
         tokenizer = Tokenizer(BPE())
diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs
index 1d7d2961..77a72f62 100644
--- a/tokenizers/src/tokenizer/added_vocabulary.rs
+++ b/tokenizers/src/tokenizer/added_vocabulary.rs
@@ -192,7 +192,7 @@ impl AddedVocabulary {
     }
 
     /// Get the additional vocabulary with the AddedTokens
-    pub fn get_vocab_r(&self) -> &HashMap<u32, AddedToken> {
+    pub fn get_added_tokens_decoder(&self) -> &HashMap<u32, AddedToken> {
         &self.added_tokens_map_r
     }
 
@@ -260,7 +260,7 @@ impl AddedVocabulary {
         self.added_tokens_map.values().cloned().max().map_or(
             model.get_vocab_size() as u32,
             |max| {
-                if max >= (model.get_vocab_size() as u32) || model.get_vocab_size() == 0 {
+                if (max >= model.get_vocab_size() as u32) || model.get_vocab_size() == 0 {
                     max + 1
                 } else {
                     model.get_vocab_size() as u32
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 77fcec79..c76addf9 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -659,11 +659,9 @@ where
         final_vocab
     }
 
-    /// Get the added vocabulary only
-    /// Get the added tokens decoder
-    pub fn get_added_tokens_decoder(&self) -> &HashMap<u32, AddedToken> {
-        self.added_vocabulary.get_vocab_r()
+    pub fn get_added_tokens_decoder(&self) -> HashMap<u32, AddedToken> {
+        self.added_vocabulary.get_added_tokens_decoder().clone()
     }
 
     /// Get the size of the vocabulary
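For reviewers, a minimal usage sketch of the new Python-side method, separate from the patch itself. It mirrors the test added in test_tokenizer.py above; the token list and AddedToken flags are copied from that test rather than assumed, and no other API surface is used:

```python
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import BPE

# Mirrors the test above: an empty BPE model plus five added tokens.
tokenizer = Tokenizer(BPE())
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

# The model vocabulary itself stays empty...
assert tokenizer.get_vocab(with_added_tokens=False) == {}

# ...while the added tokens decoder maps each assigned id back to its AddedToken.
decoder = tokenizer.get_added_tokens_decoder()
assert sorted(decoder) == [0, 1, 2, 3, 4]
assert decoder[0] == AddedToken(
    "my", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False
)
```

Note that, per the mod.rs change, the Rust side now returns an owned clone of the map rather than a borrow, so the dict seen from Python is a snapshot: mutating it does not affect the tokenizer's added vocabulary.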