Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-23 00:35:35 +00:00
Expose num_added_tokens on Python side (#146)
* Expose num_added_tokens on the Python side, without the need to pass an Encoding to added_tokens. This allows computing the maximum sentence length for single/pair inputs without actually needing an Encoding structure. As the number of added tokens is fixed and known statically, this allows more flexible usage of the method.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Renamed num_added_tokens to num_special_tokens_to_add.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
@@ -8,6 +8,13 @@ pub struct PostProcessor {
     pub processor: Container<dyn tk::tokenizer::PostProcessor + Sync>,
 }
 
+#[pymethods]
+impl PostProcessor {
+    fn num_special_tokens_to_add(&self, is_pair: bool) -> usize {
+        self.processor.execute(|p| p.added_tokens(is_pair))
+    }
+}
+
 #[pyclass(extends=PostProcessor)]
 pub struct BertProcessing {}
 
 #[pymethods]
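For context, a minimal Python-side sketch of how the exposed method can be used to budget sequence lengths before encoding anything. The BertProcessing constructor arguments and the token ids (101/102, the standard bert-base-uncased ids) are illustrative assumptions, not part of this commit:

from tokenizers.processors import BertProcessing

# Build a BERT post-processor; the ("[SEP]", 102) and ("[CLS]", 101)
# pairs are example ids used purely for illustration.
processor = BertProcessing(("[SEP]", 102), ("[CLS]", 101))

# BERT wraps a single sequence as [CLS] A [SEP] (2 special tokens)
# and a pair as [CLS] A [SEP] B [SEP] (3 special tokens).
print(processor.num_special_tokens_to_add(is_pair=False))  # 2
print(processor.num_special_tokens_to_add(is_pair=True))   # 3

# Compute the content budget for a model with a 512-token limit,
# without having to build an Encoding first.
model_max_length = 512
max_pair_content = model_max_length - processor.num_special_tokens_to_add(is_pair=True)

Since BertProcessing extends the base PostProcessor pyclass, it inherits num_special_tokens_to_add from the impl block added in this diff.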