diff --git a/bindings/python/src/tokenizer.rs b/bindings/python/src/tokenizer.rs index db87ce2c..e3a87df4 100644 --- a/bindings/python/src/tokenizer.rs +++ b/bindings/python/src/tokenizer.rs @@ -337,6 +337,7 @@ impl Tokenizer { #[args(kwargs = "**")] fn enable_padding(&mut self, kwargs: Option<&PyDict>) -> PyResult<()> { let mut direction = PaddingDirection::Right; + let mut pad_to_multiple_of: Option<usize> = None; let mut pad_id: u32 = 0; let mut pad_type_id: u32 = 0; let mut pad_token = String::from("[PAD]"); @@ -359,6 +360,7 @@ impl Tokenizer { .into_pyerr()), }?; } + "pad_to_multiple_of" => pad_to_multiple_of = value.extract()?, "pad_id" => pad_id = value.extract()?, "pad_type_id" => pad_type_id = value.extract()?, "pad_token" => pad_token = value.extract()?, @@ -377,6 +379,7 @@ impl Tokenizer { self.tokenizer.with_padding(Some(PaddingParams { strategy, direction, + pad_to_multiple_of, pad_id, pad_type_id, pad_token: pad_token.to_owned(), diff --git a/bindings/python/tokenizers/__init__.pyi b/bindings/python/tokenizers/__init__.pyi index dcde5063..d7e83deb 100644 --- a/bindings/python/tokenizers/__init__.pyi +++ b/bindings/python/tokenizers/__init__.pyi @@ -395,6 +395,7 @@ class Tokenizer: def enable_padding( self, direction: Optional[str] = "right", + pad_to_multiple_of: Optional[int] = None, pad_id: Optional[int] = 0, pad_type_id: Optional[int] = 0, pad_token: Optional[str] = "[PAD]", @@ -406,6 +407,11 @@ class Tokenizer: direction: (`optional`) str: Can be one of: `right` or `left` + pad_to_multiple_of: (`optional`) unsigned int: + If specified, the padding length should always snap to the next multiple of + the given value. For example if we were going to pad with a length of 250 but + `pad_to_multiple_of=8` then we will pad to 256. 
+ pad_id: (`optional`) unsigned int: The indice to be used when padding diff --git a/bindings/python/tokenizers/implementations/base_tokenizer.py b/bindings/python/tokenizers/implementations/base_tokenizer.py index 7d6fe87e..6b86bb0e 100644 --- a/bindings/python/tokenizers/implementations/base_tokenizer.py +++ b/bindings/python/tokenizers/implementations/base_tokenizer.py @@ -52,6 +52,7 @@ class BaseTokenizer: def enable_padding( self, direction: Optional[str] = "right", + pad_to_multiple_of: Optional[int] = None, pad_id: Optional[int] = 0, pad_type_id: Optional[int] = 0, pad_token: Optional[str] = "[PAD]", @@ -63,6 +64,11 @@ class BaseTokenizer: direction: (`optional`) str: Can be one of: `right` or `left` + pad_to_multiple_of: (`optional`) unsigned int: + If specified, the padding length should always snap to the next multiple of + the given value. For example if we were going to pad with a length of 250 but + `pad_to_multiple_of=8` then we will pad to 256. + pad_id: (`optional`) unsigned int: The indice to be used when padding