diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs index c0ebcf81..400c0ad4 100644 --- a/bindings/python/src/pre_tokenizers.rs +++ b/bindings/python/src/pre_tokenizers.rs @@ -288,21 +288,8 @@ pub struct PyDigits {} #[pymethods] impl PyDigits { #[new] - #[args(kwargs = "**")] - fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyPreTokenizer)> { - let mut individual_digits = false; - - if let Some(kwargs) = kwargs { - for (key, value) in kwargs { - let key: &str = key.extract()?; - match key { - "individual_digits" => { - individual_digits = value.extract()?; - } - _ => println!("Ignored unknown kwarg option {}", key), - } - } - } + #[args(individual_digits = false)] + fn new(individual_digits: bool) -> PyResult<(Self, PyPreTokenizer)> { Ok((PyDigits {}, Digits::new(individual_digits).into())) } } diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py index 6d21167f..5345352c 100644 --- a/bindings/python/tests/bindings/test_pre_tokenizers.py +++ b/bindings/python/tests/bindings/test_pre_tokenizers.py @@ -116,6 +116,6 @@ class TestDigits: assert Digits() is not None assert isinstance(Digits(), PreTokenizer) assert isinstance(Digits(), Digits) - assert isinstance(Digits(individual_digits=True), Digits) - assert isinstance(Digits(individual_digits=False), Digits) + assert isinstance(Digits(True), Digits) + assert isinstance(Digits(False), Digits) assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits) diff --git a/tokenizers/src/pre_tokenizers/digits.rs b/tokenizers/src/pre_tokenizers/digits.rs index 7c59cae1..5ebd763b 100644 --- a/tokenizers/src/pre_tokenizers/digits.rs +++ b/tokenizers/src/pre_tokenizers/digits.rs @@ -3,8 +3,8 @@ use serde::{Deserialize, Serialize}; use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior}; #[derive(Serialize, Deserialize, Clone, Debug)] -/// Replaces all the whitespaces by the provided 
meta character and then -/// splits on this character +/// Pre tokenizes the numbers into single tokens. If individual_digits is set +/// to true, then all digits are split into individual tokens. #[serde(tag = "type")] pub struct Digits { individual_digits: bool,