mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 12:48:18 +00:00
Addressing @n1t0's comments.
This commit is contained in:
@@ -288,21 +288,8 @@ pub struct PyDigits {}
|
|||||||
#[pymethods]
|
#[pymethods]
|
||||||
impl PyDigits {
|
impl PyDigits {
|
||||||
#[new]
|
#[new]
|
||||||
#[args(kwargs = "**")]
|
#[args(individual_digits = false)]
|
||||||
fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, PyPreTokenizer)> {
|
fn new(individual_digits: bool) -> PyResult<(Self, PyPreTokenizer)> {
|
||||||
let mut individual_digits = false;
|
|
||||||
|
|
||||||
if let Some(kwargs) = kwargs {
|
|
||||||
for (key, value) in kwargs {
|
|
||||||
let key: &str = key.extract()?;
|
|
||||||
match key {
|
|
||||||
"individual_digits" => {
|
|
||||||
individual_digits = value.extract()?;
|
|
||||||
}
|
|
||||||
_ => println!("Ignored unknown kwarg option {}", key),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok((PyDigits {}, Digits::new(individual_digits).into()))
|
Ok((PyDigits {}, Digits::new(individual_digits).into()))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -116,6 +116,6 @@ class TestDigits:
|
|||||||
assert Digits() is not None
|
assert Digits() is not None
|
||||||
assert isinstance(Digits(), PreTokenizer)
|
assert isinstance(Digits(), PreTokenizer)
|
||||||
assert isinstance(Digits(), Digits)
|
assert isinstance(Digits(), Digits)
|
||||||
assert isinstance(Digits(individual_digits=True), Digits)
|
assert isinstance(Digits(True), Digits)
|
||||||
assert isinstance(Digits(individual_digits=False), Digits)
|
assert isinstance(Digits(False), Digits)
|
||||||
assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
|
assert isinstance(pickle.loads(pickle.dumps(Digits())), Digits)
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ use serde::{Deserialize, Serialize};
|
|||||||
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
|
use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
|
||||||
|
|
||||||
#[derive(Serialize, Deserialize, Clone, Debug)]
|
#[derive(Serialize, Deserialize, Clone, Debug)]
|
||||||
/// Replaces all the whitespaces by the provided meta character and then
|
/// Pre-tokenizes the numbers into single tokens. If `individual_digits` is set
|
||||||
/// splits on this character
|
/// to true, then all digits are split into individual tokens.
|
||||||
#[serde(tag = "type")]
|
#[serde(tag = "type")]
|
||||||
pub struct Digits {
|
pub struct Digits {
|
||||||
individual_digits: bool,
|
individual_digits: bool,
|
||||||
|
|||||||
Reference in New Issue
Block a user