mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-07 13:18:31 +00:00
BertNormalizer has the same behavior as the original implementation
This commit is contained in:
@@ -51,7 +51,7 @@ impl BertNormalizer {
|
||||
fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, Normalizer)> {
|
||||
let mut clean_text = true;
|
||||
let mut handle_chinese_chars = true;
|
||||
let mut strip_accents = true;
|
||||
let mut strip_accents = None;
|
||||
let mut lowercase = true;
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
|
||||
@@ -21,7 +21,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
|
||||
mask_token: Union[str, AddedToken] = "[MASK]",
|
||||
clean_text: bool = True,
|
||||
handle_chinese_chars: bool = True,
|
||||
strip_accents: bool = True,
|
||||
strip_accents: Optional[bool] = None,
|
||||
lowercase: bool = True,
|
||||
wordpieces_prefix: str = "##",
|
||||
):
|
||||
|
||||
@@ -18,7 +18,7 @@ class BertNormalizer(Normalizer):
|
||||
self,
|
||||
clean_text: Optional[bool] = True,
|
||||
handle_chinese_chars: Optional[bool] = True,
|
||||
strip_accents: Optional[bool] = True,
|
||||
strip_accents: Optional[bool] = None,
|
||||
lowercase: Optional[bool] = True,
|
||||
) -> None:
|
||||
""" Instantiate a BertNormalizer with the given options.
|
||||
@@ -32,7 +32,8 @@ class BertNormalizer(Normalizer):
|
||||
Whether to handle chinese chars by putting spaces around them.
|
||||
|
||||
strip_accents: (`optional`) boolean:
|
||||
Whether to strip all accents.
|
||||
Whether to strip all accents. If this option is not specified (i.e. left as `None`),
|
||||
then it will be determined by the value for `lowercase` (as in the original Bert).
|
||||
|
||||
lowercase: (`optional`) boolean:
|
||||
Whether to lowercase.
|
||||
|
||||
Reference in New Issue
Block a user