BertNormalizer has the same behavior as the original implementation

This commit is contained in:
Anthony MOI
2020-07-06 13:32:19 -04:00
parent b91deeaa3d
commit 7a95ffc4fa
6 changed files with 13 additions and 11 deletions

View File

@@ -51,7 +51,7 @@ impl BertNormalizer {
fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, Normalizer)> {
let mut clean_text = true;
let mut handle_chinese_chars = true;
let mut strip_accents = true;
let mut strip_accents = None;
let mut lowercase = true;
if let Some(kwargs) = kwargs {

View File

@@ -21,7 +21,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
mask_token: Union[str, AddedToken] = "[MASK]",
clean_text: bool = True,
handle_chinese_chars: bool = True,
strip_accents: bool = True,
strip_accents: Optional[bool] = None,
lowercase: bool = True,
wordpieces_prefix: str = "##",
):

View File

@@ -18,7 +18,7 @@ class BertNormalizer(Normalizer):
self,
clean_text: Optional[bool] = True,
handle_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = True,
strip_accents: Optional[bool] = None,
lowercase: Optional[bool] = True,
) -> None:
""" Instantiate a BertNormalizer with the given options.
@@ -32,7 +32,8 @@ class BertNormalizer(Normalizer):
Whether to handle chinese chars by putting spaces around them.
strip_accents: (`optional`) boolean:
Whether to strip all accents.
Whether to strip all accents. If this option is not specified (i.e. == None),
then it will be determined by the value of `lowercase` (as in the original BERT).
lowercase: (`optional`) boolean:
Whether to lowercase.