BertNormalizer has the same behavior as the original implementation

2025-12-07 13:18:31 +00:00 · 2020-07-06 13:32:19 -04:00
parent b91deeaa3d
commit 7a95ffc4fa
6 changed files with 13 additions and 11 deletions
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -51,7 +51,7 @@ impl BertNormalizer {
    fn new(kwargs: Option<&PyDict>) -> PyResult<(Self, Normalizer)> {
        let mut clean_text = true;
        let mut handle_chinese_chars = true;
-        let mut strip_accents = true;
+        let mut strip_accents = None;
        let mut lowercase = true;

        if let Some(kwargs) = kwargs {
--- a/bindings/python/tokenizers/implementations/bert_wordpiece.py
+++ b/bindings/python/tokenizers/implementations/bert_wordpiece.py
@@ -21,7 +21,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
-        strip_accents: bool = True,
+        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
--- a/bindings/python/tokenizers/normalizers/init.pyi
+++ b/bindings/python/tokenizers/normalizers/init.pyi
@@ -18,7 +18,7 @@ class BertNormalizer(Normalizer):
        self,
        clean_text: Optional[bool] = True,
        handle_chinese_chars: Optional[bool] = True,
-        strip_accents: Optional[bool] = True,
+        strip_accents: Optional[bool] = None,
        lowercase: Optional[bool] = True,
    ) -> None:
        """ Instantiate a BertNormalizer with the given options.
@@ -32,7 +32,8 @@ class BertNormalizer(Normalizer):
                Whether to handle chinese chars by putting spaces around them.

            strip_accents: (`optional`) boolean:
-                Whether to strip all accents.
+                Whether to strip all accents. If this option is not specified (ie == None),
+                then it will be determined by the value for `lowercase` (as in the original Bert).

            lowercase: (`optional`) boolean:
                Whether to lowercase.