Fix stripping strings containing Unicode characters (#707)

* Strip seems to have been broken for a while on unicode strings.

- Includes a failing test and the fix for it.
- This function could maybe be optimized: we're now scanning the string 3 times,
  and once fully for chars.

* Update CHANGELOG.md

Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
Nicolas Patry
2021-05-24 22:49:59 +02:00
committed by GitHub
parent 4b7f8c2d7c
commit c046da7679
2 changed files with 15 additions and 1 deletions

View File

@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- [#686]: Fix SPM conversion process for whitespace deduplication
- [#707]: Fix stripping strings containing Unicode characters
### Added
- [#693]: Add a CTC Decoder for Wave2Vec models
@@ -317,6 +318,7 @@ delimiter (Works like `.split(delimiter)`)
- Fix a bug that was causing crashes in Python 3.5
[#707]: https://github.com/huggingface/tokenizers/pull/707
[#693]: https://github.com/huggingface/tokenizers/pull/693
[#686]: https://github.com/huggingface/tokenizers/pull/686
[#674]: https://github.com/huggingface/tokenizers/pull/674

View File

@@ -736,12 +736,13 @@ impl NormalizedString {
};
if leading_spaces > 0 || trailing_spaces > 0 {
let count = self.get().chars().count();
let transformation = self
.normalized
.chars()
.enumerate()
.filter_map(|(i, c)| {
if i < leading_spaces || i >= self.len() - trailing_spaces {
if i < leading_spaces || i >= count - trailing_spaces {
None
} else if i == self.len() - trailing_spaces - 1 {
Some((c, -(trailing_spaces as isize)))
@@ -1274,6 +1275,17 @@ mod tests {
);
}
/// Regression test for #707: `strip` on a string containing non-ASCII
/// characters surrounded by whitespace.
#[test]
fn strip_unicode() {
    let expected = "你好asa";

    let mut stripped = NormalizedString::from(" 你好asa \n");
    stripped.strip();

    // The normalized text must have lost its leading and trailing whitespace.
    assert_eq!(&stripped.normalized, expected);

    // Mapping the whole normalized range back onto the original must give
    // the same text, without the surrounding whitespace.
    let whole = Range::Normalized(0..stripped.normalized.len());
    assert_eq!(stripped.get_range_original(whole), Some(expected));
}
#[test]
fn prepend() {
let mut n = NormalizedString::from("there");