mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-12 05:18:39 +00:00
Fix stripping strings containing Unicode characters (#707)
* Strip seems to have been broken for a while on Unicode strings. - Includes a failing test + the fix for it. - This function could maybe be optimized; we're scanning the string 3 times now, and once fully for chars. * Update CHANGELOG.md Co-authored-by: Anthony MOI <m.anthony.moi@gmail.com>
This commit is contained in:
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|||||||
|
|
||||||
### Fixed
|
### Fixed
|
||||||
- [#686]: Fix SPM conversion process for whitespace deduplication
|
- [#686]: Fix SPM conversion process for whitespace deduplication
|
||||||
|
- [#707]: Fix stripping strings containing Unicode characters
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- [#693]: Add a CTC Decoder for Wave2Vec models
|
- [#693]: Add a CTC Decoder for Wave2Vec models
|
||||||
@@ -317,6 +318,7 @@ delimiter (Works like `.split(delimiter)`)
|
|||||||
- Fix a bug that was causing crashes in Python 3.5
|
- Fix a bug that was causing crashes in Python 3.5
|
||||||
|
|
||||||
|
|
||||||
|
[#707]: https://github.com/huggingface/tokenizers/pull/707
|
||||||
[#693]: https://github.com/huggingface/tokenizers/pull/693
|
[#693]: https://github.com/huggingface/tokenizers/pull/693
|
||||||
[#686]: https://github.com/huggingface/tokenizers/pull/686
|
[#686]: https://github.com/huggingface/tokenizers/pull/686
|
||||||
[#674]: https://github.com/huggingface/tokenizers/pull/674
|
[#674]: https://github.com/huggingface/tokenizers/pull/674
|
||||||
|
|||||||
@@ -736,12 +736,13 @@ impl NormalizedString {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if leading_spaces > 0 || trailing_spaces > 0 {
|
if leading_spaces > 0 || trailing_spaces > 0 {
|
||||||
|
let count = self.get().chars().count();
|
||||||
let transformation = self
|
let transformation = self
|
||||||
.normalized
|
.normalized
|
||||||
.chars()
|
.chars()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.filter_map(|(i, c)| {
|
.filter_map(|(i, c)| {
|
||||||
if i < leading_spaces || i >= self.len() - trailing_spaces {
|
if i < leading_spaces || i >= count - trailing_spaces {
|
||||||
None
|
None
|
||||||
} else if i == self.len() - trailing_spaces - 1 {
|
} else if i == self.len() - trailing_spaces - 1 {
|
||||||
Some((c, -(trailing_spaces as isize)))
|
Some((c, -(trailing_spaces as isize)))
|
||||||
@@ -1274,6 +1275,17 @@ mod tests {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn strip_unicode() {
|
||||||
|
let mut n = NormalizedString::from(" 你好asa \n");
|
||||||
|
n.strip();
|
||||||
|
assert_eq!(&n.normalized, "你好asa");
|
||||||
|
assert_eq!(
|
||||||
|
n.get_range_original(Range::Normalized(0..n.normalized.len())),
|
||||||
|
Some("你好asa")
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn prepend() {
|
fn prepend() {
|
||||||
let mut n = NormalizedString::from("there");
|
let mut n = NormalizedString::from("there");
|
||||||
|
|||||||
Reference in New Issue
Block a user