Fix BPE continuing_subword_prefix

This commit is contained in:
Anthony MOI
2021-03-16 16:50:42 -04:00
committed by Anthony MOI
parent f5e9bb89b7
commit 0fe9214f44
2 changed files with 49 additions and 17 deletions

View File

@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- [#652]: Fix offsets for `Precompiled` corner case
- [#656]: Fix BPE `continuing_subword_prefix` (it was prepended to the first character of a word instead of to the continuing ones)
## [0.10.1]
@@ -307,6 +308,7 @@ delimiter (Works like `.split(delimiter)`)
- Fix a bug that was causing crashes in Python 3.5
[#656]: https://github.com/huggingface/tokenizers/pull/656
[#652]: https://github.com/huggingface/tokenizers/pull/652
[#621]: https://github.com/huggingface/tokenizers/pull/621
[#620]: https://github.com/huggingface/tokenizers/pull/620

View File

@@ -335,23 +335,29 @@ impl BPE {
let mut word = Word::with_capacity(w.len());
let mut unk: Option<(u32, usize)> = None;
while let Some(i) = indices.next() {
let (s, byte_len) = if let Some(&end) = indices.peek() {
match (i, self.continuing_subword_prefix.as_ref()) {
(0, Some(prefix)) => (
Cow::Owned(format!("{}{}", prefix, &w[i..end])),
(i..end).len(),
),
_ => (Cow::Borrowed(&w[i..end]), (i..end).len()),
}
let end = indices.peek();
let is_first = i == 0;
let is_last = end.is_none();
let mut s = if let Some(e) = end {
Cow::Borrowed(&w[i..*e])
} else {
(
self.end_of_word_suffix
.as_ref()
.map(|suffix| format!("{}{}", &w[i..], suffix).into())
.unwrap_or_else(|| Cow::Borrowed(&w[i..])),
w[i..].len(),
)
Cow::Borrowed(&w[i..])
};
let byte_len = s.len();
// Add the `continuing_subword_prefix` if relevant
if !is_first {
if let Some(ref prefix) = self.continuing_subword_prefix {
s = format!("{}{}", prefix, s).into()
}
}
// Add the `end_of_word_suffix` if relevant
if is_last {
if let Some(ref suffix) = self.end_of_word_suffix {
s = format!("{}{}", s, suffix).into()
}
}
if let Some(id) = self.vocab.get(s.as_ref()) {
if let Some((unk_id, unk_len)) = unk {
@@ -684,17 +690,41 @@ mod tests {
("##b".to_string(), 1),
("##c".to_string(), 2),
("ab".to_string(), 3),
("abc".to_string(), 4),
]
.into_iter()
.collect();
let merges = vec![("a".to_string(), "##b".to_string())];
let merges = vec![
("a".to_string(), "##b".to_string()),
("ab".to_string(), "##c".to_string()),
];
BPE::builder()
let bpe = BPE::builder()
.vocab_and_merges(vocab, merges)
.unk_token("[UNK]".to_string())
.continuing_subword_prefix("##".to_string())
.build()
.unwrap();
let res = bpe.tokenize("ab");
assert_eq!(
res.unwrap(),
vec![Token {
id: 3,
value: "ab".to_string(),
offsets: (0, 2)
}]
);
let res = bpe.tokenize("abc");
assert_eq!(
res.unwrap(),
vec![Token {
id: 4,
value: "abc".to_string(),
offsets: (0, 3)
}]
);
}
#[test]