Fix BPE continuing_subword_prefix
```diff
@@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 - [#652]: Fix offsets for `Precompiled` corner case
+- [#656]: Fix BPE `continuing_subword_prefix`
 
 ## [0.10.1]
 
@@ -307,6 +308,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug that was causing crashes in Python 3.5
 
 
+[#656]: https://github.com/huggingface/tokenizers/pull/656
 [#652]: https://github.com/huggingface/tokenizers/pull/652
 [#621]: https://github.com/huggingface/tokenizers/pull/621
 [#620]: https://github.com/huggingface/tokenizers/pull/620
```
```diff
@@ -335,23 +335,29 @@ impl BPE {
         let mut word = Word::with_capacity(w.len());
         let mut unk: Option<(u32, usize)> = None;
         while let Some(i) = indices.next() {
-            let (s, byte_len) = if let Some(&end) = indices.peek() {
-                match (i, self.continuing_subword_prefix.as_ref()) {
-                    (0, Some(prefix)) => (
-                        Cow::Owned(format!("{}{}", prefix, &w[i..end])),
-                        (i..end).len(),
-                    ),
-                    _ => (Cow::Borrowed(&w[i..end]), (i..end).len()),
-                }
+            let end = indices.peek();
+            let is_first = i == 0;
+            let is_last = end.is_none();
+
+            let mut s = if let Some(e) = end {
+                Cow::Borrowed(&w[i..*e])
             } else {
-                (
-                    self.end_of_word_suffix
-                        .as_ref()
-                        .map(|suffix| format!("{}{}", &w[i..], suffix).into())
-                        .unwrap_or_else(|| Cow::Borrowed(&w[i..])),
-                    w[i..].len(),
-                )
+                Cow::Borrowed(&w[i..])
             };
+            let byte_len = s.len();
+
+            // Add the `continuing_subword_prefix` if relevant
+            if !is_first {
+                if let Some(ref prefix) = self.continuing_subword_prefix {
+                    s = format!("{}{}", prefix, s).into()
+                }
+            }
+            // Add the `end_of_word_suffix` if relevant
+            if is_last {
+                if let Some(ref suffix) = self.end_of_word_suffix {
+                    s = format!("{}{}", s, suffix).into()
+                }
+            }
 
             if let Some(id) = self.vocab.get(s.as_ref()) {
                 if let Some((unk_id, unk_len)) = unk {
```
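The heart of the fix is visible in the rewritten loop: the old `match (i, self.continuing_subword_prefix.as_ref())` arm `(0, Some(prefix))` prepended the prefix to the piece starting at byte `0`, i.e. to the *first* subword of a word, and never to the following ones. The prefix is meant to mark *continuation* pieces (WordPiece-style `##`), so the rewrite computes `is_first`/`is_last` and decorates every piece except the first with the prefix, and the last piece with the optional `end_of_word_suffix`. Below is a minimal, self-contained sketch of the corrected decoration logic on plain `String` pieces; the function `split_word` is hypothetical and not part of the tokenizers API.

```rust
// Hypothetical helper, not part of the tokenizers API: split a word into
// per-character pieces and decorate them the way the fixed loop above does.
fn split_word(
    w: &str,
    continuing_subword_prefix: Option<&str>,
    end_of_word_suffix: Option<&str>,
) -> Vec<String> {
    let mut pieces = Vec::new();
    // Walk char boundaries so multi-byte characters are sliced safely.
    let mut indices = w.char_indices().map(|(i, _)| i).peekable();
    while let Some(i) = indices.next() {
        let end = indices.peek().copied();
        let is_first = i == 0;
        let is_last = end.is_none();

        let mut s = match end {
            Some(e) => w[i..e].to_string(),
            None => w[i..].to_string(),
        };
        // The fix: the prefix marks *continuation* pieces, so it is added to
        // every piece except the first (the old code added it only at i == 0).
        if !is_first {
            if let Some(prefix) = continuing_subword_prefix {
                s = format!("{}{}", prefix, s);
            }
        }
        if is_last {
            if let Some(suffix) = end_of_word_suffix {
                s = format!("{}{}", s, suffix);
            }
        }
        pieces.push(s);
    }
    pieces
}

fn main() {
    // "abc" with prefix "##" now yields the pieces the test vocab expects;
    // the old logic produced ["##a", "b", "c"] instead.
    assert_eq!(split_word("abc", Some("##"), None), ["a", "##b", "##c"]);
    assert_eq!(split_word("abc", None, Some("</w>")), ["a", "b", "c</w>"]);
}
```

The accompanying test update exercises exactly this path: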
```diff
@@ -684,17 +690,41 @@ mod tests {
             ("##b".to_string(), 1),
             ("##c".to_string(), 2),
             ("ab".to_string(), 3),
+            ("abc".to_string(), 4),
         ]
         .into_iter()
         .collect();
 
-        let merges = vec![("a".to_string(), "##b".to_string())];
+        let merges = vec![
+            ("a".to_string(), "##b".to_string()),
+            ("ab".to_string(), "##c".to_string()),
+        ];
 
-        BPE::builder()
+        let bpe = BPE::builder()
             .vocab_and_merges(vocab, merges)
+            .unk_token("[UNK]".to_string())
             .continuing_subword_prefix("##".to_string())
             .build()
             .unwrap();
+
+        let res = bpe.tokenize("ab");
+        assert_eq!(
+            res.unwrap(),
+            vec![Token {
+                id: 3,
+                value: "ab".to_string(),
+                offsets: (0, 2)
+            }]
+        );
+        let res = bpe.tokenize("abc");
+        assert_eq!(
+            res.unwrap(),
+            vec![Token {
+                id: 4,
+                value: "abc".to_string(),
+                offsets: (0, 3)
+            }]
+        );
     }
 
     #[test]
```
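With the corrected splitting, `"abc"` is first cut into the pieces `a`, `##b`, `##c`, all of which exist in the test vocab, so the merges `("a", "##b")` and `("ab", "##c")` can fire and produce token id 4 for `abc`; under the old splitting (`##a`, `b`, `c`) none of the pieces were in the vocab, so the merges could never apply. The toy helper below traces that cascade under two stated assumptions: merges are applied greedily in listed order (the real model ranks them), and merging strips the `##` prefix from the right-hand piece, which the test vocab implies (`"a" + "##b"` must yield `"ab"`). `apply_merges` is hypothetical, for illustration only.

```rust
// Hypothetical toy: apply merges in order to a piece list, stripping the
// continuing-subword prefix from the right-hand side when concatenating.
fn apply_merges(
    mut pieces: Vec<String>,
    merges: &[(String, String)],
    prefix: &str,
) -> Vec<String> {
    for (a, b) in merges {
        let mut i = 0;
        while i + 1 < pieces.len() {
            if &pieces[i] == a && &pieces[i + 1] == b {
                // "a" + "##b" -> "ab": drop the prefix of the continuation piece.
                let merged = format!("{}{}", a, b.strip_prefix(prefix).unwrap_or(b));
                let _ = pieces.splice(i..=i + 1, [merged]).collect::<Vec<_>>();
            } else {
                i += 1;
            }
        }
    }
    pieces
}

fn main() {
    let pieces: Vec<String> = vec!["a".into(), "##b".into(), "##c".into()];
    let merges = vec![
        ("a".to_string(), "##b".to_string()),
        ("ab".to_string(), "##c".to_string()),
    ];
    // ["a", "##b", "##c"] -> ["ab", "##c"] -> ["abc"], i.e. token id 4 above.
    assert_eq!(apply_merges(pieces, &merges, "##"), ["abc"]);
}
```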