Mirror of https://github.com/mii443/tokenizers.git (synced 2025-09-01 23:09:34 +00:00)
Cleans up a few pattern-matches into their Option/Result equivalent
commit 7bee825238 (parent cca5d43038)
committed by Anthony MOI
@@ -439,10 +439,7 @@ impl Model for Unigram {
     }

     fn id_to_token(&self, id: u32) -> Option<String> {
-        match self.vocab.get(id as usize) {
-            Some(item) => Some(item.0.clone()),
-            None => None,
-        }
+        self.vocab.get(id as usize).map(|item| item.0.clone())
     }

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
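This is the standard-library idiom for the pattern being removed: Option::map applies the closure in the Some case and passes None through, which is exactly the `Some(x) => Some(f(x)) / None => None` match. A minimal standalone sketch (the vocab here is a stand-in, not the Unigram model's actual field):

    fn main() {
        // Stand-in vocab: (token, score) pairs indexed by id.
        let vocab: Vec<(String, f64)> = vec![("hello".into(), -1.5), ("world".into(), -2.0)];
        let id = 1u32;

        // Before: an explicit match on the Option returned by `get`.
        let before = match vocab.get(id as usize) {
            Some(item) => Some(item.0.clone()),
            None => None,
        };

        // After: `map` runs the closure only when `get` found an entry.
        let after = vocab.get(id as usize).map(|item| item.0.clone());

        assert_eq!(before, after);
        assert_eq!(after, Some("world".to_string()));
        assert_eq!(vocab.get(99).map(|item| item.0.clone()), None); // id out of range
    }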
@@ -247,14 +247,13 @@ impl Encoding {
     /// Get the offsets of the word at the given index in the input sequence.
     pub fn word_to_chars(&self, word: u32, sequence_id: usize) -> Option<Offsets> {
         self.word_to_tokens(word, sequence_id)
-            .map(|(start, end)| {
+            .and_then(|(start, end)| {
                 if end == 0 {
                     None
                 } else {
                     Some((self.offsets[start].0, self.offsets[end - 1].1))
                 }
             })
-            .flatten()
     }

     /// Get the offsets of the token at the given index.
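Here the closure itself returns an Option, so `.map` produces a nested Option<Option<Offsets>> that previously needed the trailing `.flatten()`; Option::and_then does both steps at once. A self-contained sketch of the equivalence, using a toy fallible closure:

    fn main() {
        // A step that can itself fail, like the closure in word_to_chars.
        let half = |n: i32| if n % 2 == 0 { Some(n / 2) } else { None };

        // `.map` nests the Options and needs a `.flatten()`...
        let via_map_flatten: Option<i32> = Some(8).map(half).flatten();

        // ...while `.and_then` chains the fallible step in one call.
        let via_and_then: Option<i32> = Some(8).and_then(half);

        assert_eq!(via_map_flatten, via_and_then);
        assert_eq!(via_and_then, Some(4));
        assert_eq!(Some(7).and_then(half), None);     // inner step fails
        assert_eq!(None::<i32>.and_then(half), None); // outer Option already empty
    }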
@@ -288,8 +287,7 @@ impl Encoding {
     pub fn char_to_word(&self, pos: usize, sequence_id: usize) -> Option<u32> {
         Some(
             self.char_to_token(pos, sequence_id)
-                .map(|token| self.token_to_word(token))
-                .flatten()?
+                .and_then(|token| self.token_to_word(token))?
                 .1,
         )
     }
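Same idea one level up: chaining the two fallible lookups with and_then and then letting `?` bail out with None replaces the old `.map(...).flatten()?`. A sketch with hypothetical stand-ins for the two lookups (names and behavior invented for illustration):

    // Hypothetical stand-ins for the Encoding lookups; both can fail.
    fn char_to_token(pos: usize) -> Option<u32> {
        (pos < 10).then(|| pos as u32)
    }

    fn token_to_word(token: u32) -> Option<(u32, u32)> {
        (token % 2 == 0).then(|| (token, token * 10))
    }

    fn char_to_word(pos: usize) -> Option<u32> {
        // `?` unwraps the chained Option or returns None early,
        // so no explicit match or flatten is needed.
        Some(char_to_token(pos).and_then(token_to_word)?.1)
    }

    fn main() {
        assert_eq!(char_to_word(4), Some(40)); // both lookups succeed
        assert_eq!(char_to_word(3), None);     // second lookup fails
        assert_eq!(char_to_word(42), None);    // first lookup fails
    }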
@@ -694,10 +694,9 @@ where

         // Encode each sequence
         let encoding = self.encode_single_sequence(sequence, 0, OffsetType::Byte)?;
-        let pair_encoding = match pair {
-            Some(sequence) => Some(self.encode_single_sequence(sequence, 1, OffsetType::Byte)?),
-            None => None,
-        };
+        let pair_encoding = pair
+            .map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::Byte))
+            .transpose()?;

         // And finally post process
         self.post_process(encoding, pair_encoding, add_special_tokens)
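The match here could not become a plain `.map`, because `?` does not propagate out of a closure; Option::transpose bridges the gap by turning the Option<Result<...>> produced by the map into a Result<Option<...>> that `?` can unwrap. A sketch with a toy fallible encoder in place of encode_single_sequence:

    // Toy stand-in for the fallible encode_single_sequence call.
    fn encode(sequence: &str) -> Result<Vec<u32>, String> {
        if sequence.is_empty() {
            Err("empty sequence".to_string())
        } else {
            Ok(sequence.bytes().map(u32::from).collect())
        }
    }

    fn encode_pair(sequence: &str, pair: Option<&str>) -> Result<(Vec<u32>, Option<Vec<u32>>), String> {
        let encoding = encode(sequence)?;
        // map yields Option<Result<_, _>>; transpose flips it to
        // Result<Option<_>, _> so `?` can propagate any error.
        let pair_encoding = pair.map(encode).transpose()?;
        Ok((encoding, pair_encoding))
    }

    fn main() {
        assert_eq!(encode_pair("ab", None), Ok((vec![97, 98], None)));
        assert_eq!(encode_pair("ab", Some("c")), Ok((vec![97, 98], Some(vec![99]))));
        assert!(encode_pair("ab", Some("")).is_err()); // inner failure surfaces via `?`
    }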
@@ -738,10 +737,9 @@ where

         // Encode each sequence
         let encoding = self.encode_single_sequence(sequence, 0, OffsetType::Char)?;
-        let pair_encoding = match pair {
-            Some(sequence) => Some(self.encode_single_sequence(sequence, 1, OffsetType::Char)?),
-            None => None,
-        };
+        let pair_encoding = pair
+            .map(|sequence| self.encode_single_sequence(sequence, 1, OffsetType::Char))
+            .transpose()?;

         // And finally post process
         self.post_process(encoding, pair_encoding, add_special_tokens)
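The char-offset path gets the identical rewrite. As a sanity check that the two forms agree on every case, including the error path, a small sketch comparing a match written in the old style against map + transpose:

    fn encode(sequence: &str) -> Result<u32, String> {
        sequence.parse::<u32>().map_err(|e| e.to_string())
    }

    fn main() {
        for pair in [None, Some("7"), Some("not a number")] {
            // Old style, minus the `?` so it stays expressible inside main.
            let via_match: Result<Option<u32>, String> = match pair {
                Some(sequence) => encode(sequence).map(Some),
                None => Ok(None),
            };
            let via_transpose = pair.map(encode).transpose();
            assert_eq!(via_match, via_transpose);
        }
    }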