mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
17
.github/stale.yml
vendored
Normal file
17
.github/stale.yml
vendored
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
# Number of days of inactivity before an issue becomes stale
|
||||||
|
daysUntilStale: 60
|
||||||
|
# Number of days of inactivity before a stale issue is closed
|
||||||
|
daysUntilClose: 7
|
||||||
|
# Issues with these labels will never be considered stale
|
||||||
|
exemptLabels:
|
||||||
|
- pinned
|
||||||
|
- security
|
||||||
|
# Label to use when marking an issue as stale
|
||||||
|
staleLabel: wontfix
|
||||||
|
# Comment to post when marking an issue as stale. Set to `false` to disable
|
||||||
|
markComment: >
|
||||||
|
This issue has been automatically marked as stale because it has not had
|
||||||
|
recent activity. It will be closed if no further activity occurs. Thank you
|
||||||
|
for your contributions.
|
||||||
|
# Comment to post when closing a stale issue. Set to `false` to disable
|
||||||
|
closeComment: false
|
@ -63,7 +63,7 @@ impl<'de> Visitor<'de> for UnigramVisitor {
|
|||||||
}
|
}
|
||||||
match (vocab, unk_id) {
|
match (vocab, unk_id) {
|
||||||
(Some(vocab), unk_id) => Ok(Unigram::from(vocab, unk_id)
|
(Some(vocab), unk_id) => Ok(Unigram::from(vocab, unk_id)
|
||||||
.map_err(|err| Error::custom(&format!("Unable to load vocab {:?}", err)))?),
|
.map_err(|err| Error::custom(format!("Unable to load vocab {:?}", err)))?),
|
||||||
(None, _) => Err(Error::custom("Missing vocab")),
|
(None, _) => Err(Error::custom("Missing vocab")),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -501,7 +501,7 @@ impl UnigramTrainer {
|
|||||||
let expected_loops = (((desired_vocab_size as f64).ln() - (pieces.len() as f64).ln())
|
let expected_loops = (((desired_vocab_size as f64).ln() - (pieces.len() as f64).ln())
|
||||||
/ self.shrinking_factor.ln()) as usize
|
/ self.shrinking_factor.ln()) as usize
|
||||||
+ 1;
|
+ 1;
|
||||||
let expected_updates = expected_loops as usize * self.n_sub_iterations as usize;
|
let expected_updates = expected_loops * self.n_sub_iterations as usize;
|
||||||
self.update_progress(&progress, expected_updates, "EM training");
|
self.update_progress(&progress, expected_updates, "EM training");
|
||||||
let required_chars = self.required_chars(&sentences);
|
let required_chars = self.required_chars(&sentences);
|
||||||
let mut new_model = Unigram::from(pieces.clone(), Some(0))?;
|
let mut new_model = Unigram::from(pieces.clone(), Some(0))?;
|
||||||
|
@ -487,7 +487,7 @@ impl TemplateProcessing {
|
|||||||
.flat_map(|piece| {
|
.flat_map(|piece| {
|
||||||
match piece {
|
match piece {
|
||||||
Piece::Sequence { id, type_id } => {
|
Piece::Sequence { id, type_id } => {
|
||||||
let i = if *id == Sequence::A { 0 } else { 1 };
|
let i = usize::from(*id != Sequence::A);
|
||||||
let encoding = &mut encodings[i];
|
let encoding = &mut encodings[i];
|
||||||
encoding.set_type_ids(vec![*type_id; encoding.len()]);
|
encoding.set_type_ids(vec![*type_id; encoding.len()]);
|
||||||
encoding.set_sequence_id(i);
|
encoding.set_sequence_id(i);
|
||||||
|
@ -507,7 +507,7 @@ impl NormalizedString {
|
|||||||
let transformations = s
|
let transformations = s
|
||||||
.chars()
|
.chars()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.map(|(i, c)| (c, if i == 0 { 0 } else { 1 }))
|
.map(|(i, c)| (c, isize::from(i != 0)))
|
||||||
.chain(std::iter::once((next, 1)));
|
.chain(std::iter::once((next, 1)));
|
||||||
|
|
||||||
self.transform_range(Range::Normalized(0..next.len_utf8()), transformations, 0);
|
self.transform_range(Range::Normalized(0..next.len_utf8()), transformations, 0);
|
||||||
@ -853,16 +853,8 @@ pub fn get_range_of<T: RangeBounds<usize>>(s: &str, range: T) -> Option<&str> {
|
|||||||
} else if start >= len || end > len || start >= end {
|
} else if start >= len || end > len || start >= end {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
let start_b = s
|
let start_b = s.char_indices().map(|(i, _)| i).nth(start).unwrap_or(0);
|
||||||
.char_indices()
|
let end_b = s.char_indices().map(|(i, _)| i).nth(end).unwrap_or(s.len());
|
||||||
.map(|(i, _)| i)
|
|
||||||
.nth(start as usize)
|
|
||||||
.unwrap_or(0);
|
|
||||||
let end_b = s
|
|
||||||
.char_indices()
|
|
||||||
.map(|(i, _)| i)
|
|
||||||
.nth(end as usize)
|
|
||||||
.unwrap_or(s.len());
|
|
||||||
Some(&s[start_b..end_b])
|
Some(&s[start_b..end_b])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user