Add clippy warnings + fix all of them
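This commit enables `#![warn(clippy::all)]` at the crate root and fixes every warning it surfaces: eager `ok_or`/`unwrap_or` arguments become lazy `ok_or_else`/`unwrap_or_else`, `.map(|v| v.clone())` and `.map(|id| *id)` become `.cloned()`/`.copied()`, `len() == 0` becomes `is_empty()`, a `&Box<dyn Trainer>` parameter is thinned to `&dyn Trainer`, and an `Option::map` used only for its side effect becomes an `if let`. A minimal sketch of the most frequent pattern, using a hypothetical `vocab` map and `MissingToken` error rather than the crate's own types:

use std::collections::HashMap;

// Hypothetical stand-in for the crate's Error::MergeTokenOutOfVocabulary.
#[derive(Debug)]
struct MissingToken(String);

fn lookup(vocab: &HashMap<String, u32>, token: &str) -> Result<u32, MissingToken> {
    vocab
        .get(token)
        // `copied()` replaces `.map(|id| *id)` (clippy::map_clone).
        .copied()
        // `ok_or_else` builds the error lazily, so the allocation in
        // `to_owned()` only happens on a miss (clippy::or_fun_call).
        .ok_or_else(|| MissingToken(token.to_owned()))
}

fn main() {
    let mut vocab = HashMap::new();
    vocab.insert("hello".to_owned(), 0u32);
    // `is_empty()` replaces `len() == 0` (clippy::len_zero).
    assert!(!vocab.is_empty());
    assert_eq!(lookup(&vocab, "hello").unwrap(), 0);
    assert!(lookup(&vocab, "world").is_err());
}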
@@ -1,3 +1,5 @@
+#![warn(clippy::all)]
+
 #[macro_use]
 extern crate lazy_static;
 
@@ -10,6 +10,7 @@ use std::sync::Mutex;
 /// The goal is clearly not the accuracy of the content, both get and set
 /// are not guaranteed to actually get or set.
 ///
+#[derive(Default)]
 pub struct Cache<K, V>
 where
     K: Eq + Hash + Clone,
@@ -32,9 +33,7 @@ where
     pub fn get_values(&self, keys: &[K]) -> Vec<Option<V>> {
         let mut lock = self.map.try_lock();
         if let Ok(ref mut cache) = lock {
-            keys.iter()
-                .map(|k| cache.get(k).map(|v| v.clone()))
-                .collect()
+            keys.iter().map(|k| cache.get(k).cloned()).collect()
         } else {
             keys.iter().map(|_| None).collect()
         }
@@ -69,14 +69,14 @@ impl BPE {
                 continue;
             }
 
-            let parts = line.split(" ").collect::<Vec<_>>();
+            let parts = line.split(' ').collect::<Vec<_>>();
 
             let a = vocab
                 .get(parts[0])
-                .ok_or(Error::MergeTokenOutOfVocabulary(parts[0].to_owned()))?;
+                .ok_or_else(|| Error::MergeTokenOutOfVocabulary(parts[0].to_owned()))?;
             let b = vocab
                 .get(parts[1])
-                .ok_or(Error::MergeTokenOutOfVocabulary(parts[1].to_owned()))?;
+                .ok_or_else(|| Error::MergeTokenOutOfVocabulary(parts[1].to_owned()))?;
             let pair = (*a, *b);
             let new_token = format!("{}{}", parts[0], parts[1]);
             let new_id = vocab
@@ -101,7 +101,7 @@ impl Model for BPE {
     }
 
     fn tokenize(&self, sentence: Vec<String>) -> Result<Vec<Token>> {
-        if sentence.len() == 0 {
+        if sentence.is_empty() {
             return Ok(vec![]);
         }
 
@@ -109,7 +109,7 @@ impl Model for BPE {
         let mut cached_words = self.cache.get_values(&sentence);
 
         for (i, w) in sentence.iter().enumerate() {
-            if let None = cached_words[i] {
+            if cached_words[i].is_none() {
                 let mut word = Word::new();
                 for c in w.chars() {
                     match self.vocab.get(&c.to_string()) {
@@ -194,10 +194,10 @@ impl Model for BPE {
     }
 
     fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.vocab.get(token).map(|id| *id)
+        self.vocab.get(token).copied()
     }
 
     fn id_to_token(&self, id: u32) -> Option<String> {
-        self.vocab_r.get(&id).map(|token| token.clone())
+        self.vocab_r.get(&id).cloned()
     }
 }
@@ -3,6 +3,8 @@
 //!
 //! In charge of training a BPE model
 //!
+#![allow(clippy::map_entry)]
+
 use super::{Pair, Word, BPE};
 use crate::tokenizer::{Model, Result, Trainer};
 use std::{
@@ -87,7 +89,7 @@ impl Trainer for BpeTrainer {
 
                 // Initialize pair_counts and where_to_update for this pair if we just saw it
                 if !pair_counts.contains_key(&cur_pair) {
-                    let pair = (0, cur_pair.clone());
+                    let pair = (0, cur_pair);
                     pair_counts.insert(cur_pair, pair);
                     if !where_to_update.contains_key(&cur_pair) {
                         where_to_update.insert(cur_pair, HashSet::new());
@@ -125,7 +127,7 @@ impl Trainer for BpeTrainer {
         // Find the best pair
         let mut best_count = 0;
         let mut best_pair = (std::u32::MAX, std::u32::MAX);
-        for (_, x) in &pair_counts {
+        for x in pair_counts.values() {
             if x.0 > best_count {
                 best_count = x.0;
                 best_pair = x.1;
@@ -1,7 +1,7 @@
 use super::Pair;
 
 // TODO: Add tests
-#[derive(Clone)]
+#[derive(Clone, Default)]
 pub struct Word {
     chars: Vec<u32>,
     sizes: Vec<usize>,
@@ -151,11 +151,11 @@ impl Model for WordPiece {
     }
 
     fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.vocab.get(token).map(|id| *id)
+        self.vocab.get(token).copied()
     }
 
     fn id_to_token(&self, id: u32) -> Option<String> {
-        self.vocab_r.get(&id).map(|token| token.clone())
+        self.vocab_r.get(&id).cloned()
     }
 }
 
@@ -14,7 +14,7 @@ impl PreTokenizer for Whitespace {
             .iter()
             .map(|m| {
                 m.map(|capture| s[capture.start()..capture.end()].to_owned())
-                    .unwrap_or(String::from(""))
+                    .unwrap_or_else(|| String::from(""))
             })
             .collect()
         })
@@ -83,13 +83,13 @@ impl PostProcessor for BertProcessing {
                 .map(|e| e.get_normalized())
                 .unwrap_or("")
             ),
-            [&ids[..], &pair_ids.unwrap_or(vec![])[..]].concat(),
-            [&type_ids[..], &pair_type_ids.unwrap_or(vec![])[..]].concat(),
-            [&tokens[..], &pair_tokens.unwrap_or(vec![])[..]].concat(),
-            [&offsets[..], &pair_offsets.unwrap_or(vec![])[..]].concat(),
+            [&ids[..], &pair_ids.unwrap_or_else(|| vec![])[..]].concat(),
+            [&type_ids[..], &pair_type_ids.unwrap_or_else(|| vec![])[..]].concat(),
+            [&tokens[..], &pair_tokens.unwrap_or_else(|| vec![])[..]].concat(),
+            [&offsets[..], &pair_offsets.unwrap_or_else(|| vec![])[..]].concat(),
             [
                 &special_tokens[..],
-                &pair_special_tokens.unwrap_or(vec![])[..],
+                &pair_special_tokens.unwrap_or_else(|| vec![])[..],
             ]
             .concat(),
             attention_mask,
@@ -12,6 +12,7 @@ pub struct Encoding {
     overflowing: Option<Box<Encoding>>,
 }
 impl Encoding {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         original: String,
         normalized: String,
@@ -118,8 +119,8 @@ impl Encoding {
     }
 
     pub fn merge_with(&mut self, pair: Encoding) {
-        self.original.extend(pair.original.chars());
-        self.normalized.extend(pair.normalized.chars());
+        self.original.push_str(&pair.original);
+        self.normalized.push_str(&pair.normalized);
         self.ids.extend(pair.ids);
         self.type_ids.extend(pair.type_ids);
         self.tokens.extend(pair.tokens);
@@ -142,12 +143,12 @@ impl Encoding {
 
 /// Prepend the `stride` last elements of the `previous` Vec to the current Vec
 // A new Vec is instantiated though.
-fn prepend_stride<T: Clone>(previous: &Vec<T>, current: Vec<T>, stride: usize) -> Vec<T> {
+fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
     let prev = previous
         .iter()
         .rev()
         .take(stride)
-        .map(|v| v.clone())
+        .cloned()
         .rev()
         .collect::<Vec<_>>();
 
@@ -236,7 +236,7 @@ impl Tokenizer {
     }
 
     /// Train a model and replace our current Model, using the given Trainer
-    pub fn train(&mut self, trainer: &Box<dyn Trainer>, files: Vec<String>) -> Result<()> {
+    pub fn train(&mut self, trainer: &dyn Trainer, files: Vec<String>) -> Result<()> {
         let results = files
             .par_iter()
             .map(|file| -> Result<HashMap<String, u32>> {
@@ -284,7 +284,7 @@ impl Tokenizer {
         if let Some(normalizer) = &self.normalizer {
             normalizer.normalize(sentence)
         } else {
-            Ok(sentence.to_owned())
+            Ok(sentence)
         }
     }
 
@@ -52,9 +52,9 @@ pub fn truncate_encodings(
             }
 
             encoding.truncate(encoding.get_ids().len() - n_first, stride);
-            pair_encoding
-                .as_mut()
-                .map(|encoding| encoding.truncate(encoding.get_ids().len() - n_second, stride));
+            if let Some(encoding) = pair_encoding.as_mut() {
+                encoding.truncate(encoding.get_ids().len() - n_second, stride);
+            }
         }
         TruncationStrategy::OnlyFirst | TruncationStrategy::OnlySecond => {
             let target = if strategy == TruncationStrategy::OnlyFirst {
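The final hunk above is the only multi-line restructuring: `clippy::option_map_unit_fn` flags an `Option::map` called only for its side effect, since the resulting `Option<()>` is silently discarded. A sketch of the same shape, with hypothetical names:

fn truncate_pair(pair: &mut Option<Vec<u32>>, n: usize) {
    // `pair.as_mut().map(|v| v.truncate(n));` would trigger the lint;
    // `if let` performs the same in-place mutation explicitly.
    if let Some(v) = pair.as_mut() {
        v.truncate(n);
    }
}

fn main() {
    let mut pair = Some(vec![1, 2, 3, 4]);
    truncate_pair(&mut pair, 2);
    assert_eq!(pair, Some(vec![1, 2]));
}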