mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
Fix BPE saving (u32 => String)
This commit is contained in:
@ -276,16 +276,18 @@ impl Model for BPE {
|
|||||||
.iter()
|
.iter()
|
||||||
.collect();
|
.collect();
|
||||||
let mut merges_file = File::create(&merges_path)?;
|
let mut merges_file = File::create(&merges_path)?;
|
||||||
let mut merges: Vec<(Pair, u32)> = self
|
let mut merges: Vec<(&Pair, &u32)> = self
|
||||||
.merges
|
.merges
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(pair, (rank, _))| (*pair, *rank))
|
.map(|(pair, (rank, _))| (pair, rank))
|
||||||
.collect();
|
.collect();
|
||||||
merges.sort_unstable_by_key(|k| k.1);
|
merges.sort_unstable_by_key(|k| *k.1);
|
||||||
merges_file.write_all(
|
merges_file.write_all(
|
||||||
&merges
|
&merges
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(pair, _)| format!("{} {}\n", pair.0, pair.1).into_bytes())
|
.map(|(pair, _)| {
|
||||||
|
format!("{} {}\n", self.vocab_r[&pair.0], self.vocab_r[&pair.1]).into_bytes()
|
||||||
|
})
|
||||||
.flatten()
|
.flatten()
|
||||||
.collect::<Vec<_>>()[..],
|
.collect::<Vec<_>>()[..],
|
||||||
)?;
|
)?;
|
||||||
|
Reference in New Issue
Block a user