mirror of
https://github.com/mii443/esaxx-rs.git
synced 2025-08-22 06:55:37 +00:00
Fix: use u32 instead of char, because we were actually producing invalid char values
in the suffixsort function.
This commit is contained in:
@ -4,18 +4,18 @@ use esaxx_rs::{suffix, suffix_rs};
|
||||
fn criterion_benchmark(c: &mut Criterion) {
|
||||
let string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.".to_string();
|
||||
|
||||
c.bench_function("Esaxx C++ short", |b| {
|
||||
c.bench_function("suffix_cpp_short", |b| {
|
||||
b.iter(|| suffix(black_box(&string)).unwrap())
|
||||
});
|
||||
c.bench_function("Esaxx Rust short", |b| {
|
||||
c.bench_function("suffix_rust_short", |b| {
|
||||
b.iter(|| suffix_rs(black_box(&string)).unwrap())
|
||||
});
|
||||
|
||||
let string = std::fs::read_to_string("data/eighty.txt").unwrap();
|
||||
c.bench_function("Esaxx C++ long", |b| {
|
||||
c.bench_function("suffix_cpp_long", |b| {
|
||||
b.iter(|| suffix(black_box(&string)).unwrap())
|
||||
});
|
||||
c.bench_function("Esaxx Rust long", |b| {
|
||||
c.bench_function("suffix_rust_long", |b| {
|
||||
b.iter(|| suffix_rs(black_box(&string)).unwrap())
|
||||
});
|
||||
}
|
18
examples/esaxx.rs
Normal file
18
examples/esaxx.rs
Normal file
@ -0,0 +1,18 @@
|
||||
use esaxx_rs::{suffix, suffix_rs};
|
||||
use std::env::args;
|
||||
use std::fs;
|
||||
|
||||
fn main() {
|
||||
// Prints each argument on a separate line
|
||||
let args: Vec<_> = args().skip(1).collect();
|
||||
let version = &args[0];
|
||||
let filename = &args[1];
|
||||
|
||||
let string = fs::read_to_string(filename).unwrap();
|
||||
let (count, version) = if version == "rust" {
|
||||
(suffix_rs(&string).unwrap().iter().count(), "Rust")
|
||||
} else {
|
||||
(suffix(&string).unwrap().iter().count(), "Cpp")
|
||||
};
|
||||
println!("Used {} version ! Found {} nodes", version, count);
|
||||
}
|
13
src/lib.rs
13
src/lib.rs
@ -110,7 +110,14 @@ pub fn suffix_rs(string: &str) -> Result<Suffix<usize>, SuffixError> {
|
||||
let mut r = vec![0; n];
|
||||
let mut d = vec![0; n];
|
||||
let alphabet_size = 0x110000; // All UCS4 range.
|
||||
let node_num = esaxx_rs(&chars, &mut sa, &mut l, &mut r, &mut d, alphabet_size)?;
|
||||
let node_num = esaxx_rs(
|
||||
&chars.iter().map(|c| *c as u32).collect::<Vec<_>>(),
|
||||
&mut sa,
|
||||
&mut l,
|
||||
&mut r,
|
||||
&mut d,
|
||||
alphabet_size,
|
||||
)?;
|
||||
Ok(Suffix {
|
||||
chars,
|
||||
sa,
|
||||
@ -277,7 +284,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_esaxx_rs() {
|
||||
let string = "abracadabra".to_string();
|
||||
let chars: Vec<_> = string.chars().collect();
|
||||
let chars: Vec<_> = string.chars().map(|c| c as u32).collect();
|
||||
let n = chars.len();
|
||||
let mut sa = vec![0; n];
|
||||
let mut l = vec![0; n];
|
||||
@ -301,7 +308,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_esaxx_rs_long() {
|
||||
let string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.".to_string();
|
||||
let chars: Vec<_> = string.chars().collect();
|
||||
let chars: Vec<_> = string.chars().map(|c| c as u32).collect();
|
||||
let n = chars.len();
|
||||
let mut sa = vec![0; n];
|
||||
let mut l = vec![0; n];
|
||||
|
16
src/sais.rs
16
src/sais.rs
@ -1,11 +1,9 @@
|
||||
use crate::types::{Bucket, SArray, StringT, SuffixError};
|
||||
use std::char;
|
||||
|
||||
fn has_high_bit(j: usize) -> bool {
|
||||
j > usize::MAX / 2
|
||||
}
|
||||
|
||||
// TODO : Parallelize this
|
||||
fn get_counts(t: &StringT, c: &mut Bucket) {
|
||||
for item in c.iter_mut() {
|
||||
*item = 0;
|
||||
@ -305,11 +303,11 @@ fn suffixsort(
|
||||
}
|
||||
// XXX: Could call transmute on SA to avoid allocation.
|
||||
// but it requires unsafe.
|
||||
let ra: Vec<char> = suffix_array
|
||||
let ra: Vec<u32> = suffix_array
|
||||
.iter()
|
||||
.skip(ra_index)
|
||||
.take(m)
|
||||
.map(|n| char::from_u32(*n as u32).unwrap())
|
||||
.map(|n| *n as u32)
|
||||
.collect();
|
||||
suffixsort(&ra, suffix_array, fs + n - m * 2, m, name, false)?;
|
||||
// let ra: &[char] =
|
||||
@ -338,8 +336,6 @@ fn suffixsort(
|
||||
}
|
||||
|
||||
/* stage 3: induce the result for the original problem */
|
||||
let mut counts = vec![0; k];
|
||||
let mut buckets = vec![0; k];
|
||||
/* put all left-most S characters into their buckets */
|
||||
get_counts(string, &mut counts);
|
||||
get_buckets(&counts, &mut buckets, k, true);
|
||||
@ -393,10 +389,10 @@ fn _saisxx_bwt(
|
||||
let mut pidx = suffixsort(t, sa, 0, n, k, true)?;
|
||||
u[0] = t[n - 1];
|
||||
for i in 0..pidx {
|
||||
u[i + 1] = char::from_u32(sa[i] as u32).unwrap(); // cast to char
|
||||
u[i + 1] = sa[i] as u32;
|
||||
}
|
||||
for i in pidx + 1..n {
|
||||
u[i] = char::from_u32(sa[i] as u32).unwrap();
|
||||
u[i] = sa[i] as u32
|
||||
}
|
||||
pidx += 1;
|
||||
Ok(pidx)
|
||||
@ -408,7 +404,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_induce_sa() {
|
||||
let chars: Vec<_> = "abracadabra".chars().collect();
|
||||
let chars: Vec<_> = "abracadabra".chars().map(|c| c as u32).collect();
|
||||
let mut c = vec![0; 256];
|
||||
let mut b = vec![0; 256];
|
||||
|
||||
@ -424,7 +420,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_induce_sa_long() {
|
||||
let string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.".to_string();
|
||||
let chars: Vec<_> = string.chars().collect();
|
||||
let chars: Vec<_> = string.chars().map(|c| c as u32).collect();
|
||||
let mut c = vec![0; 256];
|
||||
let mut b = vec![0; 256];
|
||||
let mut sa = vec![
|
||||
|
@ -1,5 +1,8 @@
|
||||
pub type Bucket = [usize];
|
||||
pub type StringT = [char];
|
||||
/// We need to use u32 instead of char, because when we recurse
|
||||
/// we use suffix array elements as ways to replace our original
|
||||
/// string. Those elements are not always valid `char` values, so using chars can fail. See the `ra` variable in `suffixsort`.
|
||||
pub type StringT = [u32];
|
||||
pub type SArray = [usize];
|
||||
|
||||
#[derive(Debug)]
|
||||
|
Reference in New Issue
Block a user