Adding a fully rust (safe) version of esaxx.

This commit is contained in:
Nicolas Patry
2020-07-01 14:20:34 +02:00
parent b5ed9aebff
commit 3fea09b6c0
6 changed files with 9198 additions and 29 deletions

8406
data/eighty.txt Normal file

File diff suppressed because it is too large Load Diff

93
src/esa.rs Normal file
View File

@ -0,0 +1,93 @@
use crate::sais::saisxx;
use crate::types::{SArray, StringT, SuffixError};
use std::convert::TryInto;
fn suffixtree(
t: &StringT,
sa: &mut SArray,
l: &mut SArray,
r: &mut SArray,
d: &mut SArray,
n: usize,
) -> usize {
if n == 0 {
return 0;
}
println!("-------------");
println!("sa {:?}", sa);
// Psi = l
l[sa[0]] = sa[n - 1];
for i in 1..n {
l[sa[i]] = sa[i - 1];
}
// Compare at most 2n log n charcters. Practically fastest
// "Permuted Longest-Common-Prefix Array", Juha Karkkainen, CPM 09
// PLCP = r
let mut h = 0;
for i in 0..n {
let j = l[i];
while i + h < n && j + h < n && t[i + h] == t[j + h] {
h += 1;
}
r[i] = h;
if h > 0 {
h -= 1;
}
}
// H = l
for i in 0..n {
l[i] = r[sa[i]];
}
// TODO XXX: i32 necessary
// l[0] = -1;
let mut s: Vec<(i32, i32)> = vec![(-1, -1)];
let mut node_num = 0;
let mut i: usize = 0;
loop {
let mut cur: (i32, i32) = (i as i32, if i == n { -1 } else { l[i] as i32 });
let mut cand = s[s.len() - 1];
while cand.1 > cur.1 {
if (i as i32) - cand.0 > 1 {
l[node_num] = cand.0.try_into().unwrap();
r[node_num] = i;
d[node_num] = cand.1.try_into().unwrap();
node_num += 1;
if node_num >= n {
break;
}
}
cur.0 = cand.0;
s.pop();
cand = s[s.len() - 1];
}
if cand.1 < cur.1 {
s.push(cur);
}
if i == n {
break;
}
s.push((i.try_into().unwrap(), (n - sa[i] + 1).try_into().unwrap()));
i += 1;
}
node_num
}
pub fn esaxx_rs(
t: &StringT,
sa: &mut SArray,
l: &mut SArray,
r: &mut SArray,
d: &mut SArray,
k: usize,
) -> Result<usize, SuffixError> {
let n = t.len();
if k <= 0 {
return Err(SuffixError::InvalidLength);
}
saisxx(t, sa, n, k)?;
let node_num = suffixtree(t, sa, l, r, d, n);
Ok(node_num)
}

View File

@ -13,8 +13,16 @@
//! assert_eq!(iter.next(), Some((&chars[..0], 11))); // ''
//! assert_eq!(iter.next(), None);
//! ```
#![feature(test)]
extern crate test;
use std::convert::TryInto;
mod esa;
mod sais;
mod types;
pub use esa::esaxx_rs;
use types::SuffixError;
extern "C" {
pub fn esaxx_int32(
@ -30,19 +38,6 @@ extern "C" {
) -> i32;
}
#[derive(Debug)]
pub enum Error {
InvalidLength,
Internal,
IntConversion(std::num::TryFromIntError),
}
impl From<std::num::TryFromIntError> for Error {
fn from(err: std::num::TryFromIntError) -> Self {
Self::IntConversion(err)
}
}
pub fn esaxx(
chars: &[char],
sa: &mut [i32],
@ -51,10 +46,10 @@ pub fn esaxx(
d: &mut [i32],
alphabet_size: u32,
mut node_num: &mut u32,
) -> Result<(), Error> {
) -> Result<(), SuffixError> {
let n = chars.len();
if sa.len() != n || l.len() != n || r.len() != n || d.len() != n {
return Err(Error::InvalidLength);
return Err(SuffixError::InvalidLength);
}
unsafe {
let err = esaxx_int32(
@ -68,27 +63,46 @@ pub fn esaxx(
&mut node_num,
);
if err != 0 {
return Err(Error::Internal);
return Err(SuffixError::Internal);
}
}
Ok(())
}
pub struct SuffixIterator<'a> {
pub struct SuffixIterator<'a, T> {
i: usize,
suffix: &'a Suffix,
suffix: &'a Suffix<T>,
}
pub struct Suffix {
pub struct Suffix<T> {
chars: Vec<char>,
sa: Vec<i32>,
l: Vec<i32>,
r: Vec<i32>,
d: Vec<i32>,
sa: Vec<T>,
l: Vec<T>,
r: Vec<T>,
d: Vec<T>,
node_num: usize,
}
pub fn suffix(string: &str) -> Result<Suffix, Error> {
pub fn suffix_rs(string: &str) -> Result<Suffix<usize>, SuffixError> {
let chars: Vec<_> = string.chars().collect();
let n = chars.len();
let mut sa = vec![0; n];
let mut l = vec![0; n];
let mut r = vec![0; n];
let mut d = vec![0; n];
let alphabet_size = 0x110000; // All UCS4 range.
let node_num = esaxx_rs(&chars, &mut sa, &mut l, &mut r, &mut d, alphabet_size)?;
Ok(Suffix {
chars,
sa,
l,
r,
d,
node_num,
})
}
pub fn suffix(string: &str) -> Result<Suffix<i32>, SuffixError> {
let chars: Vec<_> = string.chars().collect();
let n = chars.len();
let mut sa = vec![0; n];
@ -116,13 +130,13 @@ pub fn suffix(string: &str) -> Result<Suffix, Error> {
})
}
impl Suffix {
pub fn iter(&self) -> SuffixIterator<'_> {
impl<T> Suffix<T> {
pub fn iter(&self) -> SuffixIterator<'_, T> {
SuffixIterator { i: 0, suffix: self }
}
}
impl<'a> Iterator for SuffixIterator<'a> {
impl<'a> Iterator for SuffixIterator<'a, i32> {
type Item = (&'a [char], u32);
fn next(&mut self) -> Option<Self::Item> {
@ -142,9 +156,30 @@ impl<'a> Iterator for SuffixIterator<'a> {
}
}
impl<'a> Iterator for SuffixIterator<'a, usize> {
type Item = (&'a [char], u32);
fn next(&mut self) -> Option<Self::Item> {
let index = self.i;
if index == self.suffix.node_num {
None
} else {
let left: usize = self.suffix.l[index];
let offset: usize = self.suffix.sa[left];
let len: usize = self.suffix.d[index];
let freq: u32 = (self.suffix.r[index] - self.suffix.l[index])
.try_into()
.unwrap();
self.i += 1;
Some((&self.suffix.chars[offset..offset + len], freq))
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
#[test]
fn test_esaxx() {
@ -175,6 +210,80 @@ mod tests {
assert_eq!(d, vec![4, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0]);
}
#[test]
fn test_esaxx_long() {
let string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.".to_string();
let chars: Vec<_> = string.chars().collect();
let n = chars.len();
let mut sa = vec![0; n];
let mut l = vec![0; n];
let mut r = vec![0; n];
let mut d = vec![0; n];
let mut node_num = 0;
let alphabet_size = 0x110000; // All UCS4 range.
esaxx(
&chars,
&mut sa,
&mut l,
&mut r,
&mut d,
alphabet_size,
&mut node_num,
)
.unwrap();
assert_eq!(chars.len(), 574);
assert_eq!(node_num, 260);
// assert_eq!(sa, vec![10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]);
// assert_eq!(l, vec![1, 0, 5, 9, 0, 0, 3, 0, 0, 0, 2]);
// assert_eq!(r, vec![3, 5, 7, 11, 11, 1, 0, 1, 0, 0, 0]);
// assert_eq!(d, vec![4, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0]);
}
#[test]
fn test_esaxx_rs() {
let string = "abracadabra".to_string();
let chars: Vec<_> = string.chars().collect();
let n = chars.len();
let mut sa = vec![0; n];
let mut l = vec![0; n];
let mut r = vec![0; n];
let mut d = vec![0; n];
let alphabet_size = 0x110000; // All UCS4 range.
let node_num = esaxx_rs(&chars, &mut sa, &mut l, &mut r, &mut d, alphabet_size).unwrap();
println!("Node num {}", node_num);
println!("sa {:?}", sa);
println!("l {:?}", l);
println!("r {:?}", r);
println!("d {:?}", d);
assert_eq!(node_num, 5);
assert_eq!(sa, vec![10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]);
assert_eq!(l, vec![1, 0, 5, 9, 0, 0, 3, 0, 0, 0, 2]);
assert_eq!(r, vec![3, 5, 7, 11, 11, 1, 0, 1, 0, 0, 0]);
assert_eq!(d, vec![4, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0]);
}
#[test]
fn test_esaxx_rs_long() {
let string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.".to_string();
let chars: Vec<_> = string.chars().collect();
let n = chars.len();
let mut sa = vec![0; n];
let mut l = vec![0; n];
let mut r = vec![0; n];
let mut d = vec![0; n];
let alphabet_size = 0x110000; // All UCS4 range.
let node_num = esaxx_rs(&chars, &mut sa, &mut l, &mut r, &mut d, alphabet_size).unwrap();
assert_eq!(chars.len(), 574);
assert_eq!(node_num, 260);
// assert_eq!(sa, vec![10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]);
// assert_eq!(l, vec![1, 0, 5, 9, 0, 0, 3, 0, 0, 0, 2]);
// assert_eq!(r, vec![3, 5, 7, 11, 11, 1, 0, 1, 0, 0, 0]);
// assert_eq!(d, vec![4, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0]);
}
#[test]
fn test_suffix() {
let suffix = suffix("abracadabra").unwrap();
@ -193,4 +302,47 @@ mod tests {
assert_eq!(iter.next(), Some((&chars[..0], 11))); // ''
assert_eq!(iter.next(), None);
}
#[test]
fn test_suffix_rs() {
let suffix = suffix_rs("abracadabra").unwrap();
assert_eq!(suffix.node_num, 5);
assert_eq!(suffix.sa, vec![10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]);
assert_eq!(suffix.l, vec![1, 0, 5, 9, 0, 0, 3, 0, 0, 0, 2]);
assert_eq!(suffix.r, vec![3, 5, 7, 11, 11, 1, 0, 1, 0, 0, 0]);
assert_eq!(suffix.d, vec![4, 1, 3, 2, 0, 0, 0, 0, 0, 0, 0]);
let mut iter = suffix.iter();
let chars: Vec<_> = "abracadabra".chars().collect();
assert_eq!(iter.next(), Some((&chars[..4], 2))); // abra
assert_eq!(iter.next(), Some((&chars[..1], 5))); // a
assert_eq!(iter.next(), Some((&chars[1..4], 2))); // bra
assert_eq!(iter.next(), Some((&chars[2..4], 2))); // ra
assert_eq!(iter.next(), Some((&chars[..0], 11))); // ''
assert_eq!(iter.next(), None);
}
#[bench]
fn bench_esaxx_long(b: &mut Bencher) {
let string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.".to_string();
b.iter(|| suffix(&string).unwrap());
}
#[bench]
fn bench_esaxx_long2(b: &mut Bencher) {
let string = std::fs::read_to_string("data/eighty.txt").unwrap();
b.iter(|| suffix(&string).unwrap());
}
#[bench]
fn bench_esaxx_rs_long(b: &mut Bencher) {
let string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.".to_string();
b.iter(|| suffix_rs(&string).unwrap());
}
#[bench]
fn bench_esaxx_rs_long2(b: &mut Bencher) {
let string = std::fs::read_to_string("data/eighty.txt").unwrap();
b.iter(|| suffix_rs(&string).unwrap());
}
}

View File

@ -38,6 +38,12 @@
#ifdef _OPENMP
# include <omp.h>
#endif
#include <iostream>
int32_t f(const int32_t s){
int a = ((s < 0)?-~s:s);
return a;
}
namespace saisxx_private {
@ -92,12 +98,15 @@ typedef typename std::iterator_traits<string_type>::value_type char_type;
index_type i, j;
char_type c0, c1;
/* compute SAl */
if(C == B) { getCounts(T, C, n, k); }
if(C == B) {
getCounts(T, C, n, k); }
getBuckets(C, B, k, false); /* find starts of buckets */
b = SA + B[c1 = T[j = n - 1]];
*b++ = ((0 < j) && (T[j - 1] < c1)) ? ~j : j;
for(i = 0; i < n; ++i) {
j = SA[i], SA[i] = ~j;
if(0 < j) {
if((c0 = T[--j]) != c1) { B[c1] = b - SA; b = SA + B[c1 = c0]; }
*b++ = ((0 < j) && (T[j - 1] < c1)) ? ~j : j;
@ -106,6 +115,7 @@ typedef typename std::iterator_traits<string_type>::value_type char_type;
/* compute SAs */
if(C == B) { getCounts(T, C, n, k); }
getBuckets(C, B, k, true); /* find ends of buckets */
for(i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) {
if(0 < (j = SA[i])) {
if((c0 = T[--j]) != c1) { B[c1] = b - SA; b = SA + B[c1 = c0]; }
@ -225,7 +235,8 @@ typedef typename std::iterator_traits<string_type>::value_type char_type;
p = SA[i];
if((0 < p) && (T[p - 1] > (c0 = T[p]))) {
for(j = p + 1; (j < n) && (c0 == (c1 = T[j])); ++j) { }
if((j < n) && (c0 < c1)) { SA[m++] = p; }
if((j < n) && (c0 < c1)) {
SA[m++] = p; }
}
}
#endif

491
src/sais.rs Normal file
View File

@ -0,0 +1,491 @@
use crate::types::{Bucket, SArray, StringT, SuffixError};
use std::char;
fn has_high_bit(j: usize) -> bool {
j > usize::MAX / 2
}
// TODO : Parallelize this
fn get_counts(t: &StringT, c: &mut Bucket, n: usize, k: usize) {
for i in 0..k {
c[i] = 0;
}
for i in 0..n {
c[t[i] as usize] += 1;
}
}
fn get_buckets(c: &Bucket, b: &mut Bucket, k: usize, end: bool) {
let mut sum = 0;
if end {
for i in 0..k {
sum += c[i];
b[i] = sum;
}
} else {
for i in 0..k {
b[i] = sum;
sum += c[i];
}
}
}
fn induce_sa(t: &StringT, sa: &mut SArray, c: &mut Bucket, b: &mut Bucket, n: usize, k: usize) {
assert!(n <= sa.len());
get_counts(t, c, n, k);
get_buckets(c, b, k, false);
let mut c0;
let mut j = n - 1;
let mut c1 = t[j] as usize;
let mut index = b[c1];
sa[index] = if j > 0 && (t[j - 1] as usize) < c1 {
!j
} else {
j
};
index += 1;
for i in 0..n {
j = sa[i];
sa[i] = !j;
if !has_high_bit(j) && j > 0 {
j -= 1;
c0 = t[j] as usize;
if c0 != c1 {
b[c1] = index;
c1 = c0;
index = b[c1];
}
sa[index] = if j > 0 && !has_high_bit(j) && (t[j - 1] as usize) < c1 {
!j
} else {
j
};
index += 1;
}
}
// Compute SA
// XXX: true here.
get_counts(t, c, n, k);
get_buckets(c, b, k, true);
c1 = 0;
index = b[c1];
for i in (0..n).rev() {
j = sa[i];
if j > 0 && !has_high_bit(j) {
j -= 1;
c0 = t[j] as usize;
if c0 != c1 {
b[c1] = index;
c1 = c0;
index = b[c1];
}
index -= 1;
sa[index] = if j == 0 || (t[j - 1] as usize) > c1 {
!j
} else {
j
};
} else {
sa[i] = !j;
}
}
}
fn compute_bwt(
t: &StringT,
sa: &mut SArray,
c: &mut Bucket,
b: &mut Bucket,
n: usize,
k: usize,
) -> usize {
// TODO
let mut pidx = 0;
get_counts(t, c, n, k);
get_buckets(c, b, k, false);
let mut j = n - 1;
let mut c1 = t[j] as usize;
let mut c0;
let mut index = b[c1];
// bb = SA + B[c1 = T[j = n - 1]];
// *bb++ = ((0 < j) && (T[j - 1] < c1)) ? ~j : j;
sa[index] = if j > 0 && (t[j - 1] as usize) < c1 {
!j
} else {
j
};
index += 1;
for i in 0..n {
j = sa[i];
if j > 0 {
j -= 1;
c0 = t[j] as usize;
sa[i] = !c0;
if c0 != c1 {
b[c1] = index;
c1 = c0;
index = b[c1];
}
sa[index] = if j > 0 && (t[j - 1] as usize) < c1 {
!j
} else {
j
};
index += 1;
} else if j != 0 {
sa[i] = !j;
}
}
// Compute SA
get_counts(t, c, n, k);
get_buckets(c, b, k, true);
c1 = 0;
index = b[c1];
for i in (0..n).rev() {
j = sa[i];
if j > 0 {
j -= 1;
c0 = t[j] as usize;
sa[i] = c0;
if c0 != c1 {
b[c1] = index;
c1 = c0;
index = b[c1];
}
index -= 1;
sa[index] = if j > 0 && (t[j - 1] as usize) > c1 {
!(t[j - 1] as usize)
} else {
j
};
} else if j != 0 {
sa[i] = !j;
} else {
pidx = i
}
}
pidx
}
fn suffixsort(
t: &StringT,
mut sa: &mut SArray,
fs: usize,
n: usize,
k: usize,
is_bwt: bool,
) -> Result<usize, SuffixError> {
let mut pidx = 0;
let mut c0;
let mut c1;
let mut c = vec![0; k];
let mut b = vec![0; k];
get_counts(t, &mut c, n, k);
get_buckets(&mut c, &mut b, k, true);
// stage 1:
// reduce the problem by at least 1/2
// sort all the S-substrings
for i in 0..n {
sa[i] = 0;
}
let mut c_index = 0;
c1 = t[n - 1] as usize;
for i in (0..n - 1).rev() {
c0 = t[i] as usize;
if c0 < c1 + c_index {
c_index = 1;
} else if c_index != 0 {
b[c1] -= 1;
sa[b[c1]] = i + 1;
c_index = 0;
}
c1 = c0;
}
induce_sa(t, &mut sa, &mut c, &mut b, n, k);
// compact all the sorted substrings into the first m items of SA
// 2*m must be not larger than n (proveable)
// TODO: This was in the parallel loop.
let mut p;
let mut j;
let mut m = 0;
for i in 0..n {
p = sa[i];
c0 = t[p] as usize;
if p > 0 && (t[p - 1] as usize) > c0 {
// TODO overly complex. But fricking hard to get right.
j = p + 1;
if j < n {
c1 = t[j] as usize;
}
while j < n && c0 == c1 {
j += 1;
c1 = t[j] as usize;
}
if j < n && c0 < c1 {
sa[m] = p;
m += 1;
}
}
}
j = m + (n >> 1);
for i in m..j {
sa[i] = 0;
}
/* store the length of all substrings */
j = n;
let mut c_index = 0;
c1 = t[n - 1] as usize;
for i in (0..n - 1).rev() {
c0 = t[i] as usize;
if c0 < c1 + c_index {
c_index = 1;
} else if c_index != 0 {
sa[m + ((i + 1) >> 1)] = j - i - 1;
j = i + 1;
c_index = 0;
}
c1 = c0;
}
/* find the lexicographic names of all substrings */
let mut name = 0;
let mut q = n;
let mut qlen = 0;
let mut plen;
let mut diff;
for i in 0..m {
p = sa[i];
plen = sa[m + (p >> 1)];
diff = true;
if plen == qlen {
j = 0;
while j < plen && t[p + j] == t[q + j] {
j += 1;
}
if j == plen {
diff = false;
}
}
if diff {
name += 1;
q = p;
qlen = plen;
}
sa[m + (p >> 1)] = name;
}
/* stage 2: solve the reduced problem
recurse if names are not yet unique */
if name < m {
let ra_index = n + fs - m;
j = m - 1;
let a = m + (n >> 1);
for i in (m..a).rev() {
if sa[i] != 0 {
sa[ra_index + j] = sa[i] - 1;
// XXX: Bug underflow caught by Rust yeah (well cpp used i32)
if j > 0 {
j -= 1;
}
}
}
// XXX: Could call transmute on SA to avoid allocation.
// but it requires unsafe.
let ra: Vec<char> = sa
.iter()
.skip(ra_index)
.take(m)
.map(|n| char::from_u32(*n as u32).unwrap())
.collect();
suffixsort(&ra, sa, fs + n - m * 2, m, name, false)?;
// let ra: &[char] =
// unsafe { std::mem::transmute::<&[usize], &[char]>(&sa[ra_index..ra_index + m]) };
// suffixsort(ra, sa, fs + n - m * 2, m, name, false)?;
j = m - 1;
c_index = 0;
c1 = t[n - 1] as usize;
for i in (0..n - 1).rev() {
c0 = t[i] as usize;
if c0 < c1 + c_index {
c_index = 1;
} else if c_index != 0 {
sa[ra_index + j] = i + 1;
c_index = 0;
if j > 0 {
j -= 1; /* get p1 */
}
}
c1 = c0;
}
// get index in s
for i in 0..m {
sa[i] = sa[ra_index + sa[i]];
}
}
/* stage 3: induce the result for the original problem */
let mut c = vec![0; k];
let mut b = vec![0; k];
/* put all left-most S characters into their buckets */
get_counts(t, &mut c, n, k);
get_buckets(&mut c, &mut b, k, true);
for i in m..n {
sa[i] = 0;
}
for i in (0..m).rev() {
j = sa[i];
sa[i] = 0;
if b[t[j] as usize] > 0 {
b[t[j] as usize] -= 1;
sa[b[t[j] as usize]] = j;
}
}
if is_bwt {
pidx = compute_bwt(t, &mut sa, &mut c, &mut b, n, k);
} else {
induce_sa(t, &mut sa, &mut c, &mut b, n, k);
}
Ok(pidx)
}
pub fn saisxx(t: &StringT, mut sa: &mut SArray, n: usize, k: usize) -> Result<(), SuffixError> {
if k <= 0 {
return Err(SuffixError::InvalidLength);
}
if n == 1 {
sa[0] = 0;
return Ok(());
}
let fs = 0;
suffixsort(t, &mut sa, fs, n, k, false)?;
Ok(())
}
fn _saisxx_bwt(
t: &StringT,
u: &mut StringT,
sa: &mut SArray,
n: usize,
k: usize,
) -> Result<usize, SuffixError> {
if k <= 0 {
return Err(SuffixError::InvalidLength);
}
if n <= 1 {
if n == 1 {
u[0] = t[0];
}
return Ok(n);
}
let mut pidx = suffixsort(t, sa, 0, n, k, true)?;
u[0] = t[n - 1];
for i in 0..pidx {
u[i + 1] = char::from_u32(sa[i] as u32).unwrap(); // cast to char
}
for i in pidx + 1..n {
u[i] = char::from_u32(sa[i] as u32).unwrap();
}
pidx += 1;
Ok(pidx)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_induce_sa() {
let chars: Vec<_> = "abracadabra".chars().collect();
let mut c = vec![0; 256];
let mut b = vec![0; 256];
let mut sa = vec![0, 0, 3, 5, 7, 0, 0, 0, 0, 0, 0];
induce_sa(&chars, &mut sa, &mut b, &mut c, chars.len(), 256);
assert_eq!(sa, vec![10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]);
let mut sa = vec![0, 0, 7, 3, 5, 0, 0, 0, 0, 0, 0];
induce_sa(&chars, &mut sa, &mut b, &mut c, chars.len(), 256);
assert_eq!(sa, vec![10, 7, 0, 3, 5, 8, 1, 4, 6, 9, 2]);
}
#[test]
fn test_induce_sa_long() {
let string = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.".to_string();
let chars: Vec<_> = string.chars().collect();
let mut c = vec![0; 256];
let mut b = vec![0; 256];
let mut sa = vec![
5, 11, 14, 21, 27, 32, 35, 39, 48, 52, 64, 74, 80, 86, 90, 95, 99, 110, 119, 125, 130,
135, 141, 145, 152, 157, 160, 168, 176, 181, 183, 190, 193, 198, 202, 212, 215, 218,
223, 225, 230, 239, 245, 248, 252, 261, 265, 270, 275, 286, 290, 295, 299, 304, 309,
320, 333, 343, 355, 366, 369, 373, 385, 388, 392, 398, 403, 407, 415, 418, 427, 434,
445, 451, 457, 467, 471, 476, 485, 490, 498, 509, 518, 523, 529, 539, 549, 558, 561,
567, 108, 0, 0, 0, 0, 0, 0, 0, 0, 0, 148, 396, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 534, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 88, 113, 116, 185, 206, 220, 250, 302,
337, 351, 360, 371, 379, 412, 423, 439, 459, 462, 515, 0, 0, 0, 208, 501, 0, 0, 0, 139,
204, 234, 313, 358, 479, 542, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 67, 102, 526, 545, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 3, 29, 56, 58, 78, 127, 133, 155, 174, 188, 237, 283, 324, 326, 335,
347, 409, 425, 430, 449, 464, 537, 551, 565, 0, 0, 0, 0, 0, 512, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 505, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 16, 42, 45, 61, 137, 171, 257, 329, 340, 381, 400, 442, 487, 503,
520, 554, 0, 0, 0, 0, 0, 163, 494, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 268,
483, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 122, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
437, 556, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 375,
496, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71, 106, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69, 104, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
assert_eq!(sa.len(), chars.len());
induce_sa(&chars, &mut sa, &mut b, &mut c, chars.len(), 256);
assert_eq!(
sa,
vec![
145, 392, 523, 5, 80, 451, 567, 245, 366, 418, 74, 445, 561, 529, 181, 223, 290,
157, 48, 198, 467, 90, 239, 286, 275, 434, 490, 21, 119, 309, 343, 130, 270, 183,
86, 248, 385, 539, 64, 99, 304, 11, 212, 299, 518, 218, 471, 261, 32, 190, 415,
558, 265, 457, 373, 39, 168, 498, 476, 333, 407, 202, 427, 14, 135, 509, 230, 110,
252, 27, 125, 35, 95, 141, 295, 388, 403, 215, 176, 193, 225, 52, 320, 355, 160,
549, 369, 152, 398, 485, 108, 151, 285, 332, 466, 573, 73, 244, 365, 148, 396, 149,
146, 393, 147, 395, 394, 524, 6, 81, 452, 568, 246, 367, 419, 0, 75, 446, 562, 534,
530, 182, 224, 531, 462, 337, 439, 220, 535, 185, 351, 291, 206, 158, 49, 199, 468,
113, 360, 302, 116, 515, 379, 88, 250, 371, 412, 423, 459, 91, 208, 501, 240, 287,
319, 139, 479, 276, 358, 234, 542, 435, 204, 313, 51, 118, 201, 211, 260, 384, 470,
364, 115, 491, 545, 22, 120, 526, 67, 102, 38, 98, 140, 144, 197, 222, 229, 274,
298, 391, 406, 414, 475, 517, 522, 533, 301, 411, 233, 312, 478, 210, 259, 383,
363, 92, 430, 409, 310, 3, 78, 449, 565, 335, 93, 155, 237, 347, 480, 277, 133,
174, 537, 551, 283, 464, 56, 324, 492, 344, 425, 420, 431, 58, 326, 131, 29, 127,
188, 34, 192, 417, 560, 271, 512, 47, 63, 342, 444, 508, 548, 331, 184, 532, 362,
463, 402, 489, 87, 249, 359, 37, 97, 143, 297, 390, 405, 154, 429, 505, 350, 318,
282, 520, 235, 16, 386, 137, 540, 65, 100, 45, 61, 340, 442, 506, 546, 329, 338,
440, 171, 42, 305, 554, 12, 381, 503, 213, 400, 487, 272, 257, 180, 243, 221, 521,
536, 163, 494, 378, 525, 300, 410, 311, 209, 187, 502, 519, 186, 352, 292, 543, 19,
268, 353, 483, 4, 10, 79, 85, 450, 456, 566, 572, 219, 336, 207, 236, 24, 122, 472,
17, 25, 123, 94, 156, 159, 167, 238, 387, 138, 357, 541, 50, 200, 469, 114, 66,
101, 46, 62, 341, 443, 507, 547, 330, 361, 317, 339, 441, 162, 267, 262, 164, 556,
437, 172, 348, 43, 481, 306, 278, 217, 294, 308, 33, 191, 416, 559, 511, 179, 242,
316, 266, 436, 555, 178, 241, 496, 375, 473, 1, 76, 447, 563, 263, 165, 303, 497,
458, 196, 228, 232, 55, 323, 18, 374, 40, 169, 7, 82, 453, 569, 499, 376, 134, 175,
538, 205, 422, 117, 474, 516, 477, 2, 77, 334, 408, 448, 564, 281, 41, 170, 380,
315, 552, 255, 106, 71, 13, 89, 109, 251, 372, 397, 433, 528, 557, 150, 284, 465,
461, 203, 413, 382, 57, 325, 346, 424, 428, 504, 15, 136, 553, 493, 293, 510, 231,
460, 345, 111, 69, 104, 8, 83, 454, 570, 253, 31, 129, 214, 247, 264, 289, 368,
426, 112, 438, 28, 126, 173, 401, 488, 36, 96, 142, 296, 389, 404, 349, 44, 60,
328, 482, 216, 307, 177, 495, 421, 314, 70, 105, 432, 59, 327, 279, 513, 194, 226,
53, 321, 500, 544, 377, 9, 84, 455, 571, 23, 121, 356, 161, 280, 254, 527, 68, 103,
288, 273, 258, 132, 550, 256, 370, 514, 153, 399, 486, 166, 30, 128, 20, 26, 124,
189, 269, 354, 484, 107, 72, 195, 227, 54, 322
]
);
}
}

16
src/types.rs Normal file
View File

@ -0,0 +1,16 @@
pub type Bucket = [usize];
pub type StringT = [char];
pub type SArray = [usize];
#[derive(Debug)]
pub enum SuffixError {
InvalidLength,
Internal,
IntConversion(std::num::TryFromIntError),
}
impl From<std::num::TryFromIntError> for SuffixError {
fn from(err: std::num::TryFromIntError) -> Self {
Self::IntConversion(err)
}
}