DaviesX

April 14th, 2012, 04:59 AM

I am trying to learn a linear time suffix sorting algorithm who uses suffix array that save a bunch of memory. But I have difficulties understand the its radix sorting, Can you explain to me how it works ?

And I have got a c++ implementation, but I don't even know how to call the interface =,=,

Here is the code,

// This programme is proposed to implement a o(n) time complexity DC3 algorithm

#include <stdio.h>

#include <stdlib.h>

void suffixArray ( int* s, int* SA, int n, int K );

int main ( int argv, char *argc[] )

{

// I know the author tried to write comprehensive code

// but I still don't get it, even to call it =,=

return 0;

}

inline bool leq ( int a1, int a2, int b1, int b2 ) // lexic. order for pairs

{

return (a1 < b1 || a1 == b1 && a2 <= b2);

}

// and triples

inline bool leq ( int a1, int a2, int a3, int b1, int b2, int b3 )

{

return ( a1 < b1 || a1 == b1 && leq ( a2, a3, b2, b3 ) );

}

// stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r

static void radixPass ( int* a, int* b, int* r, int n, int K )

{

// count occurrences

int* c = new int[K + 1]; // counter array

for (int i = 0; i <= K; i++)

c[i] = 0; // reset counters

for (int i = 0; i < n; i++)

c[ r[ a[i] ] ]++; // count occurrences

// exclusive prefix sums

for ( int i = 0, sum = 0; i <= K; i++ )

{

int t = c[i];

c[i] = sum;

sum += t;

}

for (int i = 0; i < n; i++)

b[ c[ r[ a[i] ] ]++ ] = a[i]; // sort

delete [] c;

}

// find the suffix array SA of s[0..n-1] in {1..K}^n

// require s[n] = s[n + 1] = s[n + 2] = 0, n >= 2

void suffixArray ( int* s, int* SA, int n, int K )

{

// Here I think n might be the length of orginal suffix

int n0 = (n + 2)/3,

n1 = (n + 1)/3,

n2 = n/3,

n02 = n0 + n2; // (2n + 2)/3 can be seen as the number of triples whose

// pos mod 3 != 0

// I think s12 might be the array that holds the suffixes whose

// pos mod 3 != 0

int* s12 = new int[n02 + 3];

s12[n02] = s12[n02 + 1] = s12[n02 + 2] = 0; // Last 3 would end up with 0

int* SA12 = new int[n02 + 3]; // suffix array for pos mod 3 != 0

SA12[n02] = SA12[n02 + 1] = SA12[n02 + 2] = 0;

int* s0 = new int[n0]; // whose pos mod 3 = 0

int* SA0 = new int[n0]; // suffix array for pos mod 3 = 0

// generate positions of mod 1 and mod 2 suffixes

// the "+(n0-n1)" adds a dummy mod 1 suffix if n%3 == 1

for ( int i = 0, j = 0; i < n + (n0 - n1); i++ )

if ( i % 3 != 0 )

s12[j++] = i;

// lsb radix sort the mod 1 and mod 2 triples

radixPass ( s12 , SA12, s + 2, n02, K );

radixPass ( SA12, s12 , s + 1, n02, K );

radixPass ( s12 , SA12, s + 0, n02, K );

// find lexicographic names of triples

int name = 0, c0 = -1, c1 = -1, c2 = -1;

for ( int i = 0; i < n02; i++ )

{

if ( s[ SA12[i] + 0 ] != c0 ||

s[ SA12[i] + 1 ] != c1 ||

s[ SA12[i] + 2 ] != c2 )

{

name ++;

c0 = s[ SA12[i] + 0 ];

c1 = s[ SA12[i] + 1 ];

c2 = s[ SA12[i] + 2 ] ;

}

if ( SA12[i] % 3 == 1 )

{

s12[ SA12[i]/3 ] = name; // left half

}

else

{

s12[ SA12[i]/3 + n0 ] = name; // right half

}

}

// recurse if names are not yet unique

if ( name < n02 )

{

suffixArray ( s12, SA12, n02, name );

// store unique names in s12 using the suffix array

for ( int i = 0; i < n02; i++ )

s12[SA12[i]] = i + 1;

}

else

{

// generate the suffix array of s12 directly

for ( int i = 0; i < n02; i++ )

SA12[s12[i] - 1] = i;

}

// stably sort the mod 0 suffixes from SA12 by their first character

for ( int i=0, j=0; i < n02; i++ )

if ( SA12[i] < n0)

s0[j++] = 3*SA12[i];

radixPass ( s0, SA0, s, n0, K );

// merge sorted SA0 suffixes and sorted SA12 suffixes

for ( int p = 0, t = n0 - n1, k = 0; k < n; k++ )

{

int i = GetI(); // pos of current offset 12 suffix

int j = SA0[p]; // pos of current offset 0 suffix

if ( SA12[t] < n0 ?

leq ( s[i], s12[SA12[t] + n0], s[j], s12[j/3] ) :

leq ( s[i], s[i + 1], s12[SA12[t] - n0 + 1], s[j], s[j + 1], s12[j/3 + n0] ) )

{

#define GetI() (SA12[t] < n0 ? SA12[t]*3 + 1 : (SA12[t] - n0)*3 + 2)

// suffix from SA12 is smaller

SA[k] = i;

t ++;

if ( t == n02 ) // done --- only SA0 suffixes left

{

for ( k++; p < n0; p++, k++ )

SA[k] = SA0[p];

}

}

else

{

SA[k] = j;

p ++;

if (p == n0) // done --- only SA12 suffixes left

{

for ( k ++; t < n02; t++, k++ )

SA[k] = GetI();

}

}

}

delete [] s12;

delete [] SA12;

delete [] SA0;

delete [] s0;

}

I have studied it for days, but doesn't make sense with it, I don't want to give up, can you please tell me what is what ?

Thank you !

And I have got a c++ implementation, but I don't even know how to call the interface =,=,

Here is the code,

// This programme is proposed to implement a o(n) time complexity DC3 algorithm

#include <stdio.h>

#include <stdlib.h>

void suffixArray ( int* s, int* SA, int n, int K );

int main ( int argv, char *argc[] )

{

// I know the author tried to write comprehensive code

// but I still don't get it, even to call it =,=

return 0;

}

inline bool leq ( int a1, int a2, int b1, int b2 ) // lexic. order for pairs

{

return (a1 < b1 || a1 == b1 && a2 <= b2);

}

// and triples

inline bool leq ( int a1, int a2, int a3, int b1, int b2, int b3 )

{

return ( a1 < b1 || a1 == b1 && leq ( a2, a3, b2, b3 ) );

}

// stably sort a[0..n-1] to b[0..n-1] with keys in 0..K from r

static void radixPass ( int* a, int* b, int* r, int n, int K )

{

// count occurrences

int* c = new int[K + 1]; // counter array

for (int i = 0; i <= K; i++)

c[i] = 0; // reset counters

for (int i = 0; i < n; i++)

c[ r[ a[i] ] ]++; // count occurrences

// exclusive prefix sums

for ( int i = 0, sum = 0; i <= K; i++ )

{

int t = c[i];

c[i] = sum;

sum += t;

}

for (int i = 0; i < n; i++)

b[ c[ r[ a[i] ] ]++ ] = a[i]; // sort

delete [] c;

}

// find the suffix array SA of s[0..n-1] in {1..K}^n

// require s[n] = s[n + 1] = s[n + 2] = 0, n >= 2

void suffixArray ( int* s, int* SA, int n, int K )

{

// Here I think n might be the length of orginal suffix

int n0 = (n + 2)/3,

n1 = (n + 1)/3,

n2 = n/3,

n02 = n0 + n2; // (2n + 2)/3 can be seen as the number of triples whose

// pos mod 3 != 0

// I think s12 might be the array that holds the suffixes whose

// pos mod 3 != 0

int* s12 = new int[n02 + 3];

s12[n02] = s12[n02 + 1] = s12[n02 + 2] = 0; // Last 3 would end up with 0

int* SA12 = new int[n02 + 3]; // suffix array for pos mod 3 != 0

SA12[n02] = SA12[n02 + 1] = SA12[n02 + 2] = 0;

int* s0 = new int[n0]; // whose pos mod 3 = 0

int* SA0 = new int[n0]; // suffix array for pos mod 3 = 0

// generate positions of mod 1 and mod 2 suffixes

// the "+(n0-n1)" adds a dummy mod 1 suffix if n%3 == 1

for ( int i = 0, j = 0; i < n + (n0 - n1); i++ )

if ( i % 3 != 0 )

s12[j++] = i;

// lsb radix sort the mod 1 and mod 2 triples

radixPass ( s12 , SA12, s + 2, n02, K );

radixPass ( SA12, s12 , s + 1, n02, K );

radixPass ( s12 , SA12, s + 0, n02, K );

// find lexicographic names of triples

int name = 0, c0 = -1, c1 = -1, c2 = -1;

for ( int i = 0; i < n02; i++ )

{

if ( s[ SA12[i] + 0 ] != c0 ||

s[ SA12[i] + 1 ] != c1 ||

s[ SA12[i] + 2 ] != c2 )

{

name ++;

c0 = s[ SA12[i] + 0 ];

c1 = s[ SA12[i] + 1 ];

c2 = s[ SA12[i] + 2 ] ;

}

if ( SA12[i] % 3 == 1 )

{

s12[ SA12[i]/3 ] = name; // left half

}

else

{

s12[ SA12[i]/3 + n0 ] = name; // right half

}

}

// recurse if names are not yet unique

if ( name < n02 )

{

suffixArray ( s12, SA12, n02, name );

// store unique names in s12 using the suffix array

for ( int i = 0; i < n02; i++ )

s12[SA12[i]] = i + 1;

}

else

{

// generate the suffix array of s12 directly

for ( int i = 0; i < n02; i++ )

SA12[s12[i] - 1] = i;

}

// stably sort the mod 0 suffixes from SA12 by their first character

for ( int i=0, j=0; i < n02; i++ )

if ( SA12[i] < n0)

s0[j++] = 3*SA12[i];

radixPass ( s0, SA0, s, n0, K );

// merge sorted SA0 suffixes and sorted SA12 suffixes

for ( int p = 0, t = n0 - n1, k = 0; k < n; k++ )

{

int i = GetI(); // pos of current offset 12 suffix

int j = SA0[p]; // pos of current offset 0 suffix

if ( SA12[t] < n0 ?

leq ( s[i], s12[SA12[t] + n0], s[j], s12[j/3] ) :

leq ( s[i], s[i + 1], s12[SA12[t] - n0 + 1], s[j], s[j + 1], s12[j/3 + n0] ) )

{

#define GetI() (SA12[t] < n0 ? SA12[t]*3 + 1 : (SA12[t] - n0)*3 + 2)

// suffix from SA12 is smaller

SA[k] = i;

t ++;

if ( t == n02 ) // done --- only SA0 suffixes left

{

for ( k++; p < n0; p++, k++ )

SA[k] = SA0[p];

}

}

else

{

SA[k] = j;

p ++;

if (p == n0) // done --- only SA12 suffixes left

{

for ( k ++; t < n02; t++, k++ )

SA[k] = GetI();

}

}

}

delete [] s12;

delete [] SA12;

delete [] SA0;

delete [] s0;

}

I have studied it for days, but doesn't make sense with it, I don't want to give up, can you please tell me what is what ?

Thank you !