因為某些SA-IS的實作方式會利用傳入的陣列s的空間進行計算,因此傳入的陣列s必須是int範圍,而傳入的字串的尾端字元必須是最小的而且在之前完全沒有出現過,因此必須給字串先做以下處理:
假設要傳入字串 mississippi
先在尾端加入字元'#': mississippi#
算法結束後陣列sa為:11 10 7 4 1 0 9 8 6 3 5 2
將第一個數字11去除後剩下的數字即為字串mississippi的後綴數組:
10 7 4 1 0 9 8 6 3 5 2
關於SA-IS算法的論文請看這裡:
Two Efficient Algorithms for Linear Time Suffix Array Construction
關於SA-IS算法的教學(專利申請文)請看這裡:
后缀数组构造方法 CN 102081673 A
SA-IS的教學投影片:
https://www.cs.helsinki.fi/u/tpkarkka/opetus/11s/spa/lecture11.pdf
SA-IS中文教學:
https://riteme.github.io/blog/2016-6-19/sais.html
如果想對Suffix Array算法進行測試請使用這個Online Judge:
http://www.spoj.com/problems/SARRAY/
以下將會提供一些SA-IS的實做模板
1.台大黑暗code界的黑暗codebook:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include<string.h> | |
#define MAGIC(XD){\ | |
memset(sa,0,sizeof(int)*n);\ | |
memcpy(x,c,sizeof(int)*z);\ | |
XD\ | |
memcpy(x+1,c,sizeof(int)*(z-1));\ | |
for(i=0;i<n;++i){\ | |
if(sa[i]&&!t[sa[i]-1])sa[x[s[sa[i]-1]]++]=sa[i]-1;\ | |
}\ | |
memcpy(x,c,sizeof(int)*z);\ | |
for(i=n-1;i>=0;--i){\ | |
if(sa[i]&&t[sa[i]-1])sa[--x[s[sa[i]-1]]]=sa[i]-1;\ | |
}\ | |
} | |
void sais(int *s,int *sa,int *p,bool *t,int *c,int n,int z){ | |
bool neq=t[n-1]=1; | |
int nn=0,nmxz=-1,*q=sa+n,*ns=s+n,*x=c+z,lst=-1,i,j; | |
memset(c,0,sizeof(int)*z); | |
for(i=0;i<n;++i)++c[s[i]]; | |
for(i=0;i<z-1;++i)c[i+1]+=c[i]; | |
for(i=n-2;i>=0;i--)t[i]=(s[i]==s[i+1]?t[i+1]:s[i]<s[i+1]); | |
MAGIC( | |
for(i=1;i<n;++i){ | |
if(t[i]&&!t[i-1])sa[--x[s[i]]]=p[q[i]=nn++]=i; | |
} | |
); | |
for(i=0;i<n;++i)if((j=sa[i])&&t[j]&&!t[j-1]){ | |
neq=lst<0||memcmp(s+j,s+lst,(p[q[j]+1]-j)*sizeof(int)); | |
ns[q[lst=j]]=nmxz+=neq; | |
} | |
if(nmxz==nn-1)for(i=0;i<nn;++i)q[ns[i]]=i; | |
else sais(ns,q,p+nn,t+n,c+z,nn,nmxz+1); | |
MAGIC( | |
for(i=nn-1;i>=0;--i)sa[--x[s[p[q[i]]]]]=p[q[i]]; | |
); | |
} | |
#undef MAGIC | |
/*****************這些是需要用到的陣列大小**************/ | |
static const int MXN=10000000; | |
int s[MXN*2+5],sa[MXN*2+5],p[MXN+5],c[MXN*2+5]; | |
bool t[MXN*2+5]; |
而這裡的MXN則是字串的最長長度(假設字元數<字串長度)
2.卦長自行實作的模板(記憶體用量少):
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define pushS(x) sa[--bkt[s[x]]] = x | |
#define pushL(x) sa[bkt[s[x]]++] = x | |
#define induce_sort(v){\ | |
fill_n(sa,n,0);\ | |
copy_n(_bkt,A,bkt);\ | |
for(i=n1-1;~i;--i)pushS(v[i]);\ | |
copy_n(_bkt,A-1,bkt+1);\ | |
for(i=0;i<n;++i)if(sa[i]&&!t[sa[i]-1])pushL(sa[i]-1);\ | |
copy_n(_bkt,A,bkt);\ | |
for(i=n-1;~i;--i)if(sa[i]&&t[sa[i]-1])pushS(sa[i]-1);\ | |
} | |
template<typename T> | |
void sais(const T s,int n,int *sa,int *_bkt,int *p,bool *t,int A){ | |
int *rnk=p+n,*s1=p+n/2,*bkt=_bkt+A; | |
int n1=0,i,j,x=t[n-1]=1,y=rnk[0]=-1,cnt=-1; | |
for(i=n-2;~i;--i)t[i]=(s[i]==s[i+1]?t[i+1]:s[i]<s[i+1]); | |
for(i=1;i<n;++i)rnk[i]=t[i]&&!t[i-1]?(p[n1]=i,n1++):-1; | |
fill_n(_bkt,A,0); | |
for(i=0;i<n;++i)++_bkt[s[i]]; | |
for(i=1;i<A;++i)_bkt[i]+=_bkt[i-1]; | |
induce_sort(p); | |
for(i=0;i<n;++i)if(~(x=rnk[sa[i]])) | |
j=y<0||memcmp(s+p[x],s+p[y],(p[x+1]-p[x])*sizeof(s[0])) | |
,s1[y=x]=cnt+=j; | |
if(cnt+1<n1)sais(s1,n1,sa,bkt,rnk,t+n,cnt+1); | |
else for(i=0;i<n1;++i)sa[s1[i]]=i; | |
for(i=0;i<n1;++i)s1[i]=p[sa[i]]; | |
induce_sort(s1); | |
} | |
/*****************這些是需要用到的陣列大小**************/ | |
const int MAXN=10000005,MAXA='z'+1; | |
int sa[MAXN],bkt[MAXN+MAXA],p[MAXN*2]; | |
bool t[MAXN*2]; | |
char s[MAXN]; |
這裡的MXN則是字串的最長長度(假設字元數<字串長度)
3.論文提供的實作code:
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*SA-IS Algorithm*/ | |
const unsigned char mask[]={0x80,0x40,0x20,0x10,0x08,0x04,0x02,0x01}; | |
#define tget(i) ( (t[(i)/8]&mask[(i)%8])?1:0) | |
#define tset(i, b) t[(i)/8]=(b)?(mask[(i)%8]|t[(i)/8]):((~mask[(i)%8])&t[(i)/8]) | |
#define chr(i) (cs==sizeof(int)?((int*)s)[i]:((unsigned char *)s)[i]) | |
#define isLMS(i) (i>0&&tget(i)&&!tget(i-1)) | |
// find the start or end of each bucket | |
inline void getBuckets(unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
int i,sum=0; | |
// clear all buckets | |
for(i=0;i<=K;++i)bkt[i]=0; | |
// compute the size of each bucket | |
for(i=0;i<n;++i)++bkt[chr(i)]; | |
for(i=0;i<=K;++i){ | |
sum+=bkt[i]; | |
bkt[i]=end?sum:sum-bkt[i]; | |
} | |
} | |
// compute SAl | |
inline void induceSAl(unsigned char *t,int *SA,unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
int i,j; | |
// find starts of buckets | |
getBuckets(s,bkt,n,K,cs,end); | |
for(i=0;i<n;++i){ | |
j=SA[i]-1; | |
if(j>=0&&!tget(j))SA[bkt[chr(j)]++]=j; | |
} | |
} | |
// compute SAs | |
inline void induceSAs(unsigned char *t,int *SA,unsigned char *s,int *bkt,int n,int K,int cs,bool end){ | |
int i,j; | |
// find ends of buckets | |
getBuckets(s,bkt,n,K,cs,end); | |
for(i=n-1;i>=0;--i){ | |
j=SA[i]-1; | |
if(j>=0&&tget(j))SA[--bkt[chr(j)]]=j; | |
} | |
} | |
// find the suffix array SA of s[0..n-1] in {1..K}?n | |
// require s[n-1]=0 (the sentinel!), n>=2 | |
// use a working space (excluding s and SA) of | |
// at most 2.25n+O(1) for a constant alphabet | |
void SA_IS(unsigned char *s,int *SA,int n,int K,short cs){ | |
// LS-type array in bits | |
unsigned char *t=(unsigned char *)malloc(n/8+1); | |
int i,j; | |
// classify the type of each character | |
// the sentinel must be in s1, important!!! | |
tset(n-2,0);tset(n-1,1); | |
for(i=n-3;i>=0;--i) | |
tset(i,(chr(i)<chr(i+1)||(chr(i)==chr(i+1)&&tget(i+1)==1))?1:0); | |
// stage 1: reduce the problem by at least 1/2 | |
// sort all the S-substrings | |
// bucket array | |
int *bkt=(int *)malloc(sizeof(int)*(K+1)); | |
// find ends of buckets | |
getBuckets(s,bkt,n,K,cs,true); | |
for(i=0;i<n;++i)SA[i]=-1; | |
for(i=1;i<n;++i)if(isLMS(i))SA[--bkt[chr(i)]]=i; | |
induceSAl(t,SA,s,bkt,n,K,cs,false); | |
induceSAs(t,SA,s,bkt,n,K,cs,true); | |
free(bkt); | |
// compact all the sorted substrings into | |
// the first n1 items of SA | |
// 2*n1 must be not larger than n (proveable) | |
int n1=0; | |
for(i=0;i<n;++i)if(isLMS(SA[i]))SA[n1++]=SA[i]; | |
// find the lexicographic names of substrings | |
// init the name array buffer | |
for(i=n1;i<n;++i)SA[i]=-1; | |
int name=0,prev=-1; | |
for(i=0;i<n1;++i){ | |
int pos=SA[i];bool diff=false; | |
for(int d=0;d<n;++d) | |
if(prev==-1||chr(pos+d)!=chr(prev+d)||tget(pos+d)!=tget(prev+d)){ | |
diff=true;break; | |
}else if(d>0&&(isLMS(pos+d)||isLMS(prev+d)))break; | |
if(diff){ | |
++name;prev=pos; | |
} | |
pos=(pos%2==0)?pos/2:(pos-1)/2; | |
SA[n1+pos]=name-1; | |
} | |
for(i=n-1,j=n-1;i>=n1;--i)if(SA[i]>=0)SA[j--]=SA[i]; | |
// stage 2: solve the reduced problem | |
// recurse if names are not yet unique | |
int *SA1=SA,*s1=SA+n-n1; | |
if(name<n1)SA_IS((unsigned char*)s1,SA1,n1,name-1,sizeof(int)); | |
else // generate the suffix array of s1 directly | |
for(i=0;i<n1;++i)SA1[s1[i]]=i; | |
// stage 3: induce the result for | |
// the original problem | |
// bucket array | |
bkt=(int *)malloc(sizeof(int)*(K+1)); | |
// put all the LMS characters into their buckets | |
// find ends of buckets | |
getBuckets(s,bkt,n,K,cs,true); | |
for(i=1,j=0;i<n;++i)if(isLMS(i))s1[j++]=i; // get p1 | |
// get index in s | |
for(i=0;i<n1;++i)SA1[i]=s1[SA1[i]]; | |
// init SA[n1..n-1] | |
for(i=n1;i<n;++i)SA[i]=-1; | |
for(i=n1-1;i>=0;--i){ | |
j=SA[i];SA[i]=-1; | |
SA[--bkt[chr(j)]]=j; | |
} | |
induceSAl(t,SA,s,bkt,n,K,cs,false); | |
induceSAs(t,SA,s,bkt,n,K,cs,true); | |
free(bkt);free(t); | |
} |
4.超快記憶體使用超少的模板庫code:
https://gist.github.com/jacky860226/1d33adad858eef71bfe18120d8d69e6d#file-sa-is-very-fast-cpp
因為長度太長所以就直接貼上網址了,沒有人會在比賽時寫這種東西