使用增强型后缀数组(ESA)的文本匹配算法
設模式串的長度為m,文本的長度為n,使用后綴數組做文本匹配,如果只用后綴表suftab和折半查找算法的復雜度為O(m*logn); 如果使用最長公共前綴表lcptab和折半查找算法,復雜度可以降至O(m+logn);使用增強型后綴數組(ESA)表childtab,復雜度為 O(m)。本文使用復雜度為O(m)的算法,在匹配之前要求先構造SA(下面采用DC3算法構造后綴數組),然后計算出后綴數組的 suftab,lcptab和childtab。由于通過childtab可以在O(1)時間復雜度內找到每一個lcp-interval的所有 child-interval,因此這跟后綴樹的自頂向下匹配模式串的算法復雜度相當;進一步通過lcp-interval可以很容易地得到z個匹配子串 的起始位置,復雜度為O(m+z)。
實現:
import java.util.Stack;/*** * * finding all z occurrences of a pattern with suftab,lcptab and childtab* (m: the length of pattern, n: the length of text)* time complexity: O(m+z)* * (The suffix array is constructed with DC3 algorithm)* * * Copyright (c) 2011 ljs (http://blog.csdn.net/ljsspace/)* Licensed under GPL (http://www.opensource.org/licenses/gpl-license.php) * * @author ljs* 2011-07-28**/ public class ESAPatternMatch {private String text;private int[] suftab;private int[] lcptable;private int[] childtab;private int len;public ESAPatternMatch(String text){this.text = text;this.len = text.length();childtab=new int[len]; for(int i=0;i<len;i++){childtab[i] = -1;}}public static final char MAX_CHAR = '\u00FF';class Suffix{int[] sa; //Note: the p-th suffix in sa: SA[rank[p]-1]];//p is the index of the array "rank", start with 0;//a text S's p-th suffix is S[p..n], n=S.length-1.int[] rank; boolean done;public Suffix(int[] sa,int[] rank){this.sa = sa;this.rank = rank;}}//a prefix of suffix[isuffix] represented with digitsclass Tuple{int isuffix; //the p-th suffixint[] digits;public Tuple(int suffix,int[] digits){this.isuffix = suffix;this.digits = digits; }public String toString(){StringBuffer sb = new StringBuffer(); sb.append(isuffix);sb.append("(");for(int i=0;i<digits.length;i++){sb.append(digits[i]);if(i<digits.length-1)sb.append("-");}sb.append(")");return sb.toString();}}//d: the digit to do countingsort//max: A value's range is 0...maxprivate void countingSort(int d,Tuple[] tA,Tuple[] tB,int max){//init the counter arrayint[] C = new int[max+1];for(int i=0;i<=max;i++){C[i] = 0;}//stat the countfor(int j=0;j<tA.length;j++){C[tA[j].digits[d]]++;}//process the counter array Cfor(int i=1;i<=max;i++){C[i]+=C[i-1];}//distribute the values for(int j=tA.length-1;j>=0;j--){//C[A[j]] <= A.length tB[--C[tA[j].digits[d]]]=tA[j]; }}//tA: input//tB: output for rank caculationprivate void radixSort(Tuple[] tA,Tuple[] tB,int max,int digitsLen){int len = tA.length;int digitsTotalLen = tA[0].digits.length;for(int d=digitsTotalLen-1,j=0;j<digitsLen;d--,j++){this.countingSort(d, tA, tB, max);//assign tB to tAif(j<digitsLen-1){for(int i=0;i<len;i++){tA[i] = tB[i];} }}}//max is the maximum value in any digit of TA.digits[], used for counting sort//tA: input//tB: the place holder, reused between iterationsprivate Suffix rank(Tuple[] tA,Tuple[] tB,int max,int digitsLen){ int len = tA.length; radixSort(tA,tB,max,digitsLen); int digitsTotalLen = tA[0].digits.length;//caculate rank and sa int[] sa = new int[len];sa[0] = tB[0].isuffix; int[] rank = new int[len+2]; //add 2 for sentinel rank[len]=1;rank[len+1] = 1;int r = 1; //rank starts with 1rank[tB[0].isuffix] = r; for(int i=1;i<len;i++){sa[i] = tB[i].isuffix; boolean equalLast = true;for(int j=digitsTotalLen-digitsLen;j<digitsTotalLen;j++){if(tB[i].digits[j]!=tB[i-1].digits[j]){equalLast = false;break;}}if(!equalLast){r++;}rank[tB[i].isuffix] = r; }Suffix suffix = new Suffix(sa,rank);//judge if we are doneif(r==len){suffix.done = true;}else{suffix.done = false;}return suffix;}private int[] orderSuffixes(Tuple[] tA,Tuple[] tB,int max,int digitsLen){ int len = tA.length; radixSort(tA,tB,max,digitsLen); //caculate rank and sa int[] sa = new int[len];for(int i=0;i<len;i++){sa[i] = tB[i].isuffix; }return sa; }//rank needs sentinel: len+2private Suffix reduce(int[] rank,int max){int len = rank.length - 2;int n1 = (len+1)/3;int n2 = len/3;Tuple[] tA = new Tuple[n1+n2];Tuple[] tB = new Tuple[n1+n2];for(int i=0,j=1;i<n1;i++,j+=3){int r1 = rank[j];int r2 = rank[j+1];int r3 = rank[j+2];tA[i] = new Tuple(i,new int[]{r1,r2,r3});}for(int i=n1,j=2;i<n1+n2;i++,j+=3){int r1 = rank[j];int r2 = rank[j+1];int r3 = rank[j+2]; tA[i] = new Tuple(i,new int[]{r1,r2,r3});}return rank(tA,tB,max,3); }private int[] skew(int[] rank,int max){int len = rank.length - 2;//step 1: caculate sa12Suffix suffixT12 = reduce(rank,max);int[] sa12 = null;if(!suffixT12.done){int[] rankT12 = suffixT12.rank;int maxT12 = rankT12[suffixT12.sa[suffixT12.sa.length-1]];sa12 = skew(rankT12,maxT12);// debug for string: GACCCACCACC#//s12 = new Suffix();//s12.rank = new int[]{3,6,5,4,7,2,1,1,1};//s12.sa = new int[]{7,6,5,0,3,2,1,4};//s12.done =true; }else{sa12 = suffixT12.sa; }//index conversion for sa12int n1 = (len+1)/3;for(int j=0;j<sa12.length;j++){if(sa12[j]<n1){sa12[j] = 1 + 3*sa12[j];}else{sa12[j] = 2 + 3*(sa12[j]-n1);} }//recaculate rank for sa12int[] rank12 = new int[len+2];rank12[len] = 1;rank12[len+1] = 1;for(int k=0;k<sa12.length;k++){rank12[sa12[k]] = k+1;}//step 2: caculate sa0 int n0=(len+2)/3;Tuple[] tA = new Tuple[n0];Tuple[] tB = new Tuple[n0];for(int i=0,j=0;i<n0;i++,j+=3){int r1 = rank[j];int r2 = rank12[j+1]; tA[i] = new Tuple(i,new int[]{r1,r2});}int max12 = rank12[sa12[sa12.length-1]]; int[] sa0 = orderSuffixes(tA,tB,max<max12?max12:max,2);//index conversion for sa0for(int j=0;j<n0;j++){sa0[j] = 3*sa0[j]; } //step 3: merge sa12 and sa0int[] sa = new int[len];int i=0,j=0;int k=0;while(i<sa12.length && j<sa0.length){int p = sa12[i];int q = sa0[j];if(p%3==1){//case 1if(rank[p]<rank[q]){sa[k++] = p;i++;}else if(rank[p]>rank[q]){sa[k++] = q;j++;}else{if(rank12[p+1]<rank12[q+1]){sa[k++] = p;i++;}else{sa[k++] = q;j++;} }}else{//case 2if(rank[p]<rank[q]){sa[k++] = p;i++;}else if(rank[p]>rank[q]){sa[k++] = q;j++;}else{if(rank[p+1]<rank[q+1]){sa[k++] = p;i++;}else if(rank[p+1]>rank[q+1]){sa[k++] = q;j++;}else{if(rank12[p+2]<rank12[q+2]){sa[k++] = p;i++;}else{sa[k++] = q;j++;} }}} }for(int m=i;m<sa12.length;m++){sa[k++] = sa12[m];}for(int m=j;m<sa0.length;m++){sa[k++] = sa0[m];} return sa; }//Precondition: the last char in text must be less than other chars.public Suffix DC3(String text) throws Exception{if(text == null)return null;int len = text.length();if(len == 0) return null;char base = text.charAt(len-1); //the smallest charTuple[] tA = new Tuple[len];Tuple[] tB = new Tuple[len]; //placeholderfor(int i=0;i<len;i++){int delta=text.charAt(i)-base;if(delta<0) throw new Exception("invalid input: last char must be the smallest one.");tA[i] = new Tuple(i,new int[]{0,delta});}Suffix suffix = rank(tA,tB,MAX_CHAR-base,1);int max = suffix.rank[suffix.sa[len-1]];int[] sa = skew(suffix.rank,max);//caculate rank for result suffix arrayint[] r = new int[len]; for(int k=0;k<sa.length;k++){r[sa[k]] = k+1;}return new Suffix(sa,r);}//rank[p]'s index starts with 1 (not 0)public int[] computeLCPTable(String text,int[] sa,int[] rank){if(text == null)return null;int len = text.length();if(len == 0) return null;int[] lcpz = new int[len];//base case: p=0//caculate LCP of suffix[0]int lcp = 0;int r = rank[0]-1;if(r>0){int q=sa[r-1];//caculate LCP by definitionfor(int i=0,j=q;i<len && j<len;i++,j++){if(text.charAt(i) != text.charAt(j)){lcp=i;break;}}}lcpz[0] = lcp;//other cases: p>=1//ignore p == sa[0] because LCP=0 for suffix[p] where rank[p]=0 for(int p=1;p<len && p != sa[0];p++){int h = lcpz[p-1];int q=sa[rank[p]-2];lcp = 0;if(h>1){ //for h<=1, caculate LCP by definition (i.e. start with lcp=0) //jump h-1 chars for suffix[p] and suffix[q] lcp = h-1; }for(int i=p+lcp,j=q+lcp,k=0;i<len && j<len;i++,j++,k++){if(text.charAt(i) != text.charAt(j)){lcp+=k;break;}}lcpz[p] = lcp;}//caculate LCPint[] LCP = new int[len];for(int i=0;i<len;i++){LCP[i] = lcpz[sa[i]];}return LCP;}public void buildChildtab(){//step 1: caculate up and down valueStack<Integer> stack = new Stack<Integer>();int lastIndex = -1;stack.push(0);for(int i=1;i<len;i++){ while(lcptable[i]<lcptable[stack.peek()]){lastIndex = stack.pop();int next = stack.peek();if(lcptable[i]<=lcptable[next] && lcptable[next] != lcptable[lastIndex]){childtab[next] = lastIndex; }}if(lastIndex != -1){childtab[i-1] = lastIndex;lastIndex = -1;}stack.push(i);}//process remaining elementswhile(0<lcptable[stack.peek()]){lastIndex = stack.pop();int next = stack.peek();if(0<=lcptable[next] && lcptable[next] != lcptable[lastIndex]){childtab[next] = lastIndex; }}//step 2: caculate nextLIndexstack.clear();stack.push(0);for(int i=1;i<len;i++){ while(lcptable[i]<lcptable[stack.peek()]){stack.pop(); }if(lcptable[i] == lcptable[stack.peek()]){ lastIndex = stack.pop();childtab[lastIndex] = i;}stack.push(i);} }//get i's up value; otherwise return -1private int getUpValue(int i){int up = -1;if(i>=1 && lcptable[i-1]>lcptable[i]){up = childtab[i-1];//up value}return up;}//get i's down value; otherwise return -1private int getDownValue(int i){int down = -1;if(childtab[i]>-1 && lcptable[childtab[i]] > lcptable[i]){ down = childtab[i]; //down value}return down;}//get the i's next L-index value; otherwise return -1private int getNextLIndexValue(int i){int nextLIndex = -1; if(i<len-1 && getUpValue(i+1)==-1&& getDownValue(i)==-1){//neither down value nor up valuenextLIndex = childtab[i]; }return nextLIndex;} //i and j is an lcp-interval private int getlcp(int i,int j){ if(i==j) return lcptable[i];int up = -1;if(j+1<len){up = getUpValue(j+1);}if(up>i && up<=j){return lcptable[up];}else{int down = getDownValue(i);if(down>-1)return lcptable[down];else return 0;}}private int getFirstLIndex(int i,int j){int up = -1;if(j+1<len){up = getUpValue(j+1);}int i1 = -1;if(up>i && up<=j){i1 = up;}else{i1 = getDownValue(i);} if(i1==-1){ //accommodate: i's doesn't belong to an lcp-intervali1 = getNextLIndexValue(i);}return i1;}//find the child-interval or singleton interval starting with char c//return null if not found//i and j is the parent lcp-interval private int[] getMatchedInterval(int i,int j,char c){int lcp = getlcp(i,j);int[] interval = null;int i1 = getFirstLIndex(i,j);//i..i1-1interval = getMatchedChildInterval(i,i1-1,c,lcp);if(interval != null)return interval;int nextLIndex = -1;while((nextLIndex = getNextLIndexValue(i1)) != -1){int i2 = nextLIndex; interval = getMatchedChildInterval(i1,i2-1,c,lcp); if(interval != null)return interval;i1 = i2;}interval = getMatchedChildInterval(i1,j,c,lcp); return interval;}private int[] getMatchedChildInterval(int i,int j,char c,int lcp){int[] interval = null;if(i==j){//singleton intervalif(text.charAt(this.suftab[i]+lcp)==c){interval = new int[]{getlcp(i,i),i,j};}}else{ if(text.charAt(this.suftab[i+1]+lcp)==c){interval = new int[]{getlcp(i,j),i,j};}}return interval;}public void enhanceSA() throws Exception{//prepare suftab, lcptab and childtabSuffix suffix = this.DC3(this.text); this.suftab = suffix.sa;int[] sufinv = suffix.rank; this.lcptable = this.computeLCPTable(this.text,this.suftab,sufinv);this.buildChildtab(); }//precondition: call this.enhanceSA() firstpublic void match(String pattern){ int m = pattern.length();if(m==0) return;int pos=0; boolean found = true;int i=0;int j=len-1;int[] interval = null;while(found && pos<m && (interval = getMatchedInterval(i,j,pattern.charAt(pos))) != null){ i=interval[1];j=interval[2];if(i!=j){int lcp=interval[0];int min=(lcp<m)?lcp:m;found=(pattern.substring(pos,min).equals(text.substring(this.suftab[j]+pos,this.suftab[j]+min)));pos=min; //if lcp<m and found, continue //if lcp>=m, exit whether found or not (pos==m)//if lcp<m but not found, exit (found=false)}else{ int tmp=this.suftab[i]+m;if(tmp>len) tmp=len;found=(pattern.substring(pos).equals(text.substring(this.suftab[i]+pos,tmp)));pos=m; //exit whether found or not (pos==m)}}if(found && interval != null){report(pattern,i,j);}else{System.out.format("pattern \"%s\" not found!%n",pattern);}}//i,j is an interval (child-interval or a singleton interval)private void report(String pattern,int i,int j){System.out.format("matched suffix(es) with pattern \"%s\": %d%n",pattern,j-i+1);for(int z=i;z<=j;z++){System.out.format("[%d:%s]%n",this.suftab[z],this.text.substring(this.suftab[z],this.text.length()-1));}}public static void main(String[] args) throws Exception { String text = "mississippi#";System.out.format("Text: %s %n",text.substring(0,text.length()-1));ESAPatternMatch esa = new ESAPatternMatch(text); esa.enhanceSA();esa.match("z"); esa.match("i"); esa.match("iss"); esa.match("issi"); esa.match("j"); esa.match("ir"); esa.match("ia");System.out.format("%n********************************%n"); text = "acaaacatat#"; System.out.format("Text: %s %n",text.substring(0,text.length()-1));esa = new ESAPatternMatch(text); esa.enhanceSA();esa.match("ac"); System.out.format("%n********************************%n"); text = "After a long text, here's a needle ZZZZZ\u0000"; System.out.format("Text: %s %n",text.substring(0,text.length()-1));esa = new ESAPatternMatch(text); esa.enhanceSA();esa.match("ZZZZZ"); System.out.format("%n********************************%n"); text = "The quick brown fox jumps over the lazy dog.\u0000"; System.out.format("Text: %s %n",text.substring(0,text.length()-1));esa = new ESAPatternMatch(text); esa.enhanceSA();esa.match("lazy"); System.out.format("%n********************************%n"); text = "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna...\u0000"; System.out.format("Text: %s %n",text.substring(0,text.length()-1));esa = new ESAPatternMatch(text); esa.enhanceSA();esa.match("tempor"); }}測試:
Text: mississippi
pattern "z" not found!
matched suffix(es) with pattern "i": 4
[10:i]
[7:ippi]
[4:issippi]
[1:ississippi]
matched suffix(es) with pattern "iss": 2
[4:issippi]
[1:ississippi]
matched suffix(es) with pattern "issi": 2
[4:issippi]
[1:ississippi]
pattern "j" not found!
pattern "ir" not found!
pattern "ia" not found!
********************************
Text: acaaacatat
matched suffix(es) with pattern "ac": 2
[0:acaaacatat]
[4:acatat]
********************************
Text: After a long text, here's a needle ZZZZZ
matched suffix(es) with pattern "ZZZZZ": 1
[35:ZZZZZ]
********************************
Text: The quick brown fox jumps over the lazy dog.
matched suffix(es) with pattern "lazy": 1
[35:lazy dog.]
********************************
Text: Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna...
matched suffix(es) with pattern "tempor": 1
[73:tempor incididunt ut labore et dolore magna...]
轉載于:https://www.cnblogs.com/ljsspace/archive/2011/07/28/2119050.html
總結
以上是生活随笔為你收集整理的使用增强型后缀数组(ESA)的文本匹配算法的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: ListView相关
- 下一篇: 用VB.net开发.NET Micro