📄 boyermoorestringsearch.java

📁 StandBayeMail
💻 JAVA
字号:
/** * <p>Title: StandBayeMail </p> * <p>Description: A bayesian spam filter</p> * <p>Copyright: Copyright (c) 2004 by Luca M. Viola</p> * <p>Company: 3AM.it</p> * @author Luca M. Viola <luca@3am.it> * @version 1.0  This program is free software; you can redistribute it and/or  modify it under the terms of the GNU General Public License  as published by the Free Software Foundation; either version 2  of the License, or (at your option) any later version.  This program is distributed in the hope that it will be useful,  but WITHOUT ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  GNU General Public License for more details.  You should have received a copy of the GNU General Public License  along with this program; if not, write to the Free Software  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.  This routine was taken and adapted to work on files from the original work  Copyright (C) 2000  Doyle B. Myers  http://www.blarg.com/~doyle/bmi.html*/package StandBayeMail;import java.util.*;import java.io.*;public class BoyerMooreStringSearch{  private static final int ALPHABET_SIZE = 0x10000; // UNICODE alphabet size (64K chars)  private int[] mBadCharSkipA; // one of the skip arrays (mismatched text char not in pattern)  private int[] mGoodSuffixSkipA; // the other skip array (suffix, mismatched text char found in pattern)  private char[] mPatternA; // chars of the pattern (what we're searching for)  private int mPatternLength; // length of pattern  private char[] mTextA; // chars of the text (the text we're searching in)  private int mTextLength; // length of text  //private RandomAccessFile raf;  private RandomAccessReader raf;  private String filename;  private boolean fromFile;  private char filemode=RandomAccessReader.MODE_UNBUFFERED;  public BoyerMooreStringSearch(String inPattern)  {    setPattern(inPattern);  }  /**	inText is the text that will be searched in   */  public void setText(String inText)  {    mTextA = inText.toCharArray();    mTextLength = mTextA.length;    fromFile=false;  }  public void setFile(String filename,char filemode)  {    this.filename=filename;    this.filemode=filemode;    fromFile=true;  }   /**	inText is the text that will be searched in   */  public void setFile(String filename)  {    setFile(filename,RandomAccessReader.MODE_UNBUFFERED);  }  /**	inPattern is the pattern that will be searched for   */  public void setPattern(String inPattern)  {    mPatternA = inPattern.toCharArray();    mPatternLength = mPatternA.length;    mBadCharSkipA = new int[ALPHABET_SIZE]; // set to zero for free    mGoodSuffixSkipA = new int[mPatternLength + 1]; // set to zero for free    // recompute these arrays whenever the pattern changes    computeGoodSuffixSkipArray();    computeBadCharSkipArray();  }  /**	Compute the good prefix skip array.   *	The resulting mGoodSuffixSkipA is one of the important Boyer-Moore   *	data structures. Print it out if you want to check program operation.   */  private void computeGoodSuffixSkipArray()  {    int i;    int j;    int p;    int[] f = new int[mPatternLength + 1];    j = mPatternLength + 1;    f[mPatternLength] = j;    for (i = mPatternLength; i > 0; i--)    {      while (j <= mPatternLength && mPatternA[i - 1] != mPatternA[j - 1])      {        if (mGoodSuffixSkipA[j] == 0)          mGoodSuffixSkipA[j] = j - i;        j = f[j];      }      f[i - 1] = --j;    }    p = f[0];    for (j = 0; j <= mPatternLength; ++j)    {      if (mGoodSuffixSkipA[j] == 0)        mGoodSuffixSkipA[j] = p;      if (j == p)        p = f[p];    }  }  /**	Compute the bad character skip array.   *	The resulting mBadCharSkipA is one of the important Boyer-Moore   *	data structures. Print it out if you want to check program operation.   */  private void computeBadCharSkipArray()  {    for (int a = 0; a < ALPHABET_SIZE; a++)      mBadCharSkipA[a] = mPatternLength;    for (int j = 0; j < mPatternLength - 1; j++)      mBadCharSkipA[mPatternA[j]] = mPatternLength - j - 1;  }  /**	If we are doing file searches here we open the file   *    as a RandomAccessFile, only for reading   *    This is handled automagically by the match() method.   */  private void openFile()  {    try    {      //raf = new RandomAccessFile(filename, "r");      raf = new RandomAccessReader(filename, filemode);      mTextLength = (int) raf.length();    }    catch (IOException ie)    {      ie.printStackTrace();    }  }  /**	If we are doing file searches here we close the   *    RandomAccessFile previously opened.   *    This is handled automagically by the match() method.   */  private void closeFile()  {    try    {      raf.close();    }    catch (IOException ie)    {      ie.printStackTrace();    }  }  /**	This routine helps finding a char either   *    into a String or into a Random Access Reader,   *    taking it from the position "pos"   */  private char getChar(int pos)  {    if (fromFile)    {      try      {        raf.seek(pos);        char ch=raf.readChar();        return ch;      }      catch (IOException ie)      {        return (char)0;      }    }    else      return mTextA[pos];  }  /**   *	Returns a Vector of all the matches.   */  public Vector match()  {    Vector v = new Vector();    int i;    int j;    if( fromFile ) openFile();    i = 0;    while (i <= mTextLength - mPatternLength)    {      // note: empty loop while chars match      for (j = mPatternLength - 1; j >= 0 && mPatternA[j] == getChar(i + j); --j) ;      if (j < 0)      { // off the left edge of the pattern, whole pattern matched        // System.out.println( "Pattern match at " + i );        v.addElement(new Integer(i)); // add find to vector        i += mGoodSuffixSkipA[0]; // always skip by suffix[0] after match        // makes sense, since we know the last        // comparison resulted in a match and was        // therefore found in the text (therefore we        // don't want to skip as a bad character);        // otherwise it's exactly like a good prefix        // mismatch at the 0'th position of the pattern.      }      else      { // character mismatch, skip        i += Math.max( // take the biggest of the two possible skips            mGoodSuffixSkipA[j + 1], // first one's suffix, second one's bad char            mBadCharSkipA[getChar(i + j)] - mPatternLength + j + 1            );      }    }    if( fromFile ) closeFile();    return v; // return all matches  }   public static int[] findMailBoxes( String filename,Vector v)   {     int count=0;     int [] result=new int[v.size()];     try     {       RandomAccessReader raf = new RandomAccessReader(filename, RandomAccessReader.MODE_UNBUFFERED);       Iterator i=v.iterator();       while( i.hasNext() )       {         Integer pos=(Integer)i.next();         raf.seek(pos.longValue());         String s=raf.readLine();         if( LexerUtils.isMboxHeader(s) )         {           result[count]=pos.intValue();           count++;         }       }     }     catch (IOException ie)     {       ie.printStackTrace();     }     System.out.println("#"+count+" mailboxes found");     return result;   }  public static void main( String args[])  {    long start=System.currentTimeMillis();    BoyerMooreStringSearch bmi=new BoyerMooreStringSearch("From ");    bmi.setFile("C:\\new.mbx");    //bmi.setText("If we want to this From scratch we have to start From the beginning.");    Vector v=bmi.match();    long end=System.currentTimeMillis();    System.out.println("#"+v.size()+" matches found in "+(int)((end-start)/1000)+" seconds.");    bmi.setPattern("\n");    Vector l=bmi.match();    long end2=System.currentTimeMillis();    System.out.println("#"+l.size()+" lines found in "+(int)((end2-end)/1000)+" seconds.");    int [] mailboxes=findMailBoxes("c:\\new.mbx",v);  }}
💿 文件大小 359 K
👤 上传用户 zhangpeng
📂 所属分类 Java编程
🏷️ 相关标签

#StandBayeMail
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -