📄 boyermoorestringsearch.java
字号:
/** * <p>Title: StandBayeMail </p> * <p>Description: A bayesian spam filter</p> * <p>Copyright: Copyright (c) 2004 by Luca M. Viola</p> * <p>Company: 3AM.it</p> * @author Luca M. Viola <luca@3am.it> * @version 1.0 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. This routine was taken and adapted to work on files from the original work Copyright (C) 2000 Doyle B. Myers http://www.blarg.com/~doyle/bmi.html*/package StandBayeMail;import java.util.*;import java.io.*;public class BoyerMooreStringSearch{ private static final int ALPHABET_SIZE = 0x10000; // UNICODE alphabet size (64K chars) private int[] mBadCharSkipA; // one of the skip arrays (mismatched text char not in pattern) private int[] mGoodSuffixSkipA; // the other skip array (suffix, mismatched text char found in pattern) private char[] mPatternA; // chars of the pattern (what we're searching for) private int mPatternLength; // length of pattern private char[] mTextA; // chars of the text (the text we're searching in) private int mTextLength; // length of text //private RandomAccessFile raf; private RandomAccessReader raf; private String filename; private boolean fromFile; private char filemode=RandomAccessReader.MODE_UNBUFFERED; public BoyerMooreStringSearch(String inPattern) { setPattern(inPattern); } /** inText is the text that will be searched in */ public void setText(String inText) { mTextA = inText.toCharArray(); mTextLength = mTextA.length; fromFile=false; } public void setFile(String filename,char filemode) { this.filename=filename; this.filemode=filemode; fromFile=true; } /** inText is the text that will be searched in */ public void setFile(String filename) { setFile(filename,RandomAccessReader.MODE_UNBUFFERED); } /** inPattern is the pattern that will be searched for */ public void setPattern(String inPattern) { mPatternA = inPattern.toCharArray(); mPatternLength = mPatternA.length; mBadCharSkipA = new int[ALPHABET_SIZE]; // set to zero for free mGoodSuffixSkipA = new int[mPatternLength + 1]; // set to zero for free // recompute these arrays whenever the pattern changes computeGoodSuffixSkipArray(); computeBadCharSkipArray(); } /** Compute the good prefix skip array. * The resulting mGoodSuffixSkipA is one of the important Boyer-Moore * data structures. Print it out if you want to check program operation. */ private void computeGoodSuffixSkipArray() { int i; int j; int p; int[] f = new int[mPatternLength + 1]; j = mPatternLength + 1; f[mPatternLength] = j; for (i = mPatternLength; i > 0; i--) { while (j <= mPatternLength && mPatternA[i - 1] != mPatternA[j - 1]) { if (mGoodSuffixSkipA[j] == 0) mGoodSuffixSkipA[j] = j - i; j = f[j]; } f[i - 1] = --j; } p = f[0]; for (j = 0; j <= mPatternLength; ++j) { if (mGoodSuffixSkipA[j] == 0) mGoodSuffixSkipA[j] = p; if (j == p) p = f[p]; } } /** Compute the bad character skip array. * The resulting mBadCharSkipA is one of the important Boyer-Moore * data structures. Print it out if you want to check program operation. */ private void computeBadCharSkipArray() { for (int a = 0; a < ALPHABET_SIZE; a++) mBadCharSkipA[a] = mPatternLength; for (int j = 0; j < mPatternLength - 1; j++) mBadCharSkipA[mPatternA[j]] = mPatternLength - j - 1; } /** If we are doing file searches here we open the file * as a RandomAccessFile, only for reading * This is handled automagically by the match() method. */ private void openFile() { try { //raf = new RandomAccessFile(filename, "r"); raf = new RandomAccessReader(filename, filemode); mTextLength = (int) raf.length(); } catch (IOException ie) { ie.printStackTrace(); } } /** If we are doing file searches here we close the * RandomAccessFile previously opened. * This is handled automagically by the match() method. */ private void closeFile() { try { raf.close(); } catch (IOException ie) { ie.printStackTrace(); } } /** This routine helps finding a char either * into a String or into a Random Access Reader, * taking it from the position "pos" */ private char getChar(int pos) { if (fromFile) { try { raf.seek(pos); char ch=raf.readChar(); return ch; } catch (IOException ie) { return (char)0; } } else return mTextA[pos]; } /** * Returns a Vector of all the matches. */ public Vector match() { Vector v = new Vector(); int i; int j; if( fromFile ) openFile(); i = 0; while (i <= mTextLength - mPatternLength) { // note: empty loop while chars match for (j = mPatternLength - 1; j >= 0 && mPatternA[j] == getChar(i + j); --j) ; if (j < 0) { // off the left edge of the pattern, whole pattern matched // System.out.println( "Pattern match at " + i ); v.addElement(new Integer(i)); // add find to vector i += mGoodSuffixSkipA[0]; // always skip by suffix[0] after match // makes sense, since we know the last // comparison resulted in a match and was // therefore found in the text (therefore we // don't want to skip as a bad character); // otherwise it's exactly like a good prefix // mismatch at the 0'th position of the pattern. } else { // character mismatch, skip i += Math.max( // take the biggest of the two possible skips mGoodSuffixSkipA[j + 1], // first one's suffix, second one's bad char mBadCharSkipA[getChar(i + j)] - mPatternLength + j + 1 ); } } if( fromFile ) closeFile(); return v; // return all matches } public static int[] findMailBoxes( String filename,Vector v) { int count=0; int [] result=new int[v.size()]; try { RandomAccessReader raf = new RandomAccessReader(filename, RandomAccessReader.MODE_UNBUFFERED); Iterator i=v.iterator(); while( i.hasNext() ) { Integer pos=(Integer)i.next(); raf.seek(pos.longValue()); String s=raf.readLine(); if( LexerUtils.isMboxHeader(s) ) { result[count]=pos.intValue(); count++; } } } catch (IOException ie) { ie.printStackTrace(); } System.out.println("#"+count+" mailboxes found"); return result; } public static void main( String args[]) { long start=System.currentTimeMillis(); BoyerMooreStringSearch bmi=new BoyerMooreStringSearch("From "); bmi.setFile("C:\\new.mbx"); //bmi.setText("If we want to this From scratch we have to start From the beginning."); Vector v=bmi.match(); long end=System.currentTimeMillis(); System.out.println("#"+v.size()+" matches found in "+(int)((end-start)/1000)+" seconds."); bmi.setPattern("\n"); Vector l=bmi.match(); long end2=System.currentTimeMillis(); System.out.println("#"+l.size()+" lines found in "+(int)((end2-end)/1000)+" seconds."); int [] mailboxes=findMailBoxes("c:\\new.mbx",v); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -