📄 lexer.java
字号:
/** * <p>Title: StandBayeMail </p> * <p>Description: A bayesian spam filter</p> * <p>Copyright: Copyright (c) 2004 by Luca M. Viola</p> * <p>Company: 3AM.it</p> * @author Luca M. Viola <luca@3am.it> * @version 1.0 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.*/package StandBayeMail;import java.io.*;import java.util.Vector;import java.util.StringTokenizer;import java.util.Enumeration;import java.util.Iterator;public class Lexer implements Enumeration{ public static final int MAIL_TYPE_MAILBOX=0; public static final int MAIL_TYPE_MAILDIR=1; public static final int MSGSEC_TYPE_ENVELOPE=0; public static final int MSGSEC_TYPE_HDRS=1; public static final int MSGSEC_TYPE_BODY=2; private String filename; private Vector lineList; private Vector tokenList; private int tokenPos=0; private int tokenMax=0; private int numMessages=0; private int [] mboxpos; private boolean fromFile=false; private int mboxtype; private int count=0; public Object nextElement() { tokenPos++; if( tokenPos<tokenMax ) return tokenList.elementAt(tokenPos); else { tokenPos=tokenMax; throw new LexerException("No more elements in Lexer Enumeration."); } } public boolean hasMoreElements() { if( tokenPos<tokenMax-1 ) return true; return false; } private String getMailMessage(int idx) { String ret=null; long pos1,pos2; pos1=mboxpos[idx]; pos2=mboxpos[idx+1]; if( pos1==-1 || pos2==-1 ) return null; int size=((int)pos2-(int)pos1); try { RandomAccessFile r=new RandomAccessFile(filename,"r"); r.seek(pos1); byte [] buff=new byte[size]; int rd=r.read(buff); if( rd>=0 ) ret=new String(buff); else ret=null; r.close(); } catch(IOException ie) { ie.printStackTrace(); } return ret; } private void buildTokenList(String mail,Vector tokenList) throws LexerException { String line; if( mail==null ) return; StringTokenizer st=new StringTokenizer(mail,"\n"); while (st.hasMoreTokens()) { //line=(String)lineList.elementAt(i); line = st.nextToken(); if (LexerUtils.isMboxHeader(line)) { numMessages++; //System.out.println("Message #"+numMessages+" / "+line); continue; } if (LexerUtils.isBase64(line) || LexerUtils.isIgnoredHeader(line) || LexerUtils.isMailerId(line) || LexerUtils.isMimeBoundary(line) || LexerUtils.isSpamText(line)) continue; // I know, it should be a StringBuffer.. Unfortunately // what I'd gain with append() I'd loose with plenty // of toString() back and forth. String token = ""; for (int j = 0; j < line.length(); j++) { String ch = line.substring(j, j + 1); if (UnicodeCharacterUtils.getUtils().isAlphaNumeric(ch)) token += ch; else { token = token.toLowerCase(); int len = token.length(); if (len >= 3 && len <= 20) { if (LexerUtils.isHtmlTag(token) || LexerUtils.isHtmlComment(token) || LexerUtils.isSmtpId(token) || LexerUtils.isBoundaryEqual(token) || LexerUtils.isNameEqual(token) || LexerUtils.isNumber(token) || LexerUtils.isFileNameEqual(token)) { token = ""; continue; } else { int pos = token.length() - 1; while (pos >= 3 && !LexerUtils.isWordendChar(token.charAt(pos))) pos--; token = token.substring(0, pos + 1); if (LexerUtils.isNumber(token)) continue; pos = 0; if( !LexerUtils.isIpAddr(token) ) { while( pos<token.length() && UnicodeCharacterUtils.getUtils().isNumeric(token.charAt(pos)) ) pos++; token=token.substring(pos,token.length()); } token=token.trim(); if( token.equals("") ) continue; tokenList.addElement(token); } } token = ""; } } } } private void buildTokenListFile() throws LexerException { // This vector on huge mailboxes has the tendency to grow very large. // This could be a problem if you haven't got enough memory. tokenList=new Vector(); for( int i=0; mboxpos[i]!=-1; i++ ) { String mail=getMailMessage(i); if( mail==null ) break; buildTokenList(mail,tokenList); float progress=((float)i/(float)(count))*100F; System.out.print("\rNow doing lexer analisys... ["+((int)progress+1)+"% done]"); } tokenMax=tokenList.size(); tokenPos=0; } public Lexer( String filename,int mboxtype ) throws LexerException { this.mboxtype=mboxtype; this.filename=filename; this.fromFile=true; System.out.print("Indexing mbox file..."); BoyerMooreStringSearch bmi=new BoyerMooreStringSearch("From "); bmi.setFile(filename,StandBayeMail.getFileMode()); Vector v=bmi.match(); System.out.println(" [done]"); System.out.print("Searching mail messages..."); int count=0; mboxpos=new int[v.size()+2]; for( int i=0; i<mboxpos.length; i++ ) mboxpos[i]=-1; try { RandomAccessReader raf = new RandomAccessReader(filename, StandBayeMail.getFileMode()); Iterator i=v.iterator(); while( i.hasNext() ) { Integer pos=(Integer)i.next(); raf.seek(pos.longValue()); String s=raf.readLine(); if( LexerUtils.isMboxHeader(s) ) { mboxpos[count]=pos.intValue(); count++; } } mboxpos[count]=(int)raf.length(); raf.close(); System.out.println(" [done]"); } catch (IOException ie) { ie.printStackTrace(); } System.out.println("#"+count+" mails found"); this.count=count; //Garbage-collector friendly instructions v=null; bmi=null; buildTokenListFile(); } public Lexer( String msgbody ) throws LexerException { tokenList=new Vector(); try { buildTokenList(msgbody,tokenList); tokenMax=tokenList.size(); tokenPos=0; } catch( Exception ex ) { if( ex instanceof LexerException ) throw (LexerException)ex; ex.printStackTrace(); } } public int getMessageNum() { return numMessages; } // UNIT TEST public static void main( String args[] ) { try { Lexer a=new Lexer( args[0] , MAIL_TYPE_MAILBOX ); } catch( Exception e) { e.printStackTrace(); } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -