📄 lexer.java

📁 StandBayeMail
💻 JAVA
字号:
/** * <p>Title: StandBayeMail </p> * <p>Description: A bayesian spam filter</p> * <p>Copyright: Copyright (c) 2004 by Luca M. Viola</p> * <p>Company: 3AM.it</p> * @author Luca M. Viola <luca@3am.it> * @version 1.0  This program is free software; you can redistribute it and/or  modify it under the terms of the GNU General Public License  as published by the Free Software Foundation; either version 2  of the License, or (at your option) any later version.  This program is distributed in the hope that it will be useful,  but WITHOUT ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the  GNU General Public License for more details.  You should have received a copy of the GNU General Public License  along with this program; if not, write to the Free Software  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.*/package StandBayeMail;import java.io.*;import java.util.Vector;import java.util.StringTokenizer;import java.util.Enumeration;import java.util.Iterator;public class Lexer implements Enumeration{  public static final int MAIL_TYPE_MAILBOX=0;  public static final int MAIL_TYPE_MAILDIR=1;  public static final int MSGSEC_TYPE_ENVELOPE=0;  public static final int MSGSEC_TYPE_HDRS=1;  public static final int MSGSEC_TYPE_BODY=2;  private String filename;  private Vector lineList;  private Vector tokenList;  private int tokenPos=0;  private int tokenMax=0;  private int numMessages=0;  private int [] mboxpos;  private boolean fromFile=false;  private int mboxtype;  private int count=0;  public Object nextElement()  {      tokenPos++;      if( tokenPos<tokenMax )        return tokenList.elementAt(tokenPos);      else      {        tokenPos=tokenMax;        throw new LexerException("No more elements in Lexer Enumeration.");      }  }  public boolean hasMoreElements()  {    if( tokenPos<tokenMax-1 )      return true;    return false;  }  private String getMailMessage(int idx)  {    String ret=null;    long pos1,pos2;    pos1=mboxpos[idx];    pos2=mboxpos[idx+1];    if( pos1==-1 || pos2==-1 ) return null;    int size=((int)pos2-(int)pos1);    try    {      RandomAccessFile r=new RandomAccessFile(filename,"r");      r.seek(pos1);      byte [] buff=new byte[size];      int rd=r.read(buff);      if( rd>=0 )        ret=new String(buff);      else ret=null;      r.close();    }    catch(IOException ie)    {      ie.printStackTrace();    }    return ret;  }  private void buildTokenList(String mail,Vector tokenList) throws LexerException  {    String line;    if( mail==null ) return;    StringTokenizer st=new StringTokenizer(mail,"\n");    while (st.hasMoreTokens())    {      //line=(String)lineList.elementAt(i);      line = st.nextToken();      if (LexerUtils.isMboxHeader(line))      {        numMessages++;        //System.out.println("Message #"+numMessages+" / "+line);        continue;      }      if (LexerUtils.isBase64(line) ||          LexerUtils.isIgnoredHeader(line) ||          LexerUtils.isMailerId(line) ||          LexerUtils.isMimeBoundary(line) ||          LexerUtils.isSpamText(line)) continue;      // I know, it should be a StringBuffer.. Unfortunately      // what I'd gain with append() I'd loose with plenty      // of toString() back and forth.      String token = "";      for (int j = 0; j < line.length(); j++)      {        String ch = line.substring(j, j + 1);        if (UnicodeCharacterUtils.getUtils().isAlphaNumeric(ch)) token += ch;        else        {          token = token.toLowerCase();          int len = token.length();          if (len >= 3 && len <= 20)          {            if (LexerUtils.isHtmlTag(token) ||                LexerUtils.isHtmlComment(token) ||                LexerUtils.isSmtpId(token) ||                LexerUtils.isBoundaryEqual(token) ||                LexerUtils.isNameEqual(token) ||                LexerUtils.isNumber(token) ||                LexerUtils.isFileNameEqual(token))            {              token = "";              continue;            }            else            {              int pos = token.length() - 1;              while (pos >= 3 && !LexerUtils.isWordendChar(token.charAt(pos)))                  pos--;              token = token.substring(0, pos + 1);              if (LexerUtils.isNumber(token))                continue;              pos = 0;              if( !LexerUtils.isIpAddr(token) )              {                while( pos<token.length() && UnicodeCharacterUtils.getUtils().isNumeric(token.charAt(pos)) ) pos++;                token=token.substring(pos,token.length());              }              token=token.trim();              if( token.equals("") ) continue;              tokenList.addElement(token);            }          }          token = "";        }      }    }  }  private void buildTokenListFile() throws LexerException  {    // This vector on huge mailboxes has the tendency to grow very large.    // This could be a problem if you haven't got enough memory.    tokenList=new Vector();    for( int i=0; mboxpos[i]!=-1; i++ )    {      String mail=getMailMessage(i);      if( mail==null ) break;      buildTokenList(mail,tokenList);      float progress=((float)i/(float)(count))*100F;      System.out.print("\rNow doing lexer analisys... ["+((int)progress+1)+"% done]");    }    tokenMax=tokenList.size();    tokenPos=0;  }  public Lexer( String filename,int mboxtype ) throws LexerException  {    this.mboxtype=mboxtype;    this.filename=filename;    this.fromFile=true;    System.out.print("Indexing mbox file...");    BoyerMooreStringSearch bmi=new BoyerMooreStringSearch("From ");    bmi.setFile(filename,StandBayeMail.getFileMode());    Vector v=bmi.match();    System.out.println(" [done]");    System.out.print("Searching mail messages...");    int count=0;    mboxpos=new int[v.size()+2];    for( int i=0; i<mboxpos.length; i++ ) mboxpos[i]=-1;    try    {      RandomAccessReader raf = new RandomAccessReader(filename, StandBayeMail.getFileMode());      Iterator i=v.iterator();      while( i.hasNext() )      {        Integer pos=(Integer)i.next();        raf.seek(pos.longValue());        String s=raf.readLine();        if( LexerUtils.isMboxHeader(s) )        {          mboxpos[count]=pos.intValue();          count++;        }      }      mboxpos[count]=(int)raf.length();      raf.close();      System.out.println(" [done]");    }    catch (IOException ie)    {      ie.printStackTrace();    }    System.out.println("#"+count+" mails found");    this.count=count;    //Garbage-collector friendly instructions    v=null;    bmi=null;    buildTokenListFile();  }  public Lexer( String msgbody ) throws LexerException  {    tokenList=new Vector();    try    {      buildTokenList(msgbody,tokenList);      tokenMax=tokenList.size();      tokenPos=0;    }    catch( Exception ex )    {      if( ex instanceof LexerException )        throw (LexerException)ex;      ex.printStackTrace();    }  }  public int getMessageNum()  {    return numMessages;  }  // UNIT TEST  public static void main( String args[] )  {    try    {      Lexer a=new Lexer( args[0] , MAIL_TYPE_MAILBOX );    }    catch( Exception e)    {      e.printStackTrace();    }  }}
💿 文件大小 359 K
👤 上传用户 zhangpeng
📂 所属分类 Java编程
🏷️ 相关标签

#StandBayeMail
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -