jasenmap.java
来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 229 行
JAVA
229 行
/*
* @(#)JasenMap.java 3/11/2004
*
* Copyright (c) 2004, 2005 jASEN.org
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* 3. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* 4. Any modification or additions to the software must be contributed back
* to the project.
*
* 5. Any investigation or reverse engineering of source code or binary to
* enable emails to bypass the filters, and hence inflict spam and or viruses
* onto users who use or do not use jASEN could subject the perpetrator to
* criminal and or civil liability.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
* OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
* OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
package org.jasen.core.engine;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* <P>
* This class represents the core data for the chi probability calculations used by jASEN.
* </P>
* <p>
* During training, the engine will tokenize each spam or ham email into meaningful String tokens.
* Each token is then classified and a record added (or updated) in the JasenMap.
* </p>
* <p>
* Once training is complete, the map holds all the pertinent information about all tokens (words) ever seen
* by the engine. This information is then accessed by the engine during a scan.
* </p>
* <p>
* <strong>NOTE:</strong>
* <br/>
* For performance reasons the data contained in a map is loaded <em>into memory</em> by the engine. Obviously
* this means the bigger the data file (token map), the more memory is required to operate the engine.
* </p>
* <p>
* Accurate message scanning can only be achieved with a well populated map, ideally generated from a wide
* variety of spam/ham emails. It is recommended that the training corpus for each type of email be not less than 1000
* emails, and preferably not less than 5000.
* </p>
*
* @author Jason Polites
*/
public class JasenMap implements Serializable
{
    static final long serialVersionUID = -2956615593555961715L;

    /** Type constant identifying a ham token/observation. */
    public static final int HAM = 0;

    /** Type constant identifying a spam token/observation. */
    public static final int SPAM = 1;

    /** Number of ham observations recorded so far. */
    private int hamObservations;

    /** Number of spam observations recorded so far. */
    private int spamObservations;

    /**
     * Word (String) to JasenToken mapping.
     * May be null if a caller passed null to {@link #setTokens(Map)};
     * every accessor in this class tolerates that state.
     */
    private Map tokens;

    /**
     * Creates an empty map with zero observations and no tokens.
     */
    public JasenMap() {
        tokens = new HashMap();
    }

    /**
     * Adds a token to the map.
     * <p>
     * If no token is stored under the key a new JasenToken is created;
     * otherwise the existing one is reused. In both cases the counter
     * matching <code>type</code> is incremented on the token.
     * </p>
     * <p>
     * Any <code>type</code> other than JasenMap.HAM is counted as spam.
     * If the token map is currently null, a new empty map is created first.
     * </p>
     * @param key The word token
     * @param type One of JasenMap.HAM or JasenMap.SPAM
     */
    public void addToken(String key, int type) {
        // The map can be null after setTokens(null); recreate it rather than fail.
        if(tokens == null) {
            tokens = new HashMap();
        }

        JasenToken token = (JasenToken)tokens.get(key);

        if(token == null) {
            token = new JasenToken();
        }

        if(type == HAM) {
            token.incrementHam();
        }
        else
        {
            token.incrementSpam();
        }

        tokens.put(key, token);
    }

    /**
     * Gets the token associated with the key.
     * @param key The key, or word, to which this token corresponds
     * @return The token matching the key, or null if no such token exists
     * (including when the token map itself is null)
     */
    public JasenToken getToken(String key) {
        if(tokens == null) {
            return null;
        }

        return (JasenToken)tokens.get(key);
    }

    /**
     * Called each time a new observation is made.
     * <p>
     * Any <code>type</code> other than JasenMap.HAM is counted as spam.
     * </p>
     * @param type One of JasenMap.HAM or JasenMap.SPAM
     */
    public void incrementObservations(int type) {
        if(type == HAM) {
            hamObservations++;
        }
        else
        {
            spamObservations++;
        }
    }

    /**
     * Gets the total number of observations in the map, both spam and ham
     * @return SPAM + HAM number of observations
     */
    public int getTotalObservations() {
        return hamObservations + spamObservations;
    }

    /**
     * Gets the number of times a ham message (observation) has been encountered
     * @return An integer representing the number of ham messages in the map
     */
    public int getHamObservations() {
        return hamObservations;
    }

    /**
     * Sets the number of ham observations
     * @param hamObservations The number of ham observations in the map
     */
    public void setHamObservations(int hamObservations) {
        this.hamObservations = hamObservations;
    }

    /**
     * Gets the number of spam observations. That is, the number of messages
     * represented in the map that were spam
     * @return An integer representing the number of spam observations in the map
     */
    public int getSpamObservations() {
        return spamObservations;
    }

    /**
     * Sets the number of spam observations
     * @param spamObservations The number of spam observations in the map
     */
    public void setSpamObservations(int spamObservations) {
        this.spamObservations = spamObservations;
    }

    /**
     * Gets a reference to the entire map of tokens.
     * Each element in the map is keyed on a word, and tied to a JasenToken object.
     * The internal map itself is returned, not a copy.
     * @return The total token map; null if null was passed to setTokens
     * @see JasenToken
     */
    public Map getTokens() {
        return tokens;
    }

    /**
     * Sets the token map. The reference is stored as-is, not copied.
     * @param tokens The token map to set.
     */
    public void setTokens(Map tokens) {
        this.tokens = tokens;
    }

    /**
     * Returns an iterator for the token keys.
     * <p>
     * Actual tokens must still be obtained via the getToken method
     * </p>
     * @return An Iterator into the keyset of the token map.
     * Each value returned by Iterator will be a String representing the key of the JasenToken object.
     * If the token map is null, an iterator over no elements is returned.
     * @see JasenMap#getToken(String)
     */
    public Iterator iterator() {
        if(tokens == null) {
            // Nothing to iterate; hand back an iterator over a throwaway empty key set.
            return new HashMap().keySet().iterator();
        }

        return tokens.keySet().iterator();
    }

    /**
     * Gets the number of tokens in the map
     * @return The number of tokens, or 0 if the token map is null
     */
    public int size() {
        if(tokens == null) {
            return 0;
        }

        return tokens.size();
    }
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?