jasenmap.java

来自「spam source codejasen-0.9jASEN - java An」· Java 代码 · 共 229 行

JAVA
229
字号
/*
 * @(#)JasenMap.java	3/11/2004
 *
 * Copyright (c) 2004, 2005  jASEN.org
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the distribution.
 *
 *   3. The names of the authors may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *   4. Any modification or additions to the software must be contributed back
 *      to the project.
 *
 *   5. Any investigation or reverse engineering of source code or binary to
 *      enable emails to bypass the filters, and hence inflict spam and or viruses
 *      onto users who use or do not use jASEN could subject the perpetrator to
 *      criminal and or civil liability.
 *
 * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESSED OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JASEN.ORG,
 * OR ANY CONTRIBUTORS TO THIS SOFTWARE BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
 * OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
package org.jasen.core.engine;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

/**
 * <P>
 * 	This class represents the core data for the chi probability calculations used by jASEN.
 * </P>
 * <p>
 * 	During training, the engine will tokenize each spam or ham email into meaningful String tokens.
 *  Each token is then classified and a record added (or updated) in the JasenMap.
 * </p>
 * <p>
 * 	Once training is complete, the map holds all the pertinent information about all tokens (words) ever seen
 * 	by the engine.  This information is then accessed by the engine during a scan.
 * </p>
 * <p>
 * <strong>NOTE:</strong>
 * <br/>
 * For performance reasons the data contained in a map is loaded <em>into memory</em> by the engine.  Obviously
 * this means the bigger the data file (token map), the more memory is required to operate the engine.
 * </p>
 * <p>
 * Accurate message scanning can only be achieved with a well populated map, ideally generated from a wide
 * variety of spam/ham emails.  It is recommended that the training corpus for each type of email be not less than 1000
 * emails, and preferably not less than 5000.
 * </p>
 *
 * @author Jason Polites
 */
public class JasenMap implements Serializable
{
    static final long serialVersionUID = -2956615593555961715L;

    /** Type constant passed to the add/increment methods for ham. */
    public static final int HAM = 0;

    /** Type constant passed to the add/increment methods for spam. */
    public static final int SPAM = 1;

    private int hamObservations;
    private int spamObservations;

    /**
     * Token records keyed on the word they were created for.
     * May be null if a caller passed null to {@link #setTokens(Map)};
     * every accessor in this class tolerates that state.
     */
    private Map tokens;

    /**
     * Creates a map with an empty token table and both observation counters at zero.
     */
    public JasenMap() {
        tokens = new HashMap();
    }

    /**
     * Adds a token to the map.
     * <p>
     * If no token is stored under the key a new JasenToken is created first.  The
     * ham or spam counter of the token is then incremented and the token is stored
     * under the key.
     * </p>
     * <p>
     * If the token map is currently null (see {@link #setTokens(Map)}) a fresh map
     * is created before the token is added.
     * </p>
     * @param key The word token
     * @param type One of JasenMap.HAM or JasenMap.SPAM.  Any value other than
     * JasenMap.HAM is counted as spam.
     */
    public void addToken(String key, int type) {

        if(tokens == null) {
            // setTokens(null) leaves the map unset; recreate it here rather than
            // failing with a NullPointerException, in line with getToken.
            tokens = new HashMap();
        }

        JasenToken token = (JasenToken)tokens.get(key);

        if(token == null) {
            token = new JasenToken();
        }

        if(type == HAM) {
            token.incrementHam();
        }
        else
        {
            token.incrementSpam();
        }

        tokens.put(key, token);
    }

    /**
     * Gets the token associated with the key.
     * @param key The key, or word, to which this token corresponds
     * @return The token matching the key, or null if no such token exists
     * (including when the token map itself is null)
     */
    public JasenToken getToken(String key) {
        if(tokens == null) {
            return null;
        }

        return (JasenToken)tokens.get(key);
    }

    /**
     * Called each time a new observation is made
     * @param type One of JasenMap.HAM or JasenMap.SPAM.  Any value other than
     * JasenMap.HAM is counted as a spam observation.
     */
    public void incrementObservations(int type) {
        if(type == HAM) {
            hamObservations++;
        }
        else
        {
            spamObservations++;
        }
    }

    /**
     * Gets the total number of observations in the map, both spam and ham
     * @return SPAM + HAM number of observations
     */
    public int getTotalObservations() {
        return hamObservations + spamObservations;
    }

    /**
     * Gets the number of times a ham message (observation) has been encountered
     * @return An integer representing the number of ham messages in the map
     */
    public int getHamObservations() {
        return hamObservations;
    }

    /**
     * Sets the number of ham observations
     * @param hamObservations The number of ham observations in the map
     */
    public void setHamObservations(int hamObservations) {
        this.hamObservations = hamObservations;
    }

    /**
     * Gets the number of spam observations. That is, the number of messages represented in the map that were spam
     * @return An integer representing the number of spam observations in the map
     */
    public int getSpamObservations() {
        return spamObservations;
    }

    /**
     * Sets the number of spam observations
     * @param spamObservations The number of spam observations in the map
     */
    public void setSpamObservations(int spamObservations) {
        this.spamObservations = spamObservations;
    }

    /**
     * Gets reference to the entire map of tokens.
     * Each element in the map is keyed on a word, and tied to a JasenToken object.
     * The returned object is the internal map itself, not a copy.
     * @return The total token map, or null if it was set to null via {@link #setTokens(Map)}
     * @see JasenToken
     */
    public Map getTokens() {
        return tokens;
    }


    /**
     * Sets the token map.  The given map is stored by reference, not copied.
     * @param tokens The token map to set.
     */
    public void setTokens(Map tokens) {
        this.tokens = tokens;
    }

    /**
     * Returns an iterator for the token keys.
     * <p>
     * Actual tokens must still be obtained via the getToken method
     * </p>
     * @return An Iterator into the keyset of the token map.
     * Each value returned by Iterator will be a String representing the key of the JasenToken object.
     * If the token map is null, an iterator over no elements is returned.
     * @see JasenMap#getToken(String)
     */
    public Iterator iterator() {
        if(tokens == null) {
            // No map to iterate: hand back an empty iterator without touching state.
            return new HashMap().keySet().iterator();
        }

        return tokens.keySet().iterator();
    }

    /**
     * Gets the number of tokens in the map
     * @return The number of tokens, or 0 if the token map is null
     */
    public int size() {
        if(tokens == null) {
            return 0;
        }

        return tokens.size();
    }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?