// gismodel.java
//
// From "Spam Filter Source Code" · Java code · 336 lines in total
// JAVA
// 336 lines
///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2004 Jason Baldridge, Gann Bierner, and Tom Morton
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////
package opennlp.maxent;

import java.text.DecimalFormat;

/**
 * A maximum entropy model which has been trained using the Generalized
 * Iterative Scaling procedure (implemented in GIS.java).
 *
 * <p>Evaluation keeps its per-call scratch data in local variables, so one
 * <code>eval</code> call never overwrites the working state of another. The
 * parameter tables themselves are shared and are modified in place by
 * <code>AdjustMailModel</code>.
 *
 * @author      Tom Morton and Jason Baldridge
 * @version     $Revision: 1.13 $, $Date: 2004/06/11 20:51:44 $
 */
public final class GISModel implements MaxentModel {
  /**
   * Mapping between outcomes and parameter values for each context.
   * The integer representation of the context can be found using <code>pmap</code>.
   */
  private final TIntParamHashMap[] params;
  /** Mapping between predicates/contexts and an integer representing them. */
  private final TObjectIndexHashMap pmap;
  /** The names of the outcomes, indexed by outcome id. */
  private final String[] ocNames;
  /** Correction constant C. */
  private final double correctionConstant;
  /** Correction parameter. */
  private final double correctionParam;

  /** Number of outcomes. */
  private final int numOutcomes;
  /** Natural log of <code>1 / numOutcomes</code>; the starting sum of every outcome. */
  private final double iprob;
  /** Reciprocal of the correction constant. */
  private final double fval;

  /**
   * Builds a model from trained parameters.
   *
   * @param _params             per-predicate parameter tables; the table at
   *                            index <code>i</code> belongs to <code>predLabels[i]</code>.
   * @param predLabels          predicate names; each one is registered under its array index.
   * @param _ocNames            outcome names, indexed by outcome id.
   * @param _correctionConstant the correction constant of the model.
   * @param _correctionParam    the correction parameter of the model.
   */
  public GISModel(TIntParamHashMap[] _params,
                  String[] predLabels,
                  String[] _ocNames,
                  double _correctionConstant,
                  double _correctionParam) {

    // Register every predicate name under its position in predLabels.
    pmap = new TObjectIndexHashMap(predLabels.length);
    for (int i = 0; i < predLabels.length; i++) {
      pmap.put(predLabels[i], i);
    }

    // The arrays are stored as given (no defensive copy).
    params = _params;
    ocNames = _ocNames;
    correctionConstant = _correctionConstant;
    correctionParam = _correctionParam;

    numOutcomes = ocNames.length;
    iprob = Math.log(1.0 / numOutcomes);
    fval = 1.0 / correctionConstant;
  }

  /**
   * Use this model to evaluate a context and return an array of the
   * likelihood of each outcome given that context.
   *
   * @param context The names of the predicates which have been observed at
   *                the present decision point.
   * @return        The normalized probabilities for the outcomes given the
   *                context. The indexes of the double[] are the outcome
   *                ids, and the actual string representation of the
   *                outcomes can be obtained from the method getOutcome(int i).
   */
  public final double[] eval(String[] context) {
    return eval(context, new double[numOutcomes]);
  }

  /**
   * Use this model to evaluate a weighted context and return an array of the
   * likelihood of each outcome given that context.
   *
   * @param context The predicates (name and weight) which have been observed
   *                at the present decision point.
   * @return        The normalized probabilities for the outcomes, indexed by
   *                outcome id.
   */
  public final double[] eval(Predicate[] context) {
    return eval(context, new double[numOutcomes]);
  }

  /**
   * Use this model to evaluate a context and return an array of the
   * likelihood of each outcome given that context.
   *
   * @param context The names of the predicates which have been observed at
   *                the present decision point. Names unknown to the model
   *                are skipped.
   * @param outsums This is where the distribution is stored; it must hold at
   *                least <code>getNumOutcomes()</code> elements.
   * @return        The normalized probabilities for the outcomes given the
   *                context (the <code>outsums</code> array itself). The
   *                indexes of the double[] are the outcome ids, and the
   *                actual string representation of the outcomes can be
   *                obtained from the method getOutcome(int i).
   */
  public final double[] eval(String[] context, double[] outsums) {
    // Per-call scratch array, so overlapping evaluations cannot clobber each other.
    int[] featureCounts = new int[numOutcomes];
    resetSums(outsums);
    for (int i = 0; i < context.length; i++) {
      // An unweighted predicate contributes its parameter unchanged (weight 1.0).
      accumulate(pmap.get(context[i]), 1.0, outsums, featureCounts);
    }
    return normalize(outsums, featureCounts);
  }

  /**
   * Use this model to evaluate a weighted context and return an array of the
   * likelihood of each outcome given that context. Each parameter is
   * multiplied by the weight of the predicate that activated it.
   *
   * @param context The predicates (name and weight) which have been observed
   *                at the present decision point. Names unknown to the model
   *                are skipped.
   * @param outsums This is where the distribution is stored; it must hold at
   *                least <code>getNumOutcomes()</code> elements.
   * @return        The normalized probabilities for the outcomes, indexed by
   *                outcome id (the <code>outsums</code> array itself).
   */
  public double[] eval(Predicate[] context, double[] outsums) {
    // Per-call scratch array, so overlapping evaluations cannot clobber each other.
    int[] featureCounts = new int[numOutcomes];
    resetSums(outsums);
    for (int i = 0; i < context.length; i++) {
      accumulate(pmap.get(context[i].word), context[i].weight, outsums, featureCounts);
    }
    return normalize(outsums, featureCounts);
  }

  /** Sets the sum of every outcome to its starting value <code>iprob</code>. */
  private void resetSums(double[] outsums) {
    for (int oid = 0; oid < numOutcomes; oid++) {
      outsums[oid] = iprob;
    }
  }

  /**
   * Adds the weighted parameters of one predicate to the running sums and
   * counts one active feature for every outcome the predicate has a parameter for.
   * A negative <code>contextIndex</code> (predicate not in the model) is ignored.
   */
  private void accumulate(int contextIndex, double weight,
                          double[] outsums, int[] featureCounts) {
    if (contextIndex < 0) {
      return;
    }
    TIntParamHashMap predParams = params[contextIndex];
    int[] activeOutcomes = predParams.keys();
    for (int j = 0; j < activeOutcomes.length; j++) {
      int oid = activeOutcomes[j];
      featureCounts[oid]++;
      outsums[oid] += predParams.get(oid) * weight;
    }
  }

  /**
   * Turns the accumulated sums into a distribution: exponentiates each scaled
   * sum together with its correction term, then divides by the total.
   */
  private double[] normalize(double[] outsums, int[] featureCounts) {
    double normal = 0.0;
    for (int oid = 0; oid < numOutcomes; oid++) {
      double correction =
          (1.0 - (featureCounts[oid] / correctionConstant)) * correctionParam;
      outsums[oid] = Math.exp((outsums[oid] * fval) + correction);
      normal += outsums[oid];
    }
    for (int oid = 0; oid < numOutcomes; oid++) {
      outsums[oid] /= normal;
    }
    return outsums;
  }

  /**
   * Return the name of the outcome corresponding to the highest likelihood
   * in the parameter ocs. On ties the outcome with the lowest id wins.
   *
   * @param ocs A double[] as returned by the eval(String[] context)
   *            method.
   * @return    The name of the most likely outcome.
   */
  public final String getBestOutcome(double[] ocs) {
    int best = 0;
    for (int i = 1; i < ocs.length; i++) {
      if (ocs[i] > ocs[best]) {
        best = i;
      }
    }
    return ocNames[best];
  }

  /**
   * Return a string matching all the outcome names with all the
   * probabilities produced by the <code>eval(String[] context)</code>
   * method, in the form
   * <code>outcome1[prob1]  outcome2[prob2]  ...</code>.
   *
   * @param ocs A <code>double[]</code> as returned by the
   *            <code>eval(String[] context)</code>
   *            method.
   * @return    String containing outcome names paired with the normalized
   *            probability (contained in the <code>double[] ocs</code>)
   *            for each one, or an explanatory message when the array length
   *            does not match the number of outcomes of this model.
   */
  public final String getAllOutcomes(double[] ocs) {
    if (ocs.length != ocNames.length) { // lengths differ: not produced by this model
      return "The double array sent as a parameter to GISModel.getAllOutcomes() must not have been produced by this model.";
    }

    // A local formatter: DecimalFormat instances must not be shared between threads.
    DecimalFormat formatter = new DecimalFormat("0.0000");
    StringBuilder sb = new StringBuilder(ocs.length * 16);
    for (int i = 0; i < ocs.length; i++) {
      if (i > 0) {
        sb.append("  ");
      }
      sb.append(ocNames[i]).append("[").append(formatter.format(ocs[i])).append("]");
    }
    return sb.toString();
  }

  /**
   * Return the name of an outcome corresponding to an int id.
   *
   * @param i An outcome id.
   * @return  The name of the outcome associated with that id.
   */
  public final String getOutcome(int i) {
    return ocNames[i];
  }

  /**
   * Gets the index associated with the String name of the given outcome.
   *
   * @param outcome the String name of the outcome for which the
   *          index is desired
   * @return the index if the given outcome label exists for this
   * model, -1 if it does not.
   **/
  public int getIndex(String outcome) {
    for (int i = 0; i < ocNames.length; i++) {
      if (ocNames[i].equals(outcome)) {
        return i;
      }
    }
    // No outcome carries that name.
    return -1;
  }

  /**
   * Returns the number of outcomes of this model.
   *
   * @return the number of outcome names the model was built with.
   */
  public int getNumOutcomes() {
    return numOutcomes;
  }

  /**
   * Provides the fundamental data structures which encode the maxent model
   * information.  This method will usually only be needed by
   * GISModelWriters.  The following values are held in the Object array
   * which is returned by this method:
   *
   * <ul>
   * <li>index 0: TIntParamHashMap[] containing the model parameters
   * <li>index 1: TObjectIndexHashMap containing the mapping of model predicates to unique integers
   * <li>index 2: java.lang.String[] containing the names of the outcomes,
   *            stored in the index of the array which represents their unique ids in the model.
   * <li>index 3: java.lang.Double containing the value of the models correction constant
   * <li>index 4: java.lang.Double containing the value of the models correction parameter
   * </ul>
   *
   * The first three entries are the model's own objects, not copies.
   *
   * @return An Object[] with the values as described above.
   */
  public final Object[] getDataStructures() {
    Object[] data = new Object[5];
    data[0] = params;
    data[1] = pmap;
    data[2] = ocNames;
    data[3] = Double.valueOf(correctionConstant);
    data[4] = Double.valueOf(correctionParam);
    return data;
  }

  /**
   * Adjusts the parameters of one predicate in place so that it favours one
   * of the two outcomes "S" and "L" more strongly.
   *
   * <p>The outcome to favour receives an adjustment of
   * <code>|p| * step + 1</code> and the other one an adjustment of
   * <code>-|p| * step - 1</code>, where <code>p</code> is that outcome's
   * parameter value read before any change. Both adjustments are handed to
   * <code>TIntParamHashMap.adjustValue</code> (presumably added to the stored
   * value; confirm against that class).
   *
   * <p>Problems are reported on standard output and leave the model
   * untouched: an unknown predicate, a <code>mode</code> other than 1 or -1,
   * or a first outcome name that is neither "S" nor "L".
   *
   * @param pred2Adjust the predicate whose parameters are to be adjusted;
   *                    only its <code>word</code> is used.
   * @param mode        1 to move a mail the system judged as S towards L,
   *                    -1 to move a mail the system judged as L towards S.
   * @param step        the multiplier applied to the current parameter magnitude.
   */
  public final void AdjustMailModel(Predicate pred2Adjust, int mode, double step)
  {
    int predIndex = pmap.get(pred2Adjust.word); // is the predicate known to the model?
    if (predIndex < 0) {
      System.out.println("The Specified Predicate to adjust does not exist!");
      return;
    }

    TIntParamHashMap predParams = params[predIndex];
    // Snapshot the current parameter values of outcome ids 0 and 1.
    double[] current = new double[2];
    current[0] = predParams.get(0);
    current[1] = predParams.get(1);
    // Diagnostic output of the values before adjustment.
    for (int i = 0; i < current.length; i++) {
      System.out.println(current[i]);
    }

    // Work out which outcome id is "S" and which is "L".
    final int sId;
    final int lId;
    if (ocNames[0].equals("S")) {
      sId = 0;
      lId = 1;
    }
    else if (ocNames[0].equals("L")) {
      lId = 0;
      sId = 1;
    }
    else {
      System.out.println("AdjustMailModel Error!");
      return;
    }

    // Decide which outcome gets strengthened and which gets weakened.
    final int raiseId;
    final int lowerId;
    if (mode == 1) { // S -> L: strengthen L, weaken S
      raiseId = lId;
      lowerId = sId;
    }
    else if (mode == -1) { // L -> S: strengthen S, weaken L
      raiseId = sId;
      lowerId = lId;
    }
    else {
      System.out.println("AdjustMailModel Error!");
      return;
    }

    predParams.adjustValue(raiseId, Math.abs(current[raiseId]) * step + 1);
    predParams.adjustValue(lowerId, -Math.abs(current[lowerId]) * step - 1);
  }

}
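// gismodel.java - source notes
//
// This page shows the gismodel.java source file from "Spam Filter Source Code", written in the Java programming language, 336 lines of code in total. You can read the complete code online, or return to the resource details page to download the full source package for local study and development.
// The Chongchong download site hosts a large number of technical resources related to mail filtering, including source code, technical documents and circuit diagrams; it is a professional learning platform for electronics engineers and embedded developers.
// Keyboard shortcuts
//
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -
// Show shortcuts: ?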