📄 gistrainer.java~88~

📁 垃圾邮件过滤器源代码
💻 JAVA~88~
📖 第 1 页 / 共 2 页
字号:
12 下一页
/////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2001 Jason Baldridge and Gann Bierner
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////
package opennlp.maxent;

import gnu.trove.*;

/**
 * An implementation of Generalized Iterative Scaling.  The reference paper
 * for this implementation was Adwait Ratnaparkhi's tech report at the
 * University of Pennsylvania's Institute for Research in Cognitive Science,
 * and is available at <a href ="ftp://ftp.cis.upenn.edu/pub/ircs/tr/97-08.ps.Z"><code>ftp://ftp.cis.upenn.edu/pub/ircs/tr/97-08.ps.Z</code></a>.
 * GIS算法的实现过程.参考文献:Adwait Ratnaparkhi的文章
 * @author  Jason Baldridge
 * @version $Revision: 1.15 $, $Date: 2004/06/14 20:52:41 $
 */
class GISTrainer {

  // Smoothing flag.  Smoothing can improve model accuracy, though training
  // will potentially take longer and use more memory.  Model size will also
  // be larger.  Initial testing indicates improvements for models built on
  // small data sets and few outcomes, but performance degradation for those
  // with large data sets and lots of outcomes.
  private boolean _simpleSmoothing = false;

  // Newly added parameters; the original author left their purpose
  // undocumented.  sigma is the divisor used by updateParamsWithSmoothing.
  // NOTE(review): sigma looks like the variance of a Gaussian prior -- confirm.
  private boolean _useSlackParameter = false;
  private double sigma = 2.0;

  // If we are using smoothing, this is used as the "number" of
  // times we want the trainer to imagine that it saw a feature that it
  // actually didn't see.  Defaulted to 0.1.
  private double _smoothingObservation = 0.1;
  // Whether progress messages are shown during training.
  private boolean printMessages = false;

  private int numTokens; // # of event tokens (number of distinct events)
  private int numPreds; // # of predicates (number of distinct predicates)
  private int numOutcomes; // # of outcomes (number of possible outcomes)
  /** A global index variable for Tokens (events). */
  private int TID;
  /** A global index variable for Predicates. */
  private int PID;
  /** A global index variable for Outcomes. */
  private int OID;
  /* A global index variable for Weights. */
  private int WID;

  /** Records the array of predicates seen in each event. */
  /* The first dimension is indexed by event (its length is the number of
   * events); the second dimension has one entry per predicate in that event
   * (events may have different numbers of predicates).  Each element is the
   * index of a predicate that appears in the event. */
  private int[][] contexts;
  /* Records the predicate weights of each event.  Same layout as contexts:
   * the first dimension is indexed by event, the second has one entry per
   * predicate in that event.  Each element is the index of the weight that
   * belongs to the corresponding predicate; per the original note these are
   * produced in AbstractDataIndexer. */
  private int[][] weights;

  /** Records the array of outcomes seen in each event. */
  // Its length is the number of distinct events in the training set.  Each
  // element is the integer index of that event's outcome; the string form of
  // the outcome is stored in outcomeLabels.
  private int[] outcomes;

  // An array introduced in a newer version; it was used once while building
  // predcount[], but it plays exactly the same role as outcomes above, so it
  // is unnecessary and was left disabled.
  // private int[] outcomeList;

  // Records the number of times an event has been seen, paired to
  // int[][] contexts.  An element greater than 1 means the event was
  // repeated.  The array size is the number of distinct events.
  private int[] numTimesEventsSeen;

  /** Stores the String names of the outcomes.  The GIS only tracks outcomes
   as ints, and so this array is needed to save the model to disk and
   thereby allow users to know what the outcome was in human understandable terms. */
  private String[] outcomeLabels;

  /** Stores the String names of the predicates. The GIS only tracks
   predicates as ints, and so this array is needed to save the model to
   disk and thereby allow users to know what the outcome was in human
   understandable terms. */
  private String[] predLabels;

  // The actual double values of the weights; the entries of the weights
  // array above are indices into this array.
  private double[] weightLabels;

  /** Stores the observed expected values of the features based on training data. */
  private TIntParamHashMap[] observedExpects;

  /** Stores the estimated parameter value of each predicate during iteration. */
  private TIntParamHashMap[] params;

  /** Stores the expected values of the features based on the current model. */
  private TIntParamHashMap[] modelExpects;

  /** The maximum number of features fired in an event. Usually referred to as C. */
  // This is the constant C of the GIS algorithm (a correction feature may
  // have to be introduced).  It was originally an int:
  // private int constant;
  private double constant;
  /**  Stores inverse of constant, 1/C. */
  private double constantInverse;

  /** The correction parameter of the model. */
  private double correctionParam;
  /** Observed expectation of correction feature. */
  private double cfObservedExpect;
  /** A global variable for the model's expected value of the correction feature. */
  private double CFMOD;

  private final double NEAR_ZERO = 0.01;
  // Stopping threshold: per the original note, iteration stops when the
  // difference between the current and the previous iteration falls below
  // this value.  NOTE(review): the original note speaks of a parameter
  // difference while the name suggests log-likelihood -- confirm at the use site.
  private final double LLThreshold = 0.0001;

  /** Stores the output of the current model on a single event during
   *  training.  This will be reset for every event for every iteration.  */
  double[] modelDistribution;
  /** Stores the number of features that get fired per event. */
  // Events are allowed to fire different numbers of features.
  int[] numfeats;
  /** Initial probability for all outcomes.  Per the original note, all
   *  outcomes start with equal probability, following the maximum entropy
   *  principle. */
  double iprob;

  /**
   * Value function that ignores its input and always yields zero.  Per the
   * original note it is used to reset every value of a TIntDoubleHashMap
   * back to 0.
   */
  private TDoubleFunction backToZeros = new TDoubleFunction() {
    public double execute(double ignored) {
      return 0.0d;
    }
  };

  /**
   * Updates the expected values of the features based on the
   * modelDistribution for the current event.
   *
   * For the entry (oid, arg) of the current predicate's map, stores
   * arg + modelDistribution[oid] * numTimesEventsSeen[TID] back into
   * modelExpects[PID].  The globals PID, TID and modelDistribution must be
   * set by the caller before this procedure is applied.  Per the original
   * note this is the formula on the second-to-last line of page 14 of
   * Ratnaparkhi's report.
   *
   * Always returns true so that iteration over the map continues.
   */
  private TIntDoubleProcedure updateModelExpect = new TIntDoubleProcedure() {
    public boolean execute(int oid, double arg) {
      // No debug output here: this runs once per visited entry, and PID is a
      // global predicate index, not a position inside the per-event
      // weights[TID] array, so indexing weights[TID][PID] is not safe.
      // NOTE(review): an earlier commented-out fragment hinted at scaling the
      // increment by weightLabels[...]; that scaling is intentionally not
      // applied, exactly as before.
      modelExpects[PID].put(oid,
                            arg +
                            (modelDistribution[oid] * numTimesEventsSeen[TID]));
      return true;
    }
  };

  /**
   * Updates the params based on the newly computed model expected values.
   *
   * In log space the GIS update (formula 2.5 on page 14 of Ratnaparkhi's
   * report, per the original note) is
   *   param += log(observed expectation) - log(model expectation).
   * The 1/C exponent of the update turns into a multiplication once logs
   * are taken; it is not applied here.  Per the original note it is applied
   * elsewhere (in eval, through constantInverse) to save multiplications.
   *
   * Always returns true so that iteration over the map continues.
   */
  private TIntDoubleProcedure updateParams = new TIntDoubleProcedure() {
    public boolean execute(int oid, double currentParam) {
      final double logObserved = Math.log(observedExpects[PID].get(oid));
      final double logModel = Math.log(modelExpects[PID].get(oid));
      final double correction = logObserved - logModel;
      params[PID].put(oid, currentParam + correction);
      return true;
    }
  };

  /**
   * Parameter update used as an alternative to updateParams (a newly added
   * procedure; the original author noted it is probably the update for
   * smoothed training).
   *
   * Runs at most 50 Newton steps on the scalar function
   *   f(x) = modelExpect * exp(constant * x) + (arg + x) / sigma - observedExpect
   * starting from x = 0, and then stores arg + x as the new parameter.
   * The iteration stops early when the derivative is exactly zero or when
   * two successive iterates differ by less than 1e-6.
   *
   * Always returns true so that iteration over the map continues.
   */
  private TIntDoubleProcedure updateParamsWithSmoothing = new
      TIntDoubleProcedure() {
    public boolean execute(int oid, double arg) {
      // Both expectations are constant for the whole Newton iteration.
      final double modelExpect = modelExpects[PID].get(oid);
      final double observedExpect = observedExpects[PID].get(oid);

      double delta = 0.0;
      for (int step = 0; step < 50; step++) {
        // check what domain these parameters are in
        final double scaled = modelExpect * Math.exp(constant * delta);
        final double value = scaled + (arg + delta) / sigma - observedExpect;
        final double slope = scaled * constant + 1 / sigma;
        if (slope == 0) {
          // A flat derivative gives no Newton step; keep the current delta.
          break;
        }
        final double next = delta - value / slope;
        final boolean converged = Math.abs(next - delta) < 0.000001;
        delta = next;
        if (converged) {
          break;
        }
      }
      params[PID].put(oid, arg + delta);
      return true;
    }
  };

  /**
   * Creates a new <code>GISTrainer</code> instance which does
   * not print progress messages about training to STDOUT.
   * 建立一个不输出信息的GISTrainer的实例
   */

  GISTrainer() {
    super();
  }

  /**
   * Creates a new <code>GISTrainer</code> instance.
   *
   * @param printMessages sends progress messages about training to
   *                      STDOUT when true; trains silently otherwise.
   */
  GISTrainer(boolean printMessages) {
    this();
    this.printMessages = printMessages;
  }

  /**
   * Turns smoothing on or off for subsequent training.
   * Smoothing can improve model accuracy, though training will potentially
   * take longer and use more memory.  Model size will also be larger.
   *
   * @param smooth true if smoothing is desired, false if not
   */
  public void setSmoothing(boolean smooth) {
    this._simpleSmoothing = smooth;
  }

  /**
   * Sets the smoothing observation: the pretend count assigned to a feature
   * that was never actually seen.  The field this writes defaults to 0.1.
   *
   * @param timesSeen the "number" of times we want the trainer to imagine
   *                  it saw a feature that it actually didn't see
   */
  public void setSmoothingObservation(double timesSeen) {
    this._smoothingObservation = timesSeen;
  }

  /**
   * Train a model using the GIS algorithm, first indexing the events of the
   * given stream with a <code>OnePassDataIndexer</code> and then delegating
   * to the indexer-based overload.
   *
   * @param eventStream The EventStream holding the data on which this model
   *                    will be trained.
   * @param iterations  The number of GIS iterations to perform.
   * @param cutoff      The number of times a predicate must be seen in order
   *                    to be relevant for training.
   * @return The newly trained model.
   */
  public GISModel trainModel(EventStream eventStream, int iterations,
                             int cutoff) {
    OnePassDataIndexer indexer = new OnePassDataIndexer(eventStream, cutoff);
    return trainModel(iterations, indexer);
  }

  /**
   * Train a model using the GIS algorithm on already indexed data.
   * This is the core of the algorithm.
   *
   * @param iterations  The number of GIS iterations to perform.
   * @param di The data indexer used to compress events in memory.  The event
   *           stream and the predicate cutoff are not passed here; they are
   *           given to the indexer when it is built (for example to
   *           <code>OnePassDataIndexer</code>).
   * @return The newly trained model, which can be used immediately or saved
   *         to disk using an opennlp.maxent.io.GISModelWriter object.
   */
  public GISModel trainModel(int iterations, DataIndexer di) {
    /************** Incorporate all of the needed info ******************/
    /* 数据整理 */
    display("Incorporating indexed data for training...  \n"); //输出处理信息
    contexts = di.getContexts();
    weights = di.getWeights();
    outcomes = di.getOutcomeList();
    numTimesEventsSeen = di.getNumTimesEventsSeen();
    numTokens = contexts.length;

    //printTable(contexts);
    // determine the correction constant and its inverse
    /* 调整常量及其倒数,表示所有事件中最大的特征数.GIS算法要求每个事件的特征函数之和为常数,
     *  不满足此条件时,选择所有特征函数之和的最大值,引入一个修正特征correctionfeature */
    // constant = contexts[0].length;
    constant = 0.0;
    weightLabels = di.getWeightLabels();
    double tmpConstant = 0.0;

    for (TID = 0; TID < contexts.length; TID++) {
      for (int i = 0; i < weights[TID].length; i++)
        tmpConstant += weightLabels[weights[TID][i]];
      if (tmpConstant > constant)
        constant = tmpConstant;
      tmpConstant = 0;
    }
    constantInverse = 1.0 / constant;

    display("done.\n");
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -