// gistrainer.java — GIS (Generalized Iterative Scaling) maximum-entropy model trainer (partial file).
outcomeLabels = di.getOutcomeLabels(); // 得到输出结果的字符串形式
//outcomeList = di.getOutcomeList();
numOutcomes = outcomeLabels.length;
iprob = Math.log(1.0 / numOutcomes); // 对每个输出结果的概率,平均概率
predLabels = di.getPredLabels(); // 得到断言的字符串形式
numPreds = predLabels.length; // 得到断言的个数
display("\tNumber of Event Tokens: " + numTokens + "\n");
display("\t Number of Outcomes: " + numOutcomes + "\n");
display("\t Number of Predicates: " + numPreds + "\n");
// 建立特征数组,第一维表示断言(整数形式),第二维表示输出结果
// 数组元素表示断言在各种输出结果中出现的次数(在整个训练集上的统计信息)
// 计算:num(ai,bj),形成特征-输出矩阵
double[][] predCount = new double[numPreds][numOutcomes];
for (TID = 0; TID < numTokens; TID++) {
for (int j = 0; j < contexts[TID].length; j++) {
predCount[contexts[TID][j]][outcomes[TID]]
+= numTimesEventsSeen[TID] * weightLabels[weights[TID][j]];
}
}
di = null; // don't need it anymore
// A fake "observation" to cover features which are not detected in the data.
// The default is to assume that we observed "1/10th" of a feature during training.
// 对没有出现的特征,假设出现次数为logSmoothingObservation(0.1)以平滑
final double smoothingObservation = _smoothingObservation;
// Get the observed expectations of the features. Strictly speaking,
// we should divide the counts by the number of Tokens, but because of
// the way the model's expectations are approximated in the
// implementation, this is cancelled out when we compute the next
// iteration of a parameter, making the extra divisions wasteful.
// 得到特征的经验期望值.严格来讲,应该把出现次数除以事件数,但因为实现过程中
// 模型的期望值是近似的,进行参数的迭代时会被抵偿掉,会使除法浪费,因此没有除法过程??
// 统计针对每个特征进行,因此所有的数据结构大小都是特征的个数
// HASH数组,每个数组元素都是一个HASH表
params = new TIntParamHashMap[numPreds];
modelExpects = new TIntParamHashMap[numPreds];
observedExpects = new TIntParamHashMap[numPreds];
// 为生成HASH映射准备????(不清楚此段代码的作用!!!)
int initialCapacity;
float loadFactor = (float) 0.9;
if (numOutcomes < 3) {
initialCapacity = 2;
loadFactor = (float) 1.0;
}
else if (numOutcomes < 5) {
initialCapacity = 2;
}
else {
initialCapacity = (int) numOutcomes / 2;
}
// 处理每个特征,由上面得到的特征数组,得到其经验期望值.
for (PID = 0; PID < numPreds; PID++) {
// 生成保存此特征的信息需要的数据结构HASH表
params[PID] = new TIntParamHashMap(initialCapacity, loadFactor); // 参数表
modelExpects[PID] = new TIntParamHashMap(initialCapacity, loadFactor); // 模型期望值
observedExpects[PID] = new TIntParamHashMap(initialCapacity, loadFactor); // 经验期望值
// 根据此特征在各种输出中的出现情况,为统计其期望值做初始化
for (OID = 0; OID < numOutcomes; OID++) {
if (predCount[PID][OID] > 0) { // (ai,bj)出现过,则将其相关参数插入经验期望HASH表,并初始化模型和参数HASH表
params[PID].put(OID, 0.0);
modelExpects[PID].put(OID, 0.0);
observedExpects[PID].put(OID, predCount[PID][OID]); //????
// observedExpects[PID].put(OID,Math.log(predCount[PID][OID])); 这是旧版本的处理方法!
}
else if (_simpleSmoothing) { // 没有出现,则根据平滑设置处理
params[PID].put(OID, 0.0);
modelExpects[PID].put(OID, 0.0);
observedExpects[PID].put(OID, smoothingObservation);
}
}
// 压缩相关HASH表
params[PID].compact();
modelExpects[PID].compact();
observedExpects[PID].compact();
}
// compute the expected value of correction 计算修正特征的期望值
if (_useSlackParameter) { // _useSlackParameter是新引入的参数,作用??
int cfvalSum = 0;
double curPredWeight = 0.0;
double curTokenSumWeight = 0.0;
for (TID = 0; TID < numTokens; TID++) { // 每个事件包含的特征都包含在contexts数组中
for (int j = 0; j < contexts[TID].length; j++) { //当前事件的每个特征;
PID = contexts[TID][j];
curPredWeight = weightLabels[weights[TID][j]];
curTokenSumWeight +=curPredWeight;
if (!modelExpects[PID].containsKey(outcomes[TID])) // HASH表中不包含,为修正特征??
cfvalSum += numTimesEventsSeen[TID] * curPredWeight;
}
// C-所有事件的特征函数之和 = 修正特征函数f(l)
cfvalSum += (constant - curTokenSumWeight) * numTimesEventsSeen[TID];
curTokenSumWeight = 0.0;
}
if (cfvalSum > 0 && cfvalSum < NEAR_ZERO) {
cfObservedExpect = Math.log(NEAR_ZERO); //nearly zero so log is defined
}
else {
// 上面的期望值初始化过程中旧版本用LOG,新版本中没有用LOG,此处要不要修改?????
cfObservedExpect = Math.log(cfvalSum); // 修正特征的期望值
}
correctionParam = 0.0; // 修正特征对应的参数
}
predCount = null; // don't need it anymore
display("...done.\n");
// 所有的初始化完毕,要进行迭代以得到模型参数
// 模型分布,估计一个事件对每个输出的概率
modelDistribution = new double[numOutcomes];
// 事件特征集对应每个输出的特征个数
numfeats = new int[numOutcomes];
/***************** Find the parameters ************************/
display("Computing model parameters...\n");
findParameters(iterations); // 进行迭代以得到模型参数
/*************** Create and return the model ******************/
return new GISModel(params, predLabels, outcomeLabels, (int)constant,
correctionParam);
}
/*
 * Estimate the model parameters by iterative scaling. Results are written
 * into the shared parameter structures rather than returned.
 */
private void findParameters(int iterations) {
    double previousLL = 0.0; // log-likelihood from the preceding pass
    double currentLL = 0.0;  // log-likelihood from the current pass
    display("Performing " + iterations + " iterations.\n");
    for (int iter = 1; iter <= iterations; iter++) {
        // Pad the iteration counter so the progress column lines up.
        if (iter < 10) {
            display(" " + iter + ": ");
        }
        else if (iter < 100) {
            display(" " + iter + ": ");
        }
        else {
            display(iter + ": ");
        }
        currentLL = nextIteration(); // one full scaling step; yields the new log-likelihood
        if (iter > 1) {
            // A drop in log-likelihood means the model is diverging — abort.
            if (previousLL > currentLL) {
                System.err.println("Model Diverging: loglikelihood decreased");
                break;
            }
            // Improvement below the threshold: converged, stop early.
            if (currentLL - previousLL < LLThreshold) {
                break;
            }
        }
        previousLL = currentLL; // keep for comparison on the next pass
    }
    // Release training-only structures; the learned parameters remain.
    observedExpects = null;
    modelExpects = null;
    numTimesEventsSeen = null;
    contexts = null;
}
/**
 * Evaluate a context with the current model: fill {@code outsums} with the
 * normalized probability of each outcome given that context.
 *
 * @param context the integer ids of the predicates (features) observed at
 *                the present decision point
 * @param weight  per-predicate weight-label indices, parallel to
 *                {@code context}
 * @param outsums output array indexed by outcome id, receiving the
 *                normalized outcome probabilities; the string form of an
 *                outcome can be recovered via getOutcome(int)
 */
public void eval(int[] context, int [] weight, double[] outsums) {
    // Start every outcome from the uniform log-prior iprob and reset the
    // per-outcome active-feature counters.
    for (int outcomeId = 0; outcomeId < numOutcomes; outcomeId++) {
        outsums[outcomeId] = iprob;
        numfeats[outcomeId] = 0;
    }
    // Add each active predicate's weighted parameter contribution.
    for (int i = 0; i < context.length; i++) {
        TIntParamHashMap predParams = params[context[i]]; // parameters of this predicate
        int[] activeOutcomes = predParams.keys(); // outcomes this predicate was observed with
        for (int j = 0; j < activeOutcomes.length; j++) {
            int outcomeId = activeOutcomes[j];
            numfeats[outcomeId]++; // used later for the correction term
            // constantInverse is 1/C in the GIS update; weightLabels scales the feature.
            outsums[outcomeId] += constantInverse * predParams.get(outcomeId) * weightLabels[weight[i]];
        }
    }
    // Exponentiate, optionally add the slack (correction-feature)
    // contribution, and accumulate the total mass for normalization.
    double total = 0.0;
    for (int outcomeId = 0; outcomeId < numOutcomes; outcomeId++) {
        outsums[outcomeId] = Math.exp(outsums[outcomeId]);
        if (_useSlackParameter) {
            outsums[outcomeId] +=
                ( (1.0 - ( (double) numfeats[outcomeId] / constant)) * correctionParam);
        }
        total += outsums[outcomeId];
    }
    // Normalize so the outcome probabilities sum to one.
    for (int outcomeId = 0; outcomeId < numOutcomes; outcomeId++) {
        outsums[outcomeId] /= total;
    }
}
/* Compute one iteration of GIS and return its log-likelihood. */
private double nextIteration() {
    // Compute the contribution of p(a|b_i) for each feature and accumulate
    // the new correction-feature value. NOTE: TID, PID, WID, OID, CFMOD and
    // modelDistribution are shared fields that the forEachEntry procedures
    // (updateModelExpect, updateParams) read, so they must be set here —
    // do not convert them to locals.
    double loglikelihood = 0.0;
    CFMOD = 0.0;
    int numEvents = 0;
    int numCorrect = 0; // events whose most probable outcome matches the observed outcome
    for (TID = 0; TID < numTokens; TID++) {
        // Refresh modelDistribution with the current model's outcome
        // probabilities for this event; each iteration updates it.
        eval(contexts[TID], weights[TID],modelDistribution);
        for (int j = 0; j < contexts[TID].length; j++) { // each feature of the current event
            PID = contexts[TID][j];
            WID = weights[TID][j];
            modelExpects[PID].forEachEntry(updateModelExpect); // update the model expectation
            if (_useSlackParameter) { // accumulate the correction-feature expectation
                for (OID = 0; OID < numOutcomes; OID++) {
                    // Outcomes this predicate was never observed with
                    // contribute to the correction feature instead.
                    if (!modelExpects[PID].containsKey(OID)) {
                        CFMOD += modelDistribution[OID] * numTimesEventsSeen[TID];
                    }
                }
            }
        }
        if (_useSlackParameter) {
            // Slack mass: C minus the number of active features in this event.
            CFMOD += (constant - contexts[TID].length) * numTimesEventsSeen[TID];
        }
        loglikelihood += Math.log(modelDistribution[outcomes[TID]]) * numTimesEventsSeen[TID];
        numEvents += numTimesEventsSeen[TID];
        // When verbose, count correctly predicted events so training accuracy
        // can be reported alongside the log-likelihood.
        if (printMessages) {
            int max = 0;
            for (OID = 1; OID < numOutcomes; OID++) {
                if (modelDistribution[OID] > modelDistribution[max]) {
                    max = OID;
                }
            }
            if (max == outcomes[TID]) {
                numCorrect += numTimesEventsSeen[TID];
            }
        }
    }
    display(".");
    // Compute the new parameter values from the observed/model expectations.
    for (PID = 0; PID < numPreds; PID++) {
        params[PID].forEachEntry(updateParams);
        modelExpects[PID].transformValues(backToZeros); // re-initialize to 0.0's
    }
    if (CFMOD > 0.0 && _useSlackParameter) {
        correctionParam += (cfObservedExpect - Math.log(CFMOD));
    }
    display(". loglikelihood=" + loglikelihood + "\t" +
        ( (double) numCorrect / numEvents) + "\n");
    return (loglikelihood);
}
/** Write {@code s} to standard output, but only when verbose mode is enabled. */
private void display(String s) {
    if (!printMessages) {
        return;
    }
    System.out.print(s);
}
}