expgain.java
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
   The "gain" obtained by adding a feature to a conditional exponential model.
   Based on the joint exponential model in Della Pietra, Della Pietra & Lafferty, 1997.
   We smooth using a Gaussian prior.
   Note that we use Math.log(), not log-base-2, so the units are not "bits".

   @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
*/

package edu.umass.cs.mallet.base.types;

import edu.umass.cs.mallet.base.classify.Classification;
import edu.umass.cs.mallet.base.util.MalletLogger;
import java.util.logging.*;
import java.io.*;

public class ExpGain extends RankedFeatureVector
{
    private static Logger logger = MalletLogger.getLogger(ExpGain.class.getName());

    // ExpGain of a feature, f, is defined in terms of MaxEnt-type feature+class "Feature"s, F,
    //   F = f,c
    // ExpGain of a Feature, F, is
    //   G(F) = KL(p~[C]||q[C]) - KL(p~[C]||q_F[C])
    // where p~[] is the empirical distribution, according to the true class label distribution,
    // q[] is the distribution from the (imperfect) classifier,
    // and q_F[] is the distribution from the (imperfect) classifier with F added
    // and F's weight adjusted (but none of the other weights adjusted).
    // ExpGain of a feature, f, is
    //   G(f) = sum_c G(f,c)
    // It would be more accurate to return a gain number for each "feature/class" combination,
    // but here we simply return gain(feature) = \sum_{class} gain(feature,class).

    // xxx Never used.  Remove them.
    boolean usingHyperbolicPrior = false;
    double hyperbolicSlope = 0.2;
    double hyperbolicSharpness = 10.0;

    private static double[] calcExpGains (InstanceList ilist, LabelVector[] classifications,
                                          double gaussianPriorVariance)
    {
        int numInstances = ilist.size();
        int numClasses = ilist.getTargetAlphabet().size();
        int numFeatures = ilist.getDataAlphabet().size();
        assert (ilist.size() > 0);

        // Notation from Della Pietra & Lafferty 1997, p.4
        double[][] p = new double[numClasses][numFeatures];       // "p~"
        double[][] q = new double[numClasses][numFeatures];       // "q"
        double[][] alphas = new double[numClasses][numFeatures];  // "alpha", the weight of the new feature
        int fli;     // feature location index
        double flv;  // feature location value
        logger.info ("Starting klgains, #instances="+numInstances);
        double trueLabelWeightSum = 0;
        double modelLabelWeightSum = 0;

        // Calculate p~[f] and q[f]
        for (int i = 0; i < numInstances; i++) {
            assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
            Instance inst = ilist.getInstance(i);
            Labeling labeling = inst.getLabeling ();
            FeatureVector fv = (FeatureVector) inst.getData ();
            //double instanceWeight = ilist.getInstanceWeight(i);
            // The code below relies on labelWeights summing to 1 over all labels!
            double perInstanceModelLabelWeight = 0;
            for (int li = 0; li < numClasses; li++) {
                double trueLabelWeight = labeling.value (li);
                double modelLabelWeight = classifications[i].value(li);
                trueLabelWeightSum += trueLabelWeight;
                modelLabelWeightSum += modelLabelWeight;
                perInstanceModelLabelWeight += modelLabelWeight;
                //if (i < 500) System.out.println ("i="+i+" li="+li+" true="+trueLabelWeight+" model="+modelLabelWeight);
                if (trueLabelWeight == 0 && modelLabelWeight == 0)
                    continue;
                for (int fl = 0; fl < fv.numLocations(); fl++) {
                    fli = fv.indexAtLocation(fl);
                    assert (fv.valueAtLocation(fl) == 1.0);
                    // xxx Note that we are not attending to instanceWeight here!
                    //p[li][fli] += trueLabelWeight * instanceWeight / (numInstances+1);
                    //q[li][fli] += modelLabelWeight * instanceWeight / (numInstances+1);
                    p[li][fli] += trueLabelWeight;
                    q[li][fli] += modelLabelWeight;
                }
            }
            assert (Math.abs (perInstanceModelLabelWeight - 1.0) < 0.001);
        }
        assert (Math.abs (trueLabelWeightSum/numInstances - 1.0) < 0.001)
            : "trueLabelWeightSum should be 1.0, it was "+trueLabelWeightSum;
        assert (Math.abs (modelLabelWeightSum/numInstances - 1.0) < 0.001)
            : "modelLabelWeightSum should be 1.0, it was "+modelLabelWeightSum;
        /*
        double psum = 0;
        double qsum = 0;
        for (int i = 0; i < numClasses; i++)
            for (int j = 0; j < numFeatures; j++) {
                psum += p[i][j];
                qsum += q[i][j];
            }
        assert (Math.abs(psum - 1.0) < 0.0001) : "psum not 1.0!  psum="+psum+" qsum="+qsum;
        assert (Math.abs(qsum - 1.0) < 0.0001) : "qsum not 1.0!  psum="+psum+" qsum="+qsum;
        */

        // Determine the alphas.
        // We can't do it in closed form as in the Della Pietra paper, because here we have
        // a conditional MaxEnt model.  So we do it by Newton-Raphson...
        // ...initializing by the broken, inappropriate joint-case closed-form solution:
        //for (int i = 0; i < numClasses; i++)
        //  for (int j = 0; j < numFeatures; j++)
        //    alphas[i][j] = Math.log ( (p[i][j]*(1.0-q[i][j])) / (q[i][j]*(1.0-p[i][j])) );
        double[][] dalphas = new double[numClasses][numFeatures];        // first derivative
        double[][] alphaChangeOld = new double[numClasses][numFeatures]; // change in alpha, last iteration
        double[][] alphaMax = new double[numClasses][numFeatures];       // upper bound on alpha
        double[][] alphaMin = new double[numClasses][numFeatures];       // lower bound on alpha
        double[][] ddalphas = new double[numClasses][numFeatures];       // second derivative
        for (int i = 0; i < numClasses; i++)
            for (int j = 0; j < numFeatures; j++) {
                alphaMax[i][j] = Double.POSITIVE_INFINITY;
                alphaMin[i][j] = Double.NEGATIVE_INFINITY;
            }
        double maxAlphachange = 0;
        double maxDalpha = 99;
        int maxNewtonSteps = 50; // xxx Change to more?
        // alphas[][] are initialized to zero
        for (int newton = 0; maxDalpha > 1.0E-8 && newton < maxNewtonSteps; newton++) {
            //System.out.println ("Newton iteration "+newton);
            if (false /*usingHyperbolicPrior*/) {
                for (int i = 0; i < numClasses; i++)
                    for (int j = 0; j < numFeatures; j++) {
                        dalphas[i][j] = p[i][j] - (alphas[i][j] / gaussianPriorVariance);
                        ddalphas[i][j] = -1 / gaussianPriorVariance;
                    }
            } else {
                // Gaussian prior
                for (int i = 0; i < numClasses; i++)
                    for (int j = 0; j < numFeatures; j++) {
                        dalphas[i][j] = p[i][j] - (alphas[i][j] / gaussianPriorVariance);
                        ddalphas[i][j] = -1 / gaussianPriorVariance;
                    }
            }
            for (int i = 0; i < ilist.size(); i++) {
                assert (classifications[i].getLabelAlphabet() == ilist.getTargetAlphabet());
                Instance inst = ilist.getInstance(i);
                Labeling labeling = inst.getLabeling ();
                FeatureVector fv = (FeatureVector) inst.getData ();
                // xxx This assumes binary-valued features.  What about "tied" weights?
                for (int fl = 0; fl < fv.numLocations(); fl++) {
                    fli = fv.indexAtLocation(fl);
                    for (int li = 0; li < numClasses; li++) {
                        double modelLabelWeight = classifications[i].value(li);
                        double expalpha = Math.exp (alphas[li][fli]);
                        double numerator = modelLabelWeight * expalpha;
                        double denominator = numerator + (1.0 - modelLabelWeight);
                        dalphas[li][fli] -= numerator / denominator;
                        ddalphas[li][fli] += ((numerator*numerator) / (denominator*denominator)
                                              - (numerator/denominator));
                    }
                }
            }
            // We now have the first and second derivatives for this Newton step.
            // Run tests on the alphas and their derivatives, and do a Newton step.
            double alphachange, newalpha, oldalpha;
            maxAlphachange = maxDalpha = 0;
            for (int i = 0; i < numClasses; i++)
                for (int j = 0; j < numFeatures; j++) {
                    alphachange = - (dalphas[i][j] / ddalphas[i][j]);
                    if (p[i][j] == 0 && q[i][j] == 0)
                        continue;
                    else if (false && (i*numFeatures+j) % (numClasses*numFeatures/2000) == 0
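For reference, the quantity the comments in ExpGain define, G(F) = KL(p~[C]||q[C]) - KL(p~[C]||q_F[C]) for one candidate Feature F (with G(f) = sum_c G(f,c)), can be computed in isolation once the three per-class distributions are known. The following is a minimal standalone sketch, not part of MALLET; the class and array names (ExpGainSketch, pTilde, q, qF) are hypothetical, and qF is assumed to be the model distribution after adding F with its tuned weight, as calcExpGains does. As in the header comment, Math.log() gives the gain in nats rather than bits.

// Illustrative sketch only: G(F) = KL(p~||q) - KL(p~||q_F) over the class variable.
// Names are hypothetical and not MALLET's.
public final class ExpGainSketch {
    // Natural-log KL term for one class; zero empirical mass contributes nothing.
    private static double klTerm(double pTrue, double pModel) {
        return (pTrue == 0.0) ? 0.0 : pTrue * Math.log(pTrue / pModel);
    }

    // Gain of adding Feature F: KL(p~||q) - KL(p~||qF), summed over classes.
    public static double gain(double[] pTilde, double[] q, double[] qF) {
        double g = 0.0;
        for (int c = 0; c < pTilde.length; c++)
            g += klTerm(pTilde[c], q[c]) - klTerm(pTilde[c], qF[c]);
        return g;
    }

    public static void main(String[] args) {
        double[] pTilde = {0.7, 0.3};    // empirical class distribution
        double[] q      = {0.5, 0.5};    // current model
        double[] qF     = {0.65, 0.35};  // model after adding F with its tuned weight
        System.out.println("G(F) = " + gain(pTilde, q, qF));  // positive: adding F helps
    }
}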
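The Newton-Raphson loop in calcExpGains adjusts only the weight alpha of each candidate Feature F = (f,c), holding every other classifier weight fixed. Below is a self-contained sketch of one such update for a single (class, feature) pair; it mirrors the derivative terms accumulated in that loop (empirical count, Gaussian-prior term, and the expected firing rate of F under the adjusted model), but the method and variable names are illustrative assumptions, not MALLET's.

// Illustrative sketch of one Newton step for a candidate Feature's weight alpha.
// pEmpirical: summed true-label weight of class c over instances containing f ("p~" above).
// qModel[i]:  the current classifier's probability of class c on the i-th instance containing f.
public final class AlphaNewtonStep {
    public static double newtonStep(double alpha, double pEmpirical,
                                    double[] qModel, double gaussianPriorVariance) {
        double dalpha = pEmpirical - alpha / gaussianPriorVariance;  // first derivative (prior term)
        double ddalpha = -1.0 / gaussianPriorVariance;               // second derivative (prior term)
        double expalpha = Math.exp(alpha);
        for (double qic : qModel) {
            double numerator = qic * expalpha;
            double denominator = numerator + (1.0 - qic);
            dalpha -= numerator / denominator;
            ddalpha += (numerator * numerator) / (denominator * denominator)
                     - numerator / denominator;
        }
        return alpha - dalpha / ddalpha;  // same update as alphachange = -dalphas/ddalphas
    }

    public static void main(String[] args) {
        double alpha = 0.0;                  // alphas start at zero, as in calcExpGains
        double[] qModel = {0.4, 0.55, 0.3};  // model probabilities on three instances containing f
        double pEmpirical = 2.0;             // e.g., f co-occurs with class c in two of them
        for (int step = 0; step < 10; step++)
            alpha = newtonStep(alpha, pEmpirical, qModel, 1.0);
        System.out.println("converged alpha = " + alpha);
    }
}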