📄 mcmaxenttrainer.java
      this.theClassifier = initialClassifier;
      this.parameters = theClassifier.parameters;
      this.featureSelection = theClassifier.featureSelection;
      this.perLabelFeatureSelection = theClassifier.perClassFeatureSelection;
      this.defaultFeatureIndex = theClassifier.defaultFeatureIndex;
      assert (initialClassifier.getInstancePipe() == ilist.getPipe());
    } else if (this.theClassifier == null) {
      this.theClassifier = new MCMaxEnt (ilist.getPipe(), parameters, featureSelection, perLabelFeatureSelection);
    }
    cachedValueStale = true;
    cachedGradientStale = true;

    // Initialize the constraints
    InstanceList.Iterator iter = trainingList.iterator ();
    logger.fine("Number of instances in training list = " + trainingList.size());
    while (iter.hasNext()) {
      double instanceWeight = iter.getInstanceWeight();
      Instance inst = iter.nextInstance();
      Labeling labeling = inst.getLabeling ();
      //logger.fine ("Instance "+ii+" labeling="+labeling);
      FeatureVector fv = (FeatureVector) inst.getData ();
      Alphabet fdict = fv.getAlphabet();
      assert (fv.getAlphabet() == fd);
      int li = labeling.getBestIndex();
      // The "2*" below is because there is one copy for the p(y|x) and another for the p(x|y).
      MatrixOps.rowPlusEquals (constraints, numFeatures, li, fv, 2*instanceWeight);
      // For the default feature, whose weight is 1.0
      assert (!Double.isNaN(instanceWeight)) : "instanceWeight is NaN";
      assert (!Double.isNaN(li)) : "bestIndex is NaN";
      boolean hasNaN = false;
      for (int i = 0; i < fv.numLocations(); i++) {
        if (Double.isNaN(fv.valueAtLocation(i))) {
          logger.info("NaN for feature " + fdict.lookupObject(fv.indexAtLocation(i)).toString());
          hasNaN = true;
        }
      }
      if (hasNaN)
        logger.info("NaN in instance: " + inst.getName());
      // Only p(y|x) uses the default feature; p(x|y) doesn't use it.  The default feature value is 1.0.
      constraints[li*numFeatures + defaultFeatureIndex] += instanceWeight;
    }
    //TestMaximizable.testValueAndGradientCurrentParameters (this);
  }

  public MCMaxEnt getClassifier () {
    return theClassifier;
  }

  public double getParameter (int index) {
    return parameters[index];
  }

  public void setParameter (int index, double v) {
    cachedValueStale = true;
    cachedGradientStale = true;
    parameters[index] = v;
  }

  public int getNumParameters() {
    return parameters.length;
  }

  public void getParameters (double[] buff) {
    if (buff == null || buff.length != parameters.length)
      buff = new double[parameters.length];
    System.arraycopy (parameters, 0, buff, 0, parameters.length);
  }

  public void setParameters (double[] buff) {
    assert (buff != null);
    cachedValueStale = true;
    cachedGradientStale = true;
    if (buff.length != parameters.length)
      parameters = new double[buff.length];
    System.arraycopy (buff, 0, parameters, 0, buff.length);
  }

  // log probability of the training labels
  public double getValue () {
    if (cachedValueStale) {
      numGetValueCalls++;
      cachedValue = 0;
      // We'll store the expectation values in "cachedGradient" for now
      cachedGradientStale = true;
      java.util.Arrays.fill (cachedGradient, 0.0);
      // Incorporate likelihood of data
      double[] scores = new double[trainingList.getTargetAlphabet().size()];
      double value = 0.0;
      //System.out.println("I Now "+inputAlphabet.size()+" regular features.");
      InstanceList.Iterator iter = trainingList.iterator();
      //int ii = 0;

      // Normalize the parameters to be per-class multinomials
      double probs[][] = new double[scores.length][numFeatures];
      double lprobs[][] = new double[scores.length][numFeatures];

      for (int si = 0; si < scores.length; si++) {
        double sum = 0, max = MatrixOps.max (parameters);
        for (int fi = 0; fi < numFeatures; fi++) {
          // TODO Strongly consider some smoothing here.  What happens when all parameters are zero?
          // Oh, this should be no problem, because exp(0) == 1.
          probs[si][fi] = Math.exp(parameters[si*numFeatures+fi] - max);
          sum += probs[si][fi];
        }
        assert (sum > 0);
        for (int fi = 0; fi < numFeatures; fi++) {
          probs[si][fi] /= sum;
          lprobs[si][fi] = Math.log(probs[si][fi]);
        }
      }

      while (iter.hasNext()) {
        double instanceWeight = iter.getInstanceWeight();
        Instance instance = iter.nextInstance();
        Labeling labeling = instance.getLabeling ();
        //System.out.println("L Now "+inputAlphabet.size()+" regular features.");
        this.theClassifier.getClassificationScores (instance, scores);
        FeatureVector fv = (FeatureVector) instance.getData ();
        int li = labeling.getBestIndex();
        value = - (instanceWeight * Math.log (scores[li]));
        if (Double.isNaN(value)) {
          logger.fine ("MCMaxEntTrainer: Instance " + instance.getName()
                       + " has NaN value. log(scores)= " + Math.log(scores[li])
                       + " scores = " + scores[li]
                       + " has instance weight = " + instanceWeight);
        }
        if (Double.isInfinite(value)) {
          logger.warning ("Instance " + instance.getSource() + " has infinite value; skipping value and gradient");
          cachedValue -= value;
          cachedValueStale = false;
          return -value;   // continue;
        }
        cachedValue += value;
        // CPAL - this is a loop over classes and their scores
        //      - we compute the gradient by taking the dot product of the feature value
        //        and the probability of the class
        for (int si = 0; si < scores.length; si++) {
          if (scores[si] == 0) continue;
          assert (!Double.isInfinite(scores[si]));
          // CPAL - accumulating the current classifier's expectation of the feature
          //        vector counts for this class label
          // Current classifier has expectation over class label, not over feature vector
          MatrixOps.rowPlusEquals (cachedGradient, numFeatures, si, fv, -instanceWeight * scores[si]);
          cachedGradient[numFeatures*si + defaultFeatureIndex] += (-instanceWeight * scores[si]);
        }
        // CPAL - if we wish to do multiconditional training we need another term for this
        //        accumulated expectation
        if (usingMultiConditionalTraining) {
          // need something analogous to this
          // this.theClassifier.getClassificationScores (instance, scores);
          // this.theClassifier.getFeatureDistributions (instance,

          // Note: li is the "label" for this instance
          // Get the sum of the feature vector,
          // which is the number of counts for the document if we use that as input
          double Ncounts = MatrixOps.sum(fv);

          // CPAL - get the additional term for the value of our -log probability
          //      - this computation amounts to the dot product of the feature vector and the probability vector
          cachedValue -= (instanceWeight * fv.dotProduct(lprobs[li]));

          // CPAL - get the model expectation over features for the given class
          for (int fi = 0; fi < numFeatures; fi++) {
            //if (parameters[numFeatures*li + fi] != 0) {
            //  MatrixOps.rowPlusEquals(cachedGradient, numFeatures,li,fv,))
            cachedGradient[numFeatures*li + fi] += (-instanceWeight * Ncounts * probs[li][fi]);
            //}
          }
        }
      }
      //logger.info ("-Expectations:"); cachedGradient.print();

      // Incorporate prior on parameters
      if (usingHyperbolicPrior) {
        for (int li = 0; li < numLabels; li++)
          for (int fi = 0; fi < numFeatures; fi++)
            cachedValue += (hyperbolicPriorSlope / hyperbolicPriorSharpness
                            * Math.log (Maths.cosh (hyperbolicPriorSharpness * parameters[li*numFeatures + fi])));
      } else {
        for (int li = 0; li < numLabels; li++)
          for (int fi = 0; fi < numFeatures; fi++) {
            double param = parameters[li*numFeatures + fi];
            cachedValue += param * param / (2 * gaussianPriorVariance);
          }
      }
      cachedValue *= -1.0; // MAXIMIZE, NOT MINIMIZE
      cachedValueStale = false;
      progressLogger.info ("Value (loglikelihood) = " + cachedValue);
    }
    return cachedValue;
  }

  // CPAL first get value, then gradient
  public void getValueGradient (double[] buffer) {
    // Gradient is (constraint - expectation - parameters/gaussianPriorVariance)
    if (cachedGradientStale) {
      numGetValueGradientCalls++;
      if (cachedValueStale)
        // This will fill in the cachedGradient with the "-expectation"
        getValue ();
      // cachedGradient contains the negative expectations;
      // expectations are model expectations and constraints are
      // empirical expectations
      MatrixOps.plusEquals (cachedGradient, constraints);
      // CPAL - we need a second copy of the constraints
      //      - actually, we only want this for the feature values
      //      - I've moved this up into getValue
      //if (usingMultiConditionalTraining){
      //  MatrixOps.plusEquals(cachedGradient, constraints);
      //}

      // Incorporate prior on parameters
      if (usingHyperbolicPrior) {
        throw new UnsupportedOperationException ("Hyperbolic prior not yet implemented.");
      } else {
        MatrixOps.plusEquals (cachedGradient, parameters, -1.0 / gaussianPriorVariance);
      }

      // A parameter may be set to -infinity by an external user.
      // We set gradient to 0 because the parameter's value can
      // never change anyway and it will mess up future calculations
      // on the matrix, such as norm().
      MatrixOps.substitute (cachedGradient, Double.NEGATIVE_INFINITY, 0.0);

      // Set to zero all the gradient dimensions that are not among the selected features
      if (perLabelFeatureSelection == null) {
        for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
          MatrixOps.rowSetAll (cachedGradient, numFeatures, labelIndex, 0.0, featureSelection, false);
      } else {
        for (int labelIndex = 0; labelIndex < numLabels; labelIndex++)
          MatrixOps.rowSetAll (cachedGradient, numFeatures, labelIndex, 0.0,
                               perLabelFeatureSelection[labelIndex], false);
      }
      cachedGradientStale = false;
    }
    assert (buffer != null && buffer.length == parameters.length);
    System.arraycopy (cachedGradient, 0, buffer, 0, cachedGradient.length);
  }

  public double sumNegLogProb (double a, double b) {
    if (a == Double.POSITIVE_INFINITY && b == Double.POSITIVE_INFINITY)
      return Double.POSITIVE_INFINITY;
    else if (a > b)
      return b - Math.log (1 + Math.exp(b-a));
    else
      return a - Math.log (1 + Math.exp(a-b));
  }
}
}
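A note on the last helper: sumNegLogProb combines two negative log-probabilities in a numerically stable way. Given a = -log p and b = -log q, it returns -log(p + q), computed as min(a, b) - log(1 + exp(-|a - b|)) so that nothing underflows even when p and q are tiny.

For context, here is a minimal usage sketch of the trainer that owns this Maximizable. It is written against the MALLET 2.x package layout (cc.mallet.*); older builds of this file live under edu.umass.cs.mallet.base.* instead, and the input path "training.instances" and the example class name are hypothetical.

import java.io.File;

import cc.mallet.classify.MCMaxEnt;
import cc.mallet.classify.MCMaxEntTrainer;
import cc.mallet.classify.Trial;
import cc.mallet.types.InstanceList;

public class TrainMCMaxEntExample {
  public static void main (String[] args) throws Exception {
    // Load a previously built, serialized InstanceList (hypothetical path).
    InstanceList training = InstanceList.load (new File ("training.instances"));

    // The trainer wraps the value/gradient code above in a Maximizable and hands it
    // to a numerical optimizer, which calls getValue() and getValueGradient()
    // repeatedly until convergence.
    MCMaxEntTrainer trainer = new MCMaxEntTrainer ();
    MCMaxEnt classifier = trainer.train (training);

    // Evaluate on the training data itself, just to show the call pattern.
    Trial trial = new Trial (classifier, training);
    System.out.println ("Training accuracy = " + trial.getAccuracy ());
  }
}

Because the optimizer queries the objective and gradient many times per iteration, the class caches both and only recomputes them after setParameter/setParameters flips the cachedValueStale and cachedGradientStale flags.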