
📄 InfoGain.java

📁 Common machine-learning algorithms, with source code written in Java; includes frequently used classification algorithms and accompanying documentation
💻 JAVA
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/**
 * Information gain of the absence/presence of each feature.
 * Note that we aren't attending to the feature's value, and MALLET doesn't currently
 * have any support at all for categorical features.
 *
 * @author Andrew McCallum <a href="mailto:mccallum@cs.umass.edu">mccallum@cs.umass.edu</a>
 */

package edu.umass.cs.mallet.base.types;

public class InfoGain extends RankedFeatureVector
{
	// xxx This is DISGUSTINGLY non-thread-safe.
	static double staticBaseEntropy;
	static LabelVector staticBaseLabelDistribution;

	// xxx Yuck.  Figure out how to remove this.
	// Not strictly part of a list of feature info gains, but convenient and efficient
	// for ml.classify.DecisionTree
	double baseEntropy;
	LabelVector baseLabelDistribution;

	private static double[] calcInfoGains (InstanceList ilist)
	{
		final double log2 = Math.log(2);
		int numInstances = ilist.size();
		int numClasses = ilist.getTargetAlphabet().size();
		int numFeatures = ilist.getDataAlphabet().size();
		double[] infogains = new double[numFeatures];
		double[][] targetFeatureCount = new double[numClasses][numFeatures];
		double[] featureCountSum = new double[numFeatures];
		double[] targetCount = new double[numClasses];
		double targetCountSum = 0;
		double flv;	// feature location value
		int fli;	// feature location index
		double count;

		// Populate targetFeatureCount, et al
		for (int i = 0; i < ilist.size(); i++) {
			Instance inst = ilist.getInstance(i);
			Labeling labeling = inst.getLabeling ();
			FeatureVector fv = (FeatureVector) inst.getData ();
			double instanceWeight = ilist.getInstanceWeight(i);
			// The code below relies on labelWeights summing to 1 over all labels!
			double labelWeightSum = 0;
			for (int ll = 0; ll < labeling.numLocations(); ll++) {
				int li = labeling.indexAtLocation (ll);
				double labelWeight = labeling.valueAtLocation (ll);
				labelWeightSum += labelWeight;
				if (labelWeight == 0) continue;
				count = labelWeight * instanceWeight;
				for (int fl = 0; fl < fv.numLocations(); fl++) {
					fli = fv.indexAtLocation(fl);
					// xxx Is this right?  What should we do about negative values?
					// Whatever is decided here should also go in DecisionTree.split()
					if (fv.valueAtLocation(fl) > 0) {
						targetFeatureCount[li][fli] += count;
						featureCountSum[fli] += count;
					}
				}
				targetCount[li] += count;
				targetCountSum += count;
			}
			assert (Math.abs (labelWeightSum - 1.0) < 0.0001);
		}

		if (targetCountSum == 0) {
			staticBaseEntropy = 0.0;	// xxx Should this instead be infinite?
			staticBaseLabelDistribution = new LabelVector ((LabelAlphabet)ilist.getTargetAlphabet(), targetCount);
			return infogains;
		}
		assert (targetCountSum > 0) : targetCountSum;

		double p;
		double[] classDistribution = new double[numClasses];

		// Calculate the overall entropy of the labels, ignoring the features
		staticBaseEntropy = 0;
		//System.out.print ("targetCount "); Vector.print (targetCount);
		//System.out.println ("targetCountSum = "+targetCountSum);
		for (int li = 0; li < numClasses; li++) {
			p = targetCount[li]/targetCountSum;
			classDistribution[li] = p;
			assert (p <= 1.0) : p;
			if (p != 0)
				staticBaseEntropy -= p * Math.log(p) / log2;
		}
		staticBaseLabelDistribution = new LabelVector ((LabelAlphabet)ilist.getTargetAlphabet(), classDistribution);
		//System.out.println ("Total class entropy = "+staticBaseEntropy);

		// Calculate the InfoGain of each feature
		for (int fi = 0; fi < numFeatures; fi++) {
			double featurePresentEntropy = 0;
			double norm = featureCountSum[fi];
			if (norm > 0) {
				for (int li = 0; li < numClasses; li++) {
					p = targetFeatureCount[li][fi]/norm;
					assert (p <= 1.00000001) : p;
					if (p != 0)
						featurePresentEntropy -= p * Math.log(p) / log2;
				}
			}
			assert (!Double.isNaN(featurePresentEntropy)) : fi;
			norm = targetCountSum - featureCountSum[fi];
			double featureAbsentEntropy = 0;
			if (norm > 0) {
				for (int li = 0; li < numClasses; li++) {
					p = (targetCount[li]-targetFeatureCount[li][fi])/norm;
					assert (p <= 1.00000001) : p;
					if (p != 0)
						featureAbsentEntropy -= p * Math.log(p) / log2;
				}
			}
			assert (!Double.isNaN(featureAbsentEntropy)) : fi;
			//Alphabet dictionary = ilist.getDataAlphabet();
			//System.out.println ("Feature="+dictionary.lookupSymbol(fi)+" presentWeight="
			//+(featureCountSum[fi]/targetCountSum)+" absentWeight="
			//+((targetCountSum-featureCountSum[fi])/targetCountSum)+" presentEntropy="
			//+featurePresentEntropy+" absentEntropy="+featureAbsentEntropy);
			infogains[fi] = (staticBaseEntropy
											 - (featureCountSum[fi]/targetCountSum) * featurePresentEntropy
											 - ((targetCountSum-featureCountSum[fi])/targetCountSum) * featureAbsentEntropy);
			assert (!Double.isNaN(infogains[fi])) : fi;
		}
		return infogains;
	}

	public InfoGain (InstanceList ilist)
	{
		super (ilist.getDataAlphabet(), calcInfoGains (ilist));
		baseEntropy = staticBaseEntropy;
		baseLabelDistribution = staticBaseLabelDistribution;
	}

	public InfoGain (Alphabet vocab, double[] infogains)
	{
		super (vocab, infogains);
	}

	public double getBaseEntropy ()
	{
		return baseEntropy;
	}

	public LabelVector getBaseLabelDistribution ()
	{
		return baseLabelDistribution;
	}

	public static class Factory implements RankedFeatureVector.Factory
	{
		public Factory ()
		{
		}

		public RankedFeatureVector newRankedFeatureVector (InstanceList ilist)
		{
			return new InfoGain (ilist);
		}
	}
}
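What this class computes: for each feature f, the score is the information gain, in bits, of knowing whether f is present (value > 0) in an instance, with respect to the instance's label Y:

	IG(f) = H(Y) − (n_f / n) · H(Y | f present) − ((n − n_f) / n) · H(Y | f absent)

where n is the total weighted label count (targetCountSum above), n_f is the weighted count of instances in which f occurs with a positive value (featureCountSum[fi]), and H is Shannon entropy with log base 2. As the class comment notes, feature values are ignored beyond the presence test.

The sketch below shows one plausible way to use the class to rank features; it is not part of the original file. It assumes a labeled InstanceList has already been built through MALLET's usual pipe machinery, and that RankedFeatureVector exposes getObjectAtRank, getValueAtRank, and numLocations as in the MALLET release this file comes from; verify those names against your copy of the library.

import edu.umass.cs.mallet.base.types.InfoGain;
import edu.umass.cs.mallet.base.types.InstanceList;

public class InfoGainDemo
{
	// Print the k features with the highest information gain, best first.
	// `ilist` is assumed to be a labeled InstanceList built elsewhere.
	public static void printTopFeatures (InstanceList ilist, int k)
	{
		InfoGain ig = new InfoGain (ilist);	// scores every feature in ilist's data alphabet
		System.out.println ("Base label entropy (bits): " + ig.getBaseEntropy ());
		int n = Math.min (k, ig.numLocations ());
		for (int rank = 0; rank < n; rank++)
			// getObjectAtRank returns the feature's dictionary entry, getValueAtRank
			// its information gain; both are assumed from RankedFeatureVector's ranking API.
			System.out.println (ig.getObjectAtRank (rank) + "\t" + ig.getValueAtRank (rank));
	}
}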
