📄 JRip.java
    // If there are already antecedents existing
    for(int j=0; j < m_Antds.size(); j++){
      Antd antdj = (Antd)m_Antds.elementAt(j);
      if(!antdj.getAttr().isNumeric()){
        used[antdj.getAttr().index()] = true;
        numUnused--;
      }
    }

    double maxInfoGain;
    while (Utils.gr(growData.numInstances(), 0.0) &&
           (numUnused > 0) &&
           Utils.sm(defAcRt, 1.0)){

      // We require that infoGain be positive
      /*if(numAntds == originalSize)
        maxInfoGain = 0.0; // At least one condition allowed
      else
        maxInfoGain = Utils.eq(defAcRt, 1.0) ?
          defAccu/(double)numAntds : 0.0; */
      maxInfoGain = 0.0;

      /* Build a list of antecedents */
      Antd oneAntd = null;
      Instances coverData = null;
      Enumeration enumAttr = growData.enumerateAttributes();
      int index = -1;

      /* Build one condition based on all attributes not used yet */
      while (enumAttr.hasMoreElements()){
        Attribute att = (Attribute)(enumAttr.nextElement());
        index++;

        if(m_Debug)
          System.err.println("\nOne condition: size = "
                             + growData.sumOfWeights());

        Antd antd = null;
        if(att.isNumeric())
          antd = new NumericAntd(att);
        else
          antd = new NominalAntd(att);

        if(!used[index]){
          /* Compute the best information gain for each attribute,
             it's stored in the antecedent formed by this attribute.
             This procedure returns the data covered by the antecedent */
          Instances coveredData = computeInfoGain(growData, defAcRt, antd);
          if(coveredData != null){
            double infoGain = antd.getMaxInfoGain();
            if(m_Debug)
              System.err.println("Test of '" + antd.toString()
                                 + "': infoGain = " + infoGain
                                 + " | Accuracy = " + antd.getAccuRate()
                                 + "=" + antd.getAccu() + "/" + antd.getCover()
                                 + " def. accuracy: " + defAcRt);

            if(infoGain > maxInfoGain){
              oneAntd = antd;
              coverData = coveredData;
              maxInfoGain = infoGain;
            }
          }
        }
      }

      if(oneAntd == null) break;                      // Cannot find antds
      if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break; // Too low coverage

      // Numeric attributes can be used more than once
      if(!oneAntd.getAttr().isNumeric()){
        used[oneAntd.getAttr().index()] = true;
        numUnused--;
      }

      m_Antds.addElement(oneAntd);
      growData = coverData; // Grow data size is shrinking
      defAcRt = oneAntd.getAccuRate();
    }
  }

  /**
   * Compute the best information gain for the specified antecedent
   *
   * @param instances the data based on which the infoGain is computed
   * @param defAcRt the default accuracy rate of the data
   * @param antd the specific antecedent
   * @return the data covered by the antecedent
   */
  private Instances computeInfoGain(Instances instances, double defAcRt,
                                    Antd antd){
    Instances data = instances;

    /* Split the data into bags.  The information gain of each bag
       is also calculated in this procedure */
    Instances[] splitData = antd.splitData(data, defAcRt, m_Consequent);

    /* Get the bag of data to be used for the next antecedents */
    if(splitData != null)
      return splitData[(int)antd.getAttrValue()];
    else
      return null;
  }
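  // For reference: the NumericAntd/NominalAntd implementations rank
  // candidate conditions by a FOIL-style information gain.  A minimal
  // sketch of that criterion, assuming p and n are the positive/negative
  // weights the candidate covers and defAcRt is the accuracy rate before
  // adding it (hypothetical helper for illustration only, not part of
  // Weka's Antd API):
  private static double foilInfoGain(double p, double n, double defAcRt){
    if(p <= 0) return 0.0;                        // No positives covered, no gain
    double accuRate = (p + 1.0) / (p + n + 1.0);  // Smoothed accuracy of the candidate
    // Gain: positives covered times the improvement in log-accuracy
    return p * (Utils.log2(accuRate) - Utils.log2(defAcRt));
  }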
  /**
   * Prune all the possible final sequences of the rule using the
   * pruning data.  The measure used to prune the rule is based on
   * the flag given.
   *
   * @param pruneData the pruning data used to prune the rule
   * @param useWhole flag to indicate whether to use the error rate of
   * the whole pruning data instead of the data covered
   */
  public void prune(Instances pruneData, boolean useWhole){
    Instances data = pruneData;

    double total = data.sumOfWeights();
    if(!Utils.gr(total, 0.0)) return;

    /* The default accurate # and rate on the pruning data */
    double defAccu = computeDefAccu(data);

    if(m_Debug)
      System.err.println("Pruning with " + defAccu
                         + " positive data out of " + total + " instances");

    int size = m_Antds.size();
    if(size == 0) return; // Default rule before pruning

    double[] worthRt = new double[size];
    double[] coverage = new double[size];
    double[] worthValue = new double[size];
    for(int w=0; w<size; w++){
      worthRt[w] = coverage[w] = worthValue[w] = 0.0;
    }

    /* Calculate the accuracy parameters for all the antecedents in
       this rule */
    double tn = 0.0; // True negatives if useWhole
    for(int x=0; x<size; x++){
      Antd antd = (Antd)m_Antds.elementAt(x);
      Instances newData = data;
      data = new Instances(newData, 0); // Make data empty

      for(int y=0; y<newData.numInstances(); y++){
        Instance ins = newData.instance(y);

        if(antd.covers(ins)){ // Covered by this antecedent
          coverage[x] += ins.weight();
          data.add(ins); // Add to data for further pruning
          if((int)ins.classValue() == (int)m_Consequent) // Accurate prediction
            worthValue[x] += ins.weight();
        }
        else if(useWhole){ // Not covered
          if((int)ins.classValue() != (int)m_Consequent)
            tn += ins.weight();
        }
      }

      if(useWhole){
        worthValue[x] += tn;
        worthRt[x] = worthValue[x] / total;
      }
      else // Note: if the coverage is 0, the accuracy is 0.5
        worthRt[x] = (worthValue[x] + 1.0) / (coverage[x] + 2.0);
    }

    double maxValue = (defAccu + 1.0) / (total + 2.0);
    int maxIndex = -1;
    for(int i=0; i<worthValue.length; i++){
      if(m_Debug){
        double denom = useWhole ? total : coverage[i];
        System.err.println(i + "(useAccuracy? " + !useWhole + "): "
                           + worthRt[i] + "=" + worthValue[i] + "/" + denom);
      }
      if(worthRt[i] > maxValue){ // On ties, prefer the shorter rule
        maxValue = worthRt[i];
        maxIndex = i;
      }
    }

    /* Prune the antecedents according to the accuracy parameters */
    for(int z=size-1; z>maxIndex; z--)
      m_Antds.removeElementAt(z);
  }

  /**
   * Prints this rule
   *
   * @param classAttr the class attribute in the data
   * @return a textual description of this rule
   */
  public String toString(Attribute classAttr) {
    StringBuffer text = new StringBuffer();
    if(m_Antds.size() > 0){
      for(int j=0; j < (m_Antds.size()-1); j++)
        text.append("(" + ((Antd)(m_Antds.elementAt(j))).toString()
                    + ") and ");
      text.append("(" + ((Antd)(m_Antds.lastElement())).toString() + ")");
    }
    text.append(" => " + classAttr.name()
                + "=" + classAttr.value((int)m_Consequent));

    return text.toString();
  }
}
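// For reference: prune() above keeps the antecedent prefix with the highest
// "worth".  A minimal sketch of the two worth measures it computes, assuming
// tp/tn are true-positive/true-negative weights, cover is the covered weight
// and total the total pruning weight (hypothetical helper for illustration
// only, not a Weka method):
private static double pruneWorth(double tp, double tn, double cover,
                                 double total, boolean useWhole){
  if(useWhole)
    return (tp + tn) / total;           // Accuracy over the whole pruning set
  else
    return (tp + 1.0) / (cover + 2.0);  // Laplace-smoothed precision on covered data
}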
/**
 * Builds Ripper in the order of class frequencies.  For each class
 * it's built in two stages: building and optimization
 *
 * @param instances the training data
 * @exception Exception if the classifier can't be built successfully
 */
public void buildClassifier(Instances instances) throws Exception {
  if(instances.numInstances() == 0)
    throw new Exception(" No instances with a class value!");

  if(instances.checkForStringAttributes())
    throw new UnsupportedAttributeTypeException(" Cannot handle string attributes!");

  if(!instances.classAttribute().isNominal())
    throw new UnsupportedClassTypeException(" Only nominal class, please.");

  m_Random = new Random(m_Seed);
  m_Total = RuleStats.numAllConditions(instances);
  if(m_Debug)
    System.err.println("Number of all possible conditions = " + m_Total);

  Instances data = null;
  m_Filter = new ClassOrder();

  // Something to make the class order different each time in
  // cross-validations
  Instance inst = instances.instance(
      (int)(m_Random.nextDouble() * (double)instances.numInstances()));
  ((ClassOrder)m_Filter).setSeed((long)inst.toString().hashCode());
  ((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND);
  m_Filter.setInputFormat(instances);
  data = Filter.useFilter(instances, m_Filter);

  if(data == null)
    throw new Exception(" Unable to randomize the class orders.");

  data.deleteWithMissingClass();
  if(data.numInstances() == 0)
    throw new Exception(" No instances with a class value!");

  if(data.numInstances() < m_Folds)
    throw new Exception(" Not enough data for REP.");

  m_Class = data.classAttribute();
  m_Ruleset = new FastVector();
  m_RulesetStats = new FastVector();
  m_Distributions = new FastVector();

  // Sort by class frequency
  double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts();
  if(m_Debug){
    System.err.println("Sorted classes:");
    for(int x=0; x < m_Class.numValues(); x++)
      System.err.println(x + ": " + m_Class.value(x) + " has "
                         + orderedClasses[x] + " instances.");
  }

  // Iterate from the least prevalent class to the most frequent one
oneClass:
  for(int y=0; y < data.numClasses()-1; y++){ // For each class

    double classIndex = (double)y;
    if(m_Debug){
      int ci = (int)classIndex;
      System.err.println("\n\nClass " + m_Class.value(ci) + "(" + ci + "): "
                         + orderedClasses[y] + " instances\n"
                         + "=====================================\n");
    }

    if(Utils.eq(orderedClasses[y], 0.0)) // No data for this class
      continue oneClass;

    // The expected FP/err is the proportion of the class
    double all = 0;
    for(int i=y; i < orderedClasses.length; i++)
      all += orderedClasses[i];
    double expFPRate = orderedClasses[y] / all;

    double classYWeights = 0, totalWeights = 0;
    for(int j=0; j < data.numInstances(); j++){
      Instance datum = data.instance(j);
      totalWeights += datum.weight();
      if((int)datum.classValue() == y){
        classYWeights += datum.weight();
      }
    }

    // DL of the default rule: no theory DL, only data DL
    double defDL;
    if(classYWeights > 0)
      defDL = RuleStats.dataDL(expFPRate, 0.0, totalWeights,
                               0.0, classYWeights);
    else
      continue oneClass; // Subsumed by previous rules

    if(Double.isNaN(defDL) || Double.isInfinite(defDL))
      throw new Exception("Should never happen: defDL NaN or infinite!");
    if(m_Debug)
      System.err.println("The default DL = " + defDL);

    data = rulesetForOneClass(expFPRate, data, classIndex, defDL);
  }

  // Set the default rule
  RipperRule defRule = new RipperRule();
  defRule.setConsequent((double)(data.numClasses()-1));
  m_Ruleset.addElement(defRule);

  RuleStats defRuleStat = new RuleStats();
  defRuleStat.setData(data);
  defRuleStat.setNumAllConds(m_Total);
  defRuleStat.addAndUpdate(defRule);
  m_RulesetStats.addElement(defRuleStat);
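  // The loop below normalizes each rule's class distribution and maps it
  // from the frequency-sorted class order back to the original class
  // indices, so that distributionForInstance() reports probabilities
  // against the original class values.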
  for(int z=0; z < m_RulesetStats.size(); z++){
    RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z);
    for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){
      double[] classDist = oneClass.getDistributions(xyz);
      if(classDist != null){ // Null check must precede normalization
        Utils.normalize(classDist);
        m_Distributions.addElement(
            ((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist));
      }
    }
  }
}

/**
 * Classify the test instance with the rule learner and provide
 * the class distributions
 *
 * @param datum the instance to be classified
 * @return the distribution
 */
public double[] distributionForInstance(Instance datum){
  try{
    for(int i=0; i < m_Ruleset.size(); i++){
      RipperRule rule = (RipperRule)m_Ruleset.elementAt(i);
      if(rule.covers(datum))
        return (double[])m_Distributions.elementAt(i);
    }
  }catch(Exception e){
    System.err.println(e.getMessage());
    e.printStackTrace();
  }

  System.err.println("Should never happen!");
  return new double[datum.classAttribute().numValues()];
}

/**
 * Build a ruleset for the given class according to the given data
 *
 * @param expFPRate the expected FP/(FP+FN) used in DL calculation
 * @param data the given data
 * @param classIndex the given class index
 * @param defDL the default DL in the data
 * @return the data not covered by the ruleset built
 * @exception Exception if the ruleset can't be built properly
 */
protected Instances rulesetForOneClass(double expFPRate,
                                       Instances data,
                                       double classIndex,
                                       double defDL)
  throws Exception {

  Instances newData = data, growData, pruneData;
  boolean stop = false;
  FastVector ruleset = new FastVector();

  double dl = defDL, minDL = defDL;
  RuleStats rstats = null;
  double[] rst;

  // Check whether the data have positive examples
  boolean defHasPositive = true; // No longer used
  boolean hasPositive = defHasPositive;

  /********************** Building stage ***********************/
  if(m_Debug)
    System.err.println("\n*** Building stage ***");

  while((!stop) && hasPositive){ // Generate new rules until the
                                 // stopping criteria are met
    RipperRule oneRule;
    if(m_UsePruning){
      /* Split the data into Grow and Prune */

      // We should have stratified the data, but ripper seems
      // to have a bug that makes it not do so.  In order
      // to simulate it more precisely, we do the same thing.
      //newData.randomize(m_Random);
      newData = RuleStats.stratify(newData, m_Folds, m_Random);
      Instances[] part = RuleStats.partition(newData, m_Folds);
      growData = part[0];
      pruneData = part[1];
      //growData = newData.trainCV(m_Folds, m_Folds-1);
      //pruneData = newData.testCV(m_Folds, m_Folds-1);
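      // Note: RuleStats.partition() above gives roughly (m_Folds-1)/m_Folds
      // of the data to growing and the remaining 1/m_Folds fold to pruning.
      //
      // A minimal usage sketch of the finished classifier, assuming "train"
      // is a loaded Instances object with its class index set (illustration
      // only):
      //
      //   JRip ripper = new JRip();
      //   ripper.buildClassifier(train);   // building + optimization stages
      //   double[] dist = ripper.distributionForInstance(train.instance(0));
      //   System.out.println(ripper);      // prints the learned ruleset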