📄 conjunctiverule.java
字号:
if(m_ClassAttribute.isNominal()){ fstEntp = entropy(fst, fstCover); sndEntp = entropy(snd, sndCover); } else{ fstEntp = wtMeanSqErr(fstWtSq, fstWtVl, fstCover)/fstCover; sndEntp = wtMeanSqErr(sndWtSq, sndWtVl, sndCover)/sndCover; } /* Which bag has higher information gain? */ boolean isFirst; double fstInfoGain, sndInfoGain; double info, infoGain, fstInfo, sndInfo; if(m_ClassAttribute.isNominal()){ double sum = data.sumOfWeights(); double otherCover, whole = sum + Utils.sum(uncover), otherEntropy; double[] other = null; // InfoGain of first bag other = new double[m_NumClasses]; for(int z=0; z < m_NumClasses; z++) other[z] = uncover[z] + snd[z] + missing[z]; otherCover = whole - fstCover; otherEntropy = entropy(other, otherCover); // Weighted average fstInfo = (fstEntp*fstCover + otherEntropy*otherCover)/whole; fstInfoGain = defInfo - fstInfo; // InfoGain of second bag other = new double[m_NumClasses]; for(int z=0; z < m_NumClasses; z++) other[z] = uncover[z] + fst[z] + missing[z]; otherCover = whole - sndCover; otherEntropy = entropy(other, otherCover); // Weighted average sndInfo = (sndEntp*sndCover + otherEntropy*otherCover)/whole; sndInfoGain = defInfo - sndInfo; } else{ double sum = data.sumOfWeights(); double otherWtSq = (sndWtSq + msingWtSq + uncoverWtSq), otherWtVl = (sndWtVl + msingWtVl + uncoverWtVl), otherCover = (sum - fstCover + uncoverSum); fstInfo = Utils.eq(fstCover, 0) ? 0 : (fstEntp * fstCover); fstInfo += wtMeanSqErr(otherWtSq, otherWtVl, otherCover); fstInfoGain = defInfo - fstInfo; otherWtSq = (fstWtSq + msingWtSq + uncoverWtSq); otherWtVl = (fstWtVl + msingWtVl + uncoverWtVl); otherCover = sum - sndCover + uncoverSum; sndInfo = Utils.eq(sndCover, 0) ? 0 : (sndEntp * sndCover); sndInfo += wtMeanSqErr(otherWtSq, otherWtVl, otherCover); sndInfoGain = defInfo - sndInfo; } if(Utils.gr(fstInfoGain,sndInfoGain) || (Utils.eq(fstInfoGain,sndInfoGain)&&(Utils.sm(fstEntp,sndEntp)))){ isFirst = true; infoGain = fstInfoGain; info = fstInfo; } else{ isFirst = false; infoGain = sndInfoGain; info = sndInfo; } boolean isUpdate = Utils.gr(infoGain, maxInfoGain); /* Check whether so far the max infoGain */ if(isUpdate){ splitPoint = ((data.instance(split).value(att)) + (data.instance(prev).value(att)))/2.0; value = ((isFirst) ? 0 : 1); inform = info; maxInfoGain = infoGain; finalSplit = split; } prev=split; } } /* Split the data */ Instances[] splitData = new Instances[3]; splitData[0] = new Instances(data, 0, finalSplit); splitData[1] = new Instances(data, finalSplit, total-finalSplit); splitData[2] = new Instances(missingData); return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is covered * by this antecedent */ public boolean isCover(Instance inst){ boolean isCover=false; if(!inst.isMissing(att)){ if(Utils.eq(value, 0)){ if(Utils.smOrEq(inst.value(att), splitPoint)) isCover=true; } else if(Utils.gr(inst.value(att), splitPoint)) isCover=true; } return isCover; } /** * Prints this antecedent * * @return a textual description of this antecedent */ public String toString() { String symbol = Utils.eq(value, 0.0) ? " <= " : " > "; return (att.name() + symbol + Utils.doubleToString(splitPoint, 6)); } } /** * The antecedent with nominal attribute */ class NominalAntd extends Antd { /** for serialization */ static final long serialVersionUID = -5949864163376447424L; /* The parameters of infoGain calculated for each attribute value */ private double[][] stats; private double[] coverage; private boolean isIn; /** * Constructor for nominal class */ public NominalAntd(Attribute a, double[] unc){ super(a, unc); int bag = att.numValues(); stats = new double[bag][m_NumClasses]; coverage = new double[bag]; isIn = true; } /** * Constructor for numeric class */ public NominalAntd(Attribute a, double sq, double vl, double wts){ super(a, sq, vl, wts); int bag = att.numValues(); stats = null; coverage = new double[bag]; isIn = true; } /** * Implements the splitData function. * This procedure is to split the data into bags according * to the nominal attribute value * the data with missing values are stored in the last bag. * The infoGain for each bag is also calculated. * * @param data the data to be split * @param defInfo the default information for data * @return the array of data after split */ public Instances[] splitData(Instances data, double defInfo){ int bag = att.numValues(); Instances[] splitData = new Instances[bag+1]; double[] wSq = new double[bag]; double[] wVl = new double[bag]; double totalWS=0, totalWV=0, msingWS=0, msingWV=0, sum=data.sumOfWeights(); double[] all = new double[m_NumClasses]; double[] missing = new double[m_NumClasses]; for(int w=0; w < m_NumClasses; w++) all[w] = missing[w] = 0; for(int x=0; x<bag; x++){ coverage[x] = wSq[x] = wVl[x] = 0; if(stats != null) for(int y=0; y < m_NumClasses; y++) stats[x][y] = 0; splitData[x] = new Instances(data, data.numInstances()); } splitData[bag] = new Instances(data, data.numInstances()); // Record the statistics of data for(int x=0; x<data.numInstances(); x++){ Instance inst=data.instance(x); if(!inst.isMissing(att)){ int v = (int)inst.value(att); splitData[v].add(inst); coverage[v] += inst.weight(); if(m_ClassAttribute.isNominal()){ // Nominal class stats[v][(int)inst.classValue()] += inst.weight(); all[(int)inst.classValue()] += inst.weight(); } else{ // Numeric class wSq[v] += inst.weight() * inst.classValue() * inst.classValue(); wVl[v] += inst.weight() * inst.classValue(); totalWS += inst.weight() * inst.classValue() * inst.classValue(); totalWV += inst.weight() * inst.classValue(); } } else{ splitData[bag].add(inst); if(m_ClassAttribute.isNominal()){ // Nominal class all[(int)inst.classValue()] += inst.weight(); missing[(int)inst.classValue()] += inst.weight(); } else{ // Numeric class totalWS += inst.weight() * inst.classValue() * inst.classValue(); totalWV += inst.weight() * inst.classValue(); msingWS += inst.weight() * inst.classValue() * inst.classValue(); msingWV += inst.weight() * inst.classValue(); } } } // The total weights of the whole grow data double whole; if(m_ClassAttribute.isNominal()) whole = sum + Utils.sum(uncover); else whole = sum + uncoverSum; // Find the split double minEntrp=Double.MAX_VALUE; maxInfoGain = 0; // Check if >=2 splits have more than the minimal data int count=0; for(int x=0; x<bag; x++) if(Utils.grOrEq(coverage[x], m_MinNo)) ++count; if(count < 2){ // Don't split maxInfoGain = 0; inform = defInfo; value = Double.NaN; return null; } for(int x=0; x<bag; x++){ double t = coverage[x], entrp, infoGain; if(Utils.sm(t, m_MinNo)) continue; if(m_ClassAttribute.isNominal()){ // Nominal class double[] other = new double[m_NumClasses]; for(int y=0; y < m_NumClasses; y++) other[y] = all[y] - stats[x][y] + uncover[y]; double otherCover = whole - t; // Entropies of data covered and uncovered entrp = entropy(stats[x], t); double uncEntp = entropy(other, otherCover); // Weighted average infoGain = defInfo - (entrp*t + uncEntp*otherCover)/whole; } else{ // Numeric class double weight = (whole - t); entrp = wtMeanSqErr(wSq[x], wVl[x], t)/t; infoGain = defInfo - (entrp * t) - wtMeanSqErr((totalWS-wSq[x]+uncoverWtSq), (totalWV-wVl[x]+uncoverWtVl), weight); } // Test the exclusive expression boolean isWithin =true; if(m_IsExclude){ double infoGain2, entrp2; if(m_ClassAttribute.isNominal()){ // Nominal class double[] other2 = new double[m_NumClasses]; double[] notIn = new double[m_NumClasses]; for(int y=0; y < m_NumClasses; y++){ other2[y] = stats[x][y] + missing[y] + uncover[y]; notIn[y] = all[y] - stats[x][y] - missing[y]; } double msSum = Utils.sum(missing); double otherCover2 = t + msSum + Utils.sum(uncover); entrp2 = entropy(notIn, (sum-t-msSum)); double uncEntp2 = entropy(other2, otherCover2); infoGain2 = defInfo - (entrp2*(sum-t-msSum) + uncEntp2*otherCover2)/whole; } else{ // Numeric class double msWts = splitData[bag].sumOfWeights(); double weight2 = t + uncoverSum + msWts; entrp2 = wtMeanSqErr((totalWS-wSq[x]-msingWS), (totalWV-wVl[x]-msingWV),(sum-t-msWts)) /(sum-t-msWts); infoGain2 = defInfo - entrp2 * (sum-t-msWts) - wtMeanSqErr((wSq[x]+uncoverWtSq+msingWS), (wVl[x]+uncoverWtVl+msingWV), weight2); } // Use the exclusive expression? if (Utils.gr(infoGain2, infoGain) || (Utils.eq(infoGain2, infoGain) && Utils.sm(entrp2, entrp))){ infoGain = infoGain2; entrp = entrp2; isWithin =false; } } // Test this split if (Utils.gr(infoGain, maxInfoGain) || (Utils.eq(infoGain, maxInfoGain) && Utils.sm(entrp, minEntrp))){ value = (double)x; maxInfoGain = infoGain; inform = maxInfoGain - defInfo; minEntrp = entrp; isIn = isWithin; } } return splitData; } /** * Whether the instance is covered by this antecedent * * @param inst the instance in question * @return the boolean value indicating whether the instance is covered * by this antecedent */ public boolean isCover(Instance inst){ boolean isCover=false; if(!inst.isMissing(att)){ if(isIn){ if(Utils.eq(inst.value(att), value)) isCover=true; } else if(!Utils.eq(inst.value(att), value)) isCover=true; } return isCover; } /** * Whether the expression is "att = value" or att != value" * for this nominal attribute. True if in the former expression, * otherwise the latter * * @return the boolean value */ public boolean isIn(){ return isIn; } /** * Prints this antecedent * * @return a textual description of this antecedent */ public String toString() { String symbol = isIn ? " = " : " != "; return (att.name() + symbol + att.value((int)value)); } } /** * Returns an enumeration describing the available options * Valid options are: <p> * * -N number <br> * Set number of folds for REP. One fold is * used as the pruning set. (Default: 3) <p> * * -R <br> * Set if NOT randomize the data before split to growing and * pruning data. If NOT set, the seed of randomization is * specified by the -S option. (Default: randomize) <p> * * -S <br> * Seed of randomization. (Default: 1)<p> * * -E <br> * Set whether consider the exclusive expressions for nominal * attribute split. (Default: false) <p> * * -M number <br> * Set the minimal weights of instances within a split. * (Default: 2) <p> * * -P number <br> * Set the number of antecedents allowed in the rule if pre-pruning * is used. If this value is other than -1, then pre-pruning will be * used, otherwise the rule uses REP. (Default: -1) <p> * * @return an enumeration of all the available options */ public Enumeration listOptions() { Vector newVector = new Vector(6); newVector.addElement(new Option("\tSet number of folds for REP\n" + "\tOne fold is used as pruning set.\n" + "\t(default 3)","N", 1, "-N <number of folds>")); newVector.addElement(new Option("\tSet if NOT uses randomization\n" +
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -