📄 jrip.java
字号:
}
return isCover;
}
/**
 * Whether this rule has antecedents, i.e. whether it is a default rule
 *
 * @return the boolean value indicating whether the rule has antecedents
 */
public boolean hasAntds(){
  // A null or empty antecedent list both mean "default rule"
  return (m_Antds != null) && (m_Antds.size() > 0);
}
/**
 * The number of antecedents of the rule.
 * Null-safe: an uninitialized antecedent list counts as size 0, which is
 * consistent with hasAntds() treating null as "no antecedents" (the
 * original dereferenced m_Antds unconditionally and could NPE).
 *
 * @return the size of this rule
 */
public double size(){
  return (m_Antds == null) ? 0.0 : (double)m_Antds.size();
}
/**
 * Private function to compute default number of accurate instances
 * in the specified data for the consequent of the rule
 *
 * @param data the data in question
 * @return the default accuracy number (sum of weights of instances whose
 *         class equals the rule's consequent)
 */
private double computeDefAccu(Instances data){
  double accurateWeight = 0;
  int numInst = data.numInstances();
  for(int idx = 0; idx < numInst; idx++){
    Instance sample = data.instance(idx);
    if((int)sample.classValue() == (int)m_Consequent)
      accurateWeight += sample.weight();
  }
  return accurateWeight;
}
/**
 * Build one rule using the growing data: greedily append the antecedent
 * with the highest information gain until the rule is perfect on the
 * remaining growing data, accurate coverage drops below m_MinNo, or no
 * attribute is left to test.
 *
 * @param data the growing data used to build the rule
 * @exception Exception if the consequent is not set yet
 */
public void grow(Instances data) throws Exception {
  if(m_Consequent == -1)
    throw new Exception(" Consequent not set yet.");

  Instances growData = data;
  double sumOfWeights = growData.sumOfWeights();
  if(!Utils.gr(sumOfWeights, 0.0))
    return;

  /* Compute the default accurate rate of the growing data */
  double defAccu = computeDefAccu(growData);
  // Laplace-smoothed accuracy of the rule as grown so far
  double defAcRt = (defAccu+1.0)/(sumOfWeights+1.0);

  /* Keep the record of which attributes have already been used */
  boolean[] used = new boolean[growData.numAttributes()];
  for (int k=0; k<used.length; k++)
    used[k] = false;
  int numUnused = used.length;

  // If there are already antecedents existing, mark their nominal
  // attributes as used; numeric attributes may be tested repeatedly
  for(int j=0; j < m_Antds.size(); j++){
    Antd antdj = (Antd)m_Antds.elementAt(j);
    if(!antdj.getAttr().isNumeric()){
      used[antdj.getAttr().index()] = true;
      numUnused--;
    }
  }

  double maxInfoGain;
  // Stop when no data remains, every attribute is used up, or the rule
  // is already 100% accurate on the remaining growing data
  while (Utils.gr(growData.numInstances(), 0.0) &&
         (numUnused > 0) &&
         Utils.sm(defAcRt, 1.0)){

    maxInfoGain = 0.0; // We require that infoGain be positive

    /* Build a list of antecedents */
    Antd oneAntd = null;
    Instances coverData = null;
    // BUG FIX: original called the non-existent "emerateAttributes()";
    // the Weka Instances API method is enumerateAttributes()
    Enumeration enumAttr = growData.enumerateAttributes();

    /* Build one condition based on all attributes not used yet */
    while (enumAttr.hasMoreElements()){
      Attribute att = (Attribute)(enumAttr.nextElement());

      if(m_Debug)
        System.err.println("\nOne condition: size = "
                           + growData.sumOfWeights());

      Antd antd = null;
      if(att.isNumeric())
        antd = new NumericAntd(att);
      else
        antd = new NominalAntd(att);

      if(!used[att.index()]){
        /* Compute the best information gain for each attribute,
           it's stored in the antecedent formed by this attribute.
           This procedure returns the data covered by the antecedent */
        Instances coveredData = computeInfoGain(growData, defAcRt,
                                                antd);
        if(coveredData != null){
          double infoGain = antd.getMaxInfoGain();
          if(m_Debug)
            System.err.println("Test of \'"+antd.toString()+
                               "\': infoGain = "+
                               infoGain + " | Accuracy = " +
                               antd.getAccuRate()+
                               "="+antd.getAccu()
                               +"/"+antd.getCover()+
                               " def. accuracy: "+defAcRt);

          if(infoGain > maxInfoGain){
            oneAntd = antd;
            coverData = coveredData;
            maxInfoGain = infoGain;
          }
        }
      }
    }

    if(oneAntd == null) break;                      // Cannot find antds
    if(Utils.sm(oneAntd.getAccu(), m_MinNo)) break; // Too low coverage

    // Numeric attributes can be used more than once
    if(!oneAntd.getAttr().isNumeric()){
      used[oneAntd.getAttr().index()] = true;
      numUnused--;
    }

    m_Antds.addElement(oneAntd);
    growData = coverData;            // Grow data size is shrinking
    defAcRt = oneAntd.getAccuRate(); // New baseline for the next round
  }
}
/**
 * Compute the best information gain for the specified antecedent; the
 * gain itself is stored inside the antecedent by splitData().
 *
 * @param instances the data based on which the infoGain is computed
 * @param defAcRt the default accuracy rate of data
 * @param antd the specific antecedent
 * @return the data covered by the antecedent, or null if the split failed
 */
private Instances computeInfoGain(Instances instances, double defAcRt,
Antd antd){
Instances data = instances;
/* Split the data into bags.
The information gain of each bag is also calculated in this procedure */
Instances[] splitData = antd.splitData(data, defAcRt,
m_Consequent);
/* Get the bag of data to be used for next antecedents */
// NOTE(review): getAttrValue() presumably indexes the bag selected by the
// antecedent's chosen value -- confirm against Antd.splitData's contract
if(splitData != null)
return splitData[(int)antd.getAttrValue()];
else return null;
}
/**
 * Prune all the possible final sequences of the rule using the
 * pruning data. The measure used to prune the rule is based on
 * flag given.
 *
 * @param pruneData the pruning data used to prune the rule
 * @param useWhole flag to indicate whether use the error rate of
 * the whole pruning data instead of the data covered
 */
public void prune(Instances pruneData, boolean useWhole){
Instances data = pruneData;
double total = data.sumOfWeights();
// Nothing to evaluate against if the pruning data carries no weight
if(!Utils.gr(total, 0.0))
return;
/* The default accurate # and rate on pruning data */
double defAccu=computeDefAccu(data);
if(m_Debug)
System.err.println("Pruning with " + defAccu +
" positive data out of " + total +
" instances");
int size=m_Antds.size();
if(size == 0) return; // Default rule before pruning
// For the rule truncated after antecedent x:
//   worthValue[x] = accurately-predicted weight, coverage[x] = covered
//   weight, worthRt[x] = the resulting worth (accuracy) rate
double[] worthRt = new double[size];
double[] coverage = new double[size];
double[] worthValue = new double[size];
for(int w=0; w<size; w++){
worthRt[w]=coverage[w]=worthValue[w]=0.0;
}
/* Calculate accuracy parameters for all the antecedents in this rule */
double tn = 0.0; // True negative if useWhole
for(int x=0; x<size; x++){
Antd antd=(Antd)m_Antds.elementAt(x);
Attribute attr= antd.getAttr();
Instances newData = data;
// data is rebuilt each pass to hold only the instances covered so far,
// so antecedent x+1 is evaluated on progressively smaller data
data = new Instances(newData, 0); // Make data empty
for(int y=0; y<newData.numInstances(); y++){
Instance ins=newData.instance(y);
if(antd.covers(ins)){ // Covered by this antecedent
coverage[x] += ins.weight();
data.add(ins); // Add to data for further pruning
if((int)ins.classValue() == (int)m_Consequent) // Accurate prediction
worthValue[x] += ins.weight();
}
else if(useWhole){ // Not covered
if((int)ins.classValue() != (int)m_Consequent)
tn += ins.weight();
}
}
if(useWhole){
// tn accumulates across antecedents: an instance rejected by an
// earlier antecedent remains rejected by every longer prefix
worthValue[x] += tn;
worthRt[x] = worthValue[x] / total;
}
else // Note if coverage is 0, accuracy is 0.5
worthRt[x] = (worthValue[x]+1.0)/(coverage[x]+2.0);
}
// Baseline: Laplace-style accuracy of the empty (default) rule
double maxValue = (defAccu+1.0)/(total+2.0);
int maxIndex = -1;
for(int i=0; i<worthValue.length; i++){
if(m_Debug){
double denom = useWhole ? total : coverage[i];
System.err.println(i+"(useAccuray? "+!useWhole+"): "
+ worthRt[i] +
"="+worthValue[i]+
"/"+denom);
}
// Strict > keeps the earliest (shortest) prefix on ties
if(worthRt[i] > maxValue){ // Prefer to the
maxValue = worthRt[i]; // shorter rule
maxIndex = i;
}
}
/* Prune the antecedents according to the accuracy parameters */
// Drop everything after the best cut point; maxIndex == -1 removes all
// antecedents, collapsing this into a default rule
for(int z=size-1;z>maxIndex;z--)
m_Antds.removeElementAt(z);
}
/**
 * Prints this rule
 *
 * @param classAttr the class attribute in the data
 * @return a textual description of this rule
 */
public String toString(Attribute classAttr) {
  StringBuffer buf = new StringBuffer();
  int numAntds = m_Antds.size();
  // Parenthesized antecedents joined by " and "; empty when default rule
  for(int i = 0; i < numAntds; i++){
    buf.append("(" + ((Antd)(m_Antds.elementAt(i))).toString() + ")");
    if(i < numAntds - 1)
      buf.append(" and ");
  }
  buf.append(" => " + classAttr.name() +
             "=" + classAttr.value((int)m_Consequent));
  return buf.toString();
}
}
/**
 * Builds Ripper in the order of class frequencies. For each class
 * it's built in two stages: building and optimization
 *
 * @param instances the training data
 * @exception Exception if classifier can't be built successfully
 */
public void buildClassifier(Instances instances) throws Exception {
  if(instances.numInstances() == 0)
    throw new Exception(" No instances with a class value!");

  if (instances.checkForStringAttributes())
    throw new UnsupportedAttributeTypeException(" Cannot handle string attributes!");

  if (!instances.classAttribute().isNominal())
    throw new UnsupportedClassTypeException(" Only nominal class, please.");

  m_Random = instances.getRandomNumberGenerator(m_Seed);
  m_Total = RuleStats.numAllConditions(instances);
  if(m_Debug)
    System.err.println("Number of all possible conditions = "+m_Total);

  // Reorder the classes by ascending frequency so rules are learned for
  // the rarest class first
  Instances data = null;
  m_Filter = new ClassOrder();
  ((ClassOrder)m_Filter).setSeed(m_Random.nextInt());
  ((ClassOrder)m_Filter).setClassOrder(ClassOrder.FREQ_ASCEND);
  m_Filter.setInputFormat(instances);
  data = Filter.useFilter(instances, m_Filter);

  if(data == null)
    throw new Exception(" Unable to randomize the class orders.");

  data.deleteWithMissingClass();
  if(data.numInstances() == 0)
    throw new Exception(" No instances with a class value!");
  if(data.numInstances() < m_Folds)
    throw new Exception(" Not enough data for REP.");

  m_Class = data.classAttribute();
  m_Ruleset = new FastVector();
  m_RulesetStats = new FastVector();
  m_Distributions = new FastVector();

  // Sort by classes frequency
  double[] orderedClasses = ((ClassOrder)m_Filter).getClassCounts();
  if(m_Debug){
    System.err.println("Sorted classes:");
    for(int x=0; x < m_Class.numValues(); x++)
      System.err.println(x+": "+m_Class.value(x) + " has " +
                         orderedClasses[x] + " instances.");
  }

  // Iterate from less prevalent class to more frequent one
  oneClass:
  for(int y=0; y < data.numClasses()-1; y++){ // For each class

    double classIndex = (double)y;
    if(m_Debug){
      int ci = (int)classIndex;
      System.err.println("\n\nClass "+m_Class.value(ci)+"("+ci+"): "
                         + orderedClasses[y] + "instances\n"+
                         "=====================================\n");
    }

    if(Utils.eq(orderedClasses[y],0.0)) // No data for this class
      continue oneClass;

    // The expected FP/err is the proportion of the class
    double all = 0;
    for(int i=y; i<orderedClasses.length; i++)
      all += orderedClasses[i];
    double expFPRate = orderedClasses[y] / all;

    double classYWeights = 0, totalWeights = 0;
    for(int j=0; j < data.numInstances(); j++){
      Instance datum = data.instance(j);
      totalWeights += datum.weight();
      if((int)datum.classValue() == y){
        classYWeights += datum.weight();
      }
    }

    // DL of default rule, no theory DL, only data DL
    double defDL;
    if(classYWeights > 0)
      defDL = RuleStats.dataDL(expFPRate,
                               0.0,
                               totalWeights,
                               0.0,
                               classYWeights);
    else
      continue oneClass; // Subsumed by previous rules

    if(Double.isNaN(defDL) || Double.isInfinite(defDL))
      throw new Exception("Should never happen: "+
                          "defDL NaN or infinite!");
    if(m_Debug)
      System.err.println("The default DL = "+defDL);

    // Learn this class's ruleset; the uncovered data carries over to the
    // next (more frequent) class
    data = rulesetForOneClass(expFPRate, data, classIndex, defDL);
  }

  // Set the default rule: the last (most frequent) class catches whatever
  // the learned rules leave uncovered
  RipperRule defRule = new RipperRule();
  defRule.setConsequent((double)(data.numClasses()-1));
  m_Ruleset.addElement(defRule);

  RuleStats defRuleStat = new RuleStats();
  defRuleStat.setData(data);
  defRuleStat.setNumAllConds(m_Total);
  defRuleStat.addAndUpdate(defRule);
  m_RulesetStats.addElement(defRuleStat);

  // Cache per-rule class distributions, mapped back to the original
  // class order
  for(int z=0; z < m_RulesetStats.size(); z++){
    RuleStats oneClass = (RuleStats)m_RulesetStats.elementAt(z);
    for(int xyz=0; xyz < oneClass.getRulesetSize(); xyz++){
      double[] classDist = oneClass.getDistributions(xyz);
      // BUG FIX: check for null BEFORE normalizing; the original called
      // Utils.normalize(classDist) first and would NPE on a null
      // distribution, making its own null check unreachable
      if(classDist != null){
        Utils.normalize(classDist);
        m_Distributions.addElement(((ClassOrder)m_Filter).distributionsByOriginalIndex(classDist));
      }
    }
  }
}
/**
 * Classify the test instance with the rule learner and provide
 * the class distributions
 *
 * @param datum the instance to be classified
 * @return the distribution
 */
public double[] distributionForInstance(Instance datum){
  try{
    // First rule that covers the instance wins; rules are ordered
    int numRules = m_Ruleset.size();
    for(int r = 0; r < numRules; r++){
      RipperRule oneRule = (RipperRule)m_Ruleset.elementAt(r);
      if(oneRule.covers(datum))
        return (double[])m_Distributions.elementAt(r);
    }
  }catch(Exception e){
    System.err.println(e.getMessage());
    e.printStackTrace();
  }

  // Unreachable in a well-formed ruleset: the default rule covers all
  System.err.println("Should never happen!");
  return new double[datum.classAttribute().numValues()];
}
/** Build a ruleset for the given class according to the given data
*
* @param expFPRate the expected FP/(FP+FN) used in DL calculation
* @param data the given data
* @param classIndex the given class index
* @param defDL the default DL in the data
* @exception Exception if the ruleset cannot be built properly
*/
protected Instances rulesetForOneClass(double expFPRate,
Instances data,
double classIndex,
double defDL)
throws Exception {
Instances newData = data, growData, pruneData;
boolean stop = false;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -