📄 datasource.cpp
字号:
else { pos++; } } pos=N.begin(); while(pos!=N.end()) { pTuple=*pos; if(pTuple->attr[lit.m_Var]!=lit.m_Value) { old_pos=pos; pos++; N.erase(old_pos); N_removed.push_back(pTuple); } else { pos++; } } //if most of the tuples are removed, we recalculate aggr by tuples remained, //otherwise we update aggr by tuples removed. if(P.size()+N.size()<=P_removed.size()+N_removed.size()) { aggr.Initialize(m_Descript); nP=0; nN=0; for(pos=P.begin();pos!=P.end();pos++) { pTuple=*pos; nP+=pTuple->weight; for(i=0;i<nAttr;i++) aggr.m_Data[i][pTuple->attr[i]][1]+=pTuple->weight; } for(pos=N.begin();pos!=N.end();pos++) { pTuple=*pos; nN+=1.0; for(i=0;i<nAttr;i++) aggr.m_Data[i][pTuple->attr[i]][0]+=1.0; //negative example always has weight 1.0 } } else { for(pos=P_removed.begin();pos!=P_removed.end();pos++) { pTuple=*pos; nP-=pTuple->weight; for(i=0;i<nAttr;i++) aggr.m_Data[i][pTuple->attr[i]][1]-=pTuple->weight; } for(pos=N_removed.begin();pos!=N_removed.end(); pos++) { pTuple=*pos; nN-=1.0; for(i=0;i<nAttr;i++) aggr.m_Data[i][pTuple->attr[i]][0]-=1.0; } } }//end of searching for a rule if(R.GetSize()>0) temp_ruleset.push_back(R);// printf("--");// R.WriteTo(stdout); if(m_RuleStack.empty()) { if(temp_ruleset.empty()) break; for(RULESET::iterator pos1=temp_ruleset.begin();pos1!=temp_ruleset.end();pos1++) { R=*pos1; Rules.push_back(R); //decrease the weights of all postive tuples satisfying R. Remove it if its weight<0.1. TUPLE* pTuple; pos=AP.begin(); while(pos!=AP.end()) { bool removed=false; pTuple=*pos; if(R.Satisfy(*pTuple)) { double weight_lost; double old_weight=pTuple->weight; pTuple->weight*=0.66667; weight_lost=old_weight-pTuple->weight; if(pTuple->weight<0) { removed=true; old_pos=pos; pos++; AP.erase(old_pos); weight_lost=old_weight; } //update total_weight and total_aggr total_weight-=weight_lost; for(i=0;i<nAttr;i++) total_aggr.m_Data[i][pTuple->attr[i]][1]-=weight_lost; } if(!removed) ++pos; } } temp_ruleset.clear(); } }//end of searching for all rules}int DataSource::GenRulesFOIL(CString outputfilename)//return number of rules{/* RULESET Rules,Rules2,Rules3; for(int iClass=0;iClass<m_Descript.GetNumClasses();iClass++) { GenRulesFOIL(iClass,Rules); } for(POSITION pos=Rules.GetHeadPosition();pos!=NULL;Rules.GetNext(pos)) { RULE rule=Rules.GetAt(pos); double gain=EvaluateRule(rule,false); rule.SetInfoGain(gain); Rules3.AddRuleByGain(rule); } RULE newrule; for(pos=Rules3.GetHeadPosition();pos!=NULL;Rules3.GetNext(pos)) { RULE rule=Rules3.GetAt(pos); double gain=EvaluateRule(rule,false); rule.SetInfoGain(gain); if(rule==newrule) { newrule.SetInfoGain(newrule.GetInfoGain()+rule.GetInfoGain()); } else { if(newrule.GetSize()>0) Rules2.AddRuleByGain(newrule); newrule=rule; } } if(newrule.GetSize()>0) Rules2.AddRuleByGain(newrule); Rules2.WriteTo(outputfilename); return Rules2.size();*/ RULESET Rules,Rules2,Rules3; for(int iClass=0;iClass<m_Descript.GetNumClasses();iClass++) { GenRulesFOIL(iClass,Rules); } for(RULESET::iterator pos=Rules.begin();pos!=Rules.end();pos++) { RULE rule=*pos; double gain=EvaluateRule(rule); rule.SetInfoGain(gain); Rules3.AddRule(rule); } RULE newrule; for(RULESET::iterator pos=Rules3.begin();pos!=Rules3.end();pos++) { RULE rule=*pos; Rules2.AddRuleByGain(rule); } Rules2.WriteTo(outputfilename); return Rules2.size();}int DataSource::Classify(const TUPLE& tuple,int &iRule) const{/*// Classify with best single rule double max_conf=-1.0; int res_class=0; for(int i=0;i<m_nRules;i++) { if(m_Rules[i].Satisfy(tuple)) { if(m_Rules[i].GetInfoGain()>max_conf) { max_conf=m_Rules[i].GetInfoGain(); res_class=m_Rules[i].GetClass(); } } } return res_class;*/// Vote among rules// double m=m_nRules/(double)(m_Descript.GetNumClasses());// double a=1-exp(-4.0/m);// int nChosen=(int)(2*m*a+0.5); int i; double max_conf=-1.0; int res_class=0; double* conf=new double[m_Descript.GetNumClasses()]; double* high_conf=new double[m_Descript.GetNumClasses()]; int* i_rule=new int[m_Descript.GetNumClasses()]; int* count=new int[m_Descript.GetNumClasses()]; for(i=0;i<m_Descript.GetNumClasses();i++) { conf[i]=0; high_conf[i]=0; i_rule[i]=-1; count[i]=0; } for(i=0;i<m_nRules;i++) { if(m_Rules[i].Satisfy(tuple)) { int cls=m_Rules[i].GetClass(); double gain=m_Rules[i].GetInfoGain(); double c=conf[cls]; if(count[cls]<5) {// conf[cls]=1-(1-c)*(1-m_Rules[i].GetInfoGain()); conf[cls]+=m_Rules[i].GetInfoGain(); count[cls]++; } if(gain>high_conf[cls]) { high_conf[cls]=gain; i_rule[cls]=i; } } } for(i=0;i<m_Descript.GetNumClasses();i++) { if(conf[i]>max_conf) { res_class=i; max_conf=conf[i]; } } iRule=i_rule[res_class]; delete conf; delete high_conf; delete i_rule; delete count; return res_class;}double DataSource::Classify(CString outputfilename) const{ int nClasses=m_Descript.GetNumClasses(); int nCorrect=0; int* nTuples=new int[nClasses]; int* nTruePositive=new int[nClasses];; int* nFalsePositive=new int[nClasses]; memset(nTuples,0,sizeof(int)*nClasses); memset(nTruePositive,0,sizeof(int)*nClasses); memset(nFalsePositive,0,sizeof(int)*nClasses); FILE* out=fopen(outputfilename,"w"); fprintf(out,"TID TRUE_LABEL PRED_LABEL RULE_IDX RULE_SCORE\n"); for(int iTuple=0;iTuple<m_nTuples;iTuple++) { int label_class=m_Tuples[iTuple].cls; nTuples[label_class]++; int iRule=0; int pred_class=Classify(m_Tuples[iTuple],iRule); if(pred_class==label_class) { nCorrect++; nTruePositive[label_class]++; } else { nFalsePositive[pred_class]++; } fprintf(out,"%d\t%d\t%d\t%d\t%.3f\n",iTuple,label_class,pred_class,iRule,m_Rules[iRule].GetInfoGain()); } fprintf(out,"\nAccuracy: %.2f%%\n",100.0*nCorrect/m_nTuples); printf("Accuracy: %.2f%%\n",100.0*nCorrect/m_nTuples); for(int iClass=0;iClass<nClasses;iClass++) { fprintf(out,"CLASS %d: #Tuples %d precision %.2f%%, recall %.2f%%\n",iClass,nTuples[iClass],100.0*nTruePositive[iClass]/(nTruePositive[iClass]+nFalsePositive[iClass]),100.0*nTruePositive[iClass]/nTuples[iClass]); } fclose(out); delete nTuples; delete nTruePositive; delete nFalsePositive; return nCorrect/(double)m_nTuples;}int compare_rule(const void* elem1,const void* elem2){ RULE** pRule1=(RULE**)elem1; RULE** pRule2=(RULE**)elem2; if((*pRule1)->GetInfoGain()>(*pRule2)->GetInfoGain()) return -1; else return 1;}void DataSource::EvaluateRules(CString outputfile){ FILE* fout=fopen(outputfile,"w"); int sup; double conf,lift; int nClasses=m_Descript.GetNumClasses(); for(int iClass=0;iClass<nClasses;iClass++) { fprintf(fout,"\nRules for class %d\n",iClass); for(int iRule=0;iRule<m_nRules;iRule++) { if(m_Rules[iRule].GetClass()==iClass) { EvaluateRule(m_Rules[iRule],sup,conf,lift); fprintf(fout,"sup=%d,conf=%.3f,lift=%.3f ",sup,conf,lift); m_Rules[iRule].WriteTo(fout); } } } fclose(fout);}void DataSource::IntepretRules(CString rule_semantics_file,CString outputfilename) const{ vector<CString>* Semantics=new vector<CString>[m_Descript.GetNumAttr()]; vector<CString> Class_labels; int temp; FILE* fin=fopen(rule_semantics_file,"r"); fscanf(fin,"%d attributes\n",&temp); assert(temp==m_Descript.GetNumAttr()); char buf[200]; for(int i=0;i<m_Descript.GetNumAttr();i++) { int iAttr,nValue; fscanf(fin,"\nattribute%d %d %s\n",&iAttr,&nValue,buf); assert(iAttr==i+1&&nValue==m_Descript.GetNumValues(i)); CString attr_name=buf; Semantics[i].push_back(attr_name); for(int j=0;j<nValue;j++) { fgets(buf,200,fin); CString value_name=buf; int len=value_name.GetLength(); if(value_name.GetAt(len-1)=='\n') value_name=value_name.Left(len-1); Semantics[i].push_back(value_name); } } int nClasses; fscanf(fin,"\nclass_label %d\n",&nClasses); assert(nClasses==m_Descript.num_classes); for(int i=0;i<nClasses;i++) { fgets(buf,200,fin); CString class_label=buf; int len=class_label.GetLength(); if(class_label.GetAt(len-1)=='\n') class_label=class_label.Left(len-1); Class_labels.push_back(class_label); } fclose(fin); RULE** pRules=new RULE*[m_nRules]; for(int i=0;i<m_nRules;i++) pRules[i]=&(m_Rules[i]); qsort(pRules,m_nRules,sizeof(RULE*),compare_rule); FILE* fout=fopen(outputfilename,"w"); for(int i=0;i<m_nRules;i++) { RULE rule=*pRules[i]; int pos = rule.GetClass(); fprintf(fout,"%s (%.3f) :- ", (const char*) Class_labels[pos], rule.GetInfoGain()); for(int j=0;j<rule.GetSize();j++) { LITERAL lit=rule.GetLiteral(j); CString attr_name=Semantics[lit.GetVar()][0]; int pos=lit.GetValue()+1; CString value_name=Semantics[lit.GetVar()][pos]; fprintf(fout,"%s:%s, ", (const char*) attr_name, (const char*) value_name); } fprintf(fout,"\n"); } fclose(fout); delete[] Semantics; delete pRules;}void DataSource::GenAccrCurve(CString datafile,CString rulefile,CString outputfile){ ReadData(datafile); ReadRules(rulefile); int nAllRule=m_nRules; int nClass=m_Descript.GetNumClasses(); int step=nAllRule/nClass/50; if (step<1) step = 1; int maxNumRule=0; for(int iClass=0;iClass<nClass;iClass++) { int count=0; for(int iRule=0;iRule<nAllRule;iRule++) { if(m_Rules[iRule].GetClass()==iClass) count++; } if(count>maxNumRule) maxNumRule=count; } DataSource dsrc; dsrc.ReadData(datafile); FILE* fout=fopen(outputfile,"w"); for(int nRule=1;nRule<maxNumRule;nRule+=step) { FILE* frule=fopen("rule.txt","w"); for(int iClass=0;iClass<nClass;iClass++) { int count=0; for(int iRule=0;iRule<nAllRule;iRule++) { if(m_Rules[iRule].GetClass()==iClass) { m_Rules[iRule].WriteTo(frule); count++; if(count==nRule) break; } } } fclose(frule); dsrc.ReadRules("rule.txt"); double accr=dsrc.Classify("temp.txt"); fprintf(fout,"%d rules: %.4f\n",nRule,accr); } fclose(fout);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -