📄 datasource.cpp
字号:
// DataSource.cpp: implementation of the DataSource class.////////////////////////////////////////////////////////////////////////#include "StdAfx.h"#include <algorithm>#include <assert.h>#include "global.h"#include "HashTable.h"#include "DESCRIPTION.h"#include "TUPLE.h"#include "LITERAL.h"#include "RULE.h"#include "RULESET.h"#include "AGGREGATES.h"#include "DataSource.h"#define LOG2 0.6931472//////////////////////////////////////////////////////////////////////// Construction/Destruction//////////////////////////////////////////////////////////////////////DataSource::DataSource(){ m_nTuples=0; m_Tuples=NULL; m_nRules=0; m_Rules=NULL; m_nTupleInClass=NULL;}DataSource::~DataSource(){ if(m_Tuples) delete[] m_Tuples; if(m_Rules) delete[] m_Rules; if(m_nTupleInClass) delete[] m_nTupleInClass;}void DataSource::ResetAllWeights(){ for(int i=0;i<m_nTuples;i++) m_Tuples[i].weight=1.0;}double DataSource::TotalWeights(){ double result=0; for(int i=0;i<m_nTuples;i++) result+=m_Tuples[i].weight; return result;}double DataSource::TotalWeights(const std::list<TUPLE*>& list1){ double result=0; for(std::list<TUPLE*>::const_iterator it = list1.begin(); it != list1.end(); it++) { result += (*it)->weight; } return result;}bool DataSource::ReadData(CString filename){ FILE *fin=fopen(filename,"r"); if(fin==NULL) return false; m_Descript.ReadFrom(fin); m_nTupleInClass=new int[m_Descript.GetNumClasses()]; memset(m_nTupleInClass,0,sizeof(int)*m_Descript.GetNumClasses()); fscanf(fin,"%d\n",&m_nTuples); m_Tuples=new TUPLE[m_nTuples]; for(int iTuple=0;iTuple<m_nTuples;iTuple++) { int cls; fscanf(fin,"%d ",&cls); m_Tuples[iTuple].Initialize(m_Descript); m_Tuples[iTuple].cls=cls; for(int iAttr=0;iAttr<m_Descript.GetNumAttr();iAttr++) { int val; fscanf(fin,"%d ",&val); //The original data is 1 based. Now I change it into 0 based.// val--; if(val>=0&&val<m_Descript.GetNumValues(iAttr)) { m_Tuples[iTuple].attr[iAttr]=val; } else { m_Tuples[iTuple].attr[iAttr]=0; } } m_nTupleInClass[cls]++; fscanf(fin,"\n"); } fclose(fin); return true;}bool DataSource::ReadRules(CString filename){ if(m_Rules!=NULL) delete[] m_Rules; FILE* fin=fopen(filename,"r"); if(fin==NULL) return false; m_nRules=count_num_lines(fin); fclose(fin); m_Rules=new RULE[m_nRules]; fin=fopen(filename,"r"); for(int i=0;i<m_nRules;i++) { m_Rules[i].ReadFrom(fin); } fclose(fin); return true;}double DataSource::EvaluateRule(const RULE& rule,bool expected_error) const//return expected precision{ int nClasses=m_Descript.GetNumClasses(); int m=0,p=0; //m is the number of examples satisfying rule, among which p is the number of examples correctly classified for(int i=0;i<m_nTuples;i++) { if(rule.Satisfy(m_Tuples[i])) { m++; if(rule.GetClass()==m_Tuples[i].cls) p++; } } if(expected_error) {// double err=error_rate(m,p); return (p+1)/(double)(m+nClasses); } else { return (double)p/m; }/* //chi square int nCls=m_Descript.GetNumClasses(); int *nTupleCls=new int[nCls]; memset(nTupleCls,0,sizeof(int)*nCls); for(int i=0;i<m_nTuples;i++) { int cls=m_Tuples[i].cls; if(rule.Satisfy(m_Tuples[i])) { nTupleCls[cls]++; } } double chi2=0; for(i=0;i<nCls;i++) { if(m_nTupleInClass[i]==0) continue; chi2+= (nTupleCls[i]-m_nTupleInClass[i]) * (nTupleCls[i]-m_nTupleInClass[i]) / (double)m_nTupleInClass[i]; } double sup_P=nTupleCls[rule.GetClass()]; double sup_c=m_nTupleInClass[rule.GetClass()]; double T=m_nTuples; double e = 1.0/(sup_P*sup_c) + 1.0/(sup_P*(T-sup_c)) + 1.0/((T-sup_P)*sup_c) + 1.0/((T-sup_P)*(T-sup_c)); double temp=__min(sup_P,sup_c)-sup_P*sup_c/T; double max_chi2=temp*temp*T*e; return chi2*chi2/max_chi2;*/}double DataSource::EvaluateRule(const RULE& rule,const std::list<TUPLE*>& P,const std::list<TUPLE*>& N,bool expected_error) const{ int nClasses=m_Descript.GetNumClasses(); //using weight double m=0,p=0; for(std::list<TUPLE*>::const_iterator pos=P.begin(); pos!=P.end(); pos++) { if(rule.Satisfy(**pos)) { m+=(**pos).weight; p+=(**pos).weight; } } for(std::list<TUPLE*>::const_iterator pos=N.begin(); pos!=N.end(); pos++) { if(rule.Satisfy(**pos)) m+=(**pos).weight; } if(expected_error) {// double err=error_rate((int)(m+0.5),(int)(p+0.5)); return (p+1)/(double)(m+nClasses); } else { return (double)p/m; }}void DataSource::EvaluateRule(const RULE& rule,int& sup,double& conf,double& lift){ int m=0,p=0; //m is the number of examples satisfying rule, among which p is the number of examples correctly classified for(int i=0;i<m_nTuples;i++) { if(rule.Satisfy(m_Tuples[i])) { m++; if(rule.GetClass()==m_Tuples[i].cls) p++; } } sup=p; conf=(double)p/m; lift= conf / (m_nTupleInClass[rule.GetClass()]/(double)m_nTuples);}void DataSource::GenAllLiterals(std::list<LITERAL>& llist){ llist.clear(); for(int iAttr=0;iAttr<m_Descript.GetNumAttr();iAttr++) { for(int iVal=0;iVal<m_Descript.GetNumValues(iAttr);iVal++) { LITERAL lit; lit.Set(iAttr,iVal); llist.push_back(lit); } }}double DataSource::EvaluateLiteral(const LITERAL& lit,const std::list<TUPLE*>& P,const std::list<TUPLE*>& N) const{// Using weight double nP=0,nN=0,nP_=0,nN_=0; for(std::list<TUPLE*>::const_iterator pos=P.begin(); pos!=P.end(); pos++) { double weight=(**pos).weight; nP+=weight; if(lit.Satisfy(**pos)) nP_+=weight; } for(std::list<TUPLE*>::const_iterator pos=N.begin();pos!=N.end(); pos++) { double weight=(**pos).weight; nN+=weight; if(lit.Satisfy(**pos)) nN_+=weight; } double old_entropy=-log((double)nP/(nP+nN))/LOG2; double new_entropy=-log((double)nP_/(nP_+nN_))/LOG2; return nP_*(old_entropy-new_entropy);}double DataSource::EvaluateLiteral(double nP,double nN,double nP_,double nN_) const{ if(nP_<0.1) return 0.0; double old_entropy=-log((double)nP/(nP+nN))/LOG2; double new_entropy=-log((double)nP_/(nP_+nN_))/LOG2; return nP_*(old_entropy-new_entropy);}void DataSource::GenRulesFOIL(int iClass,RULESET& Rules){ /* information needed to be maintained during the rule finding process: total_aggr,AP,total_weight. */ ResetAllWeights(); AGGREGATES total_aggr; total_aggr.Initialize(m_Descript); std::list<TUPLE*> AP,AN; int nAttr=m_Descript.num_attr; for(int i=0;i<m_nTuples;i++) { int isP=0; if(m_Tuples[i].cls==iClass) { isP=1; AP.push_back(&(m_Tuples[i])); } else { AN.push_back(&(m_Tuples[i])); } for(int j=0;j<m_Descript.num_attr;j++) { total_aggr.m_Data[j][m_Tuples[i].attr[j]][isP]++; } } double ori_total_weight=AP.size(); double total_weight=ori_total_weight; RULESET temp_ruleset; while(total_weight>ori_total_weight/20) { //begin searching for a rule int i; std::list<TUPLE*>::iterator pos,old_pos; /* information needed to be maintained during the literal finding process: aggr,P,N,nP,nN,R. */ /* initialization aggr:aggragtes for each literal P,N:positives and negatives in searching for the rule nP,nN:total weights of P and N */ AGGREGATES aggr; std::list<TUPLE*> P,N; double nP=0,nN=0; RULE R; if(m_RuleStack.empty()) { aggr=total_aggr; for(pos=AP.begin(); pos!=AP.end(); pos++) P.push_back(*pos); for(pos=AN.begin(); pos!=AN.end(); pos++) N.push_back(*pos); nP=total_weight; nN=N.size(); R.SetClass(iClass); } else { aggr.Initialize(m_Descript); R=m_RuleStack.front(); m_RuleStack.pop_front(); //calculate P and N according to R for(pos=AP.begin();pos!=AP.end();pos++) { TUPLE* pTuple=*pos; if(R.Satisfy(*pTuple)) { P.push_back(pTuple); nP+=pTuple->weight; } } for(pos=AN.begin();pos!=AN.end();pos++) { TUPLE* pTuple=*pos; if(R.Satisfy(*pTuple)) { N.push_back(pTuple); } } nN=N.size(); //calculate aggr according to P and N for(pos=P.begin();pos!=P.end();pos++) { TUPLE* pTuple=*pos; for(int j=0;j<m_Descript.num_attr;j++) { aggr.m_Data[j][pTuple->attr[j]][1]+=pTuple->weight; } } for(pos=N.begin();pos!=N.end();pos++) { TUPLE* pTuple=*pos; for(int j=0;j<m_Descript.num_attr;j++) { aggr.m_Data[j][pTuple->attr[j]][0]+=pTuple->weight; } } } //each loop searches for the next literal while(1) { int i,j; std::list<TUPLE*>::iterator pos,old_pos; TUPLE* pTuple; //select best literal double max_gain=-10000000.0; LITERAL lit; for(i=0;i<m_Descript.num_attr;i++) { for(j=0;j<m_Descript.num_values[i];j++) { double nP_=aggr.m_Data[i][j][1]; double nN_=aggr.m_Data[i][j][0]; double gain=EvaluateLiteral(nP,nN,nP_,nN_); if(gain>max_gain) { max_gain=gain; lit.Set(i,j); } } } if(max_gain<0.7) break; bool first_time=true; for(i=0;i<m_Descript.num_attr;i++) { for(j=0;j<m_Descript.num_values[i];j++) { double nP_=aggr.m_Data[i][j][1]; double nN_=aggr.m_Data[i][j][0]; double gain=EvaluateLiteral(nP,nN,nP_,nN_); if(gain>max_gain*0.99) { if(first_time) { lit.Set(i,j); first_time=false; continue; } LITERAL new_lit; new_lit.Set(i,j); RULE r=R; r.AddLiteral(new_lit); m_RuleStack.push_front(r);// r.WriteTo(stdout); } } } R.AddLiteral(lit); //remove all tuples from P and N std::list<TUPLE*> P_removed,N_removed; pos=P.begin(); while(pos!=P.end()) { pTuple=*pos; if(pTuple->attr[lit.m_Var]!=lit.m_Value) { old_pos=pos; pos++; P.erase(old_pos); P_removed.push_back(pTuple); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -