📄 datasource.cpp

📁 关联分类算法采用贪心算法发现高质量分类规则
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
// DataSource.cpp: implementation of the DataSource class.////////////////////////////////////////////////////////////////////////#include "StdAfx.h"#include <algorithm>#include <assert.h>#include "global.h"#include "HashTable.h"#include "DESCRIPTION.h"#include "TUPLE.h"#include "LITERAL.h"#include "RULE.h"#include "RULESET.h"#include "AGGREGATES.h"#include "DataSource.h"#define LOG2	0.6931472//////////////////////////////////////////////////////////////////////// Construction/Destruction//////////////////////////////////////////////////////////////////////DataSource::DataSource(){	m_nTuples=0;	m_Tuples=NULL;	m_nRules=0;	m_Rules=NULL;	m_nTupleInClass=NULL;}DataSource::~DataSource(){	if(m_Tuples) delete[] m_Tuples;	if(m_Rules) delete[] m_Rules;	if(m_nTupleInClass) delete[] m_nTupleInClass;}void DataSource::ResetAllWeights(){	for(int i=0;i<m_nTuples;i++)		m_Tuples[i].weight=1.0;}double DataSource::TotalWeights(){	double result=0;	for(int i=0;i<m_nTuples;i++)		result+=m_Tuples[i].weight;	return result;}double DataSource::TotalWeights(const std::list<TUPLE*>& list1){	double result=0;	for(std::list<TUPLE*>::const_iterator it = list1.begin(); it != list1.end(); it++) {		result += (*it)->weight;	}	return result;}bool DataSource::ReadData(CString filename){	FILE *fin=fopen(filename,"r");	if(fin==NULL) return false;	m_Descript.ReadFrom(fin);	m_nTupleInClass=new int[m_Descript.GetNumClasses()];	memset(m_nTupleInClass,0,sizeof(int)*m_Descript.GetNumClasses());	fscanf(fin,"%d\n",&m_nTuples);	m_Tuples=new TUPLE[m_nTuples];	for(int iTuple=0;iTuple<m_nTuples;iTuple++)	{		int cls;		fscanf(fin,"%d ",&cls);		m_Tuples[iTuple].Initialize(m_Descript);		m_Tuples[iTuple].cls=cls;		for(int iAttr=0;iAttr<m_Descript.GetNumAttr();iAttr++)		{			int val;			fscanf(fin,"%d ",&val);			//The original data is 1 based. Now I change it into 0 based.//			val--;				if(val>=0&&val<m_Descript.GetNumValues(iAttr))			{				m_Tuples[iTuple].attr[iAttr]=val;			}			else			{				m_Tuples[iTuple].attr[iAttr]=0;			}		}		m_nTupleInClass[cls]++;		fscanf(fin,"\n");	}	fclose(fin);	return true;}bool DataSource::ReadRules(CString filename){	if(m_Rules!=NULL) delete[] m_Rules;	FILE* fin=fopen(filename,"r");	if(fin==NULL) return false;	m_nRules=count_num_lines(fin);	fclose(fin);	m_Rules=new RULE[m_nRules];	fin=fopen(filename,"r");	for(int i=0;i<m_nRules;i++)	{		m_Rules[i].ReadFrom(fin);	}	fclose(fin);	return true;}double DataSource::EvaluateRule(const RULE& rule,bool expected_error) const//return expected precision{	int nClasses=m_Descript.GetNumClasses();	int m=0,p=0;	//m is the number of examples satisfying rule, among which p is the number of examples correctly classified	for(int i=0;i<m_nTuples;i++)	{		if(rule.Satisfy(m_Tuples[i]))		{			m++;			if(rule.GetClass()==m_Tuples[i].cls) p++;		}	}	if(expected_error)	{//		double err=error_rate(m,p);		return (p+1)/(double)(m+nClasses);	}	else	{		return (double)p/m;	}/*	//chi square	int nCls=m_Descript.GetNumClasses();	int *nTupleCls=new int[nCls];	memset(nTupleCls,0,sizeof(int)*nCls);	for(int i=0;i<m_nTuples;i++)	{		int cls=m_Tuples[i].cls;		if(rule.Satisfy(m_Tuples[i]))		{			nTupleCls[cls]++;		}	}		double chi2=0;	for(i=0;i<nCls;i++)	{		if(m_nTupleInClass[i]==0) continue;		chi2+= (nTupleCls[i]-m_nTupleInClass[i]) * (nTupleCls[i]-m_nTupleInClass[i]) / (double)m_nTupleInClass[i];	}	double sup_P=nTupleCls[rule.GetClass()];	double sup_c=m_nTupleInClass[rule.GetClass()];	double T=m_nTuples;	double e = 1.0/(sup_P*sup_c) + 1.0/(sup_P*(T-sup_c)) + 1.0/((T-sup_P)*sup_c) + 1.0/((T-sup_P)*(T-sup_c));	double temp=__min(sup_P,sup_c)-sup_P*sup_c/T;	double max_chi2=temp*temp*T*e;	return chi2*chi2/max_chi2;*/}double DataSource::EvaluateRule(const RULE& rule,const std::list<TUPLE*>& P,const std::list<TUPLE*>& N,bool expected_error) const{	int nClasses=m_Descript.GetNumClasses();	//using weight	double m=0,p=0;	for(std::list<TUPLE*>::const_iterator pos=P.begin(); pos!=P.end(); pos++)	{		if(rule.Satisfy(**pos))		{			m+=(**pos).weight;			p+=(**pos).weight;		}	}	for(std::list<TUPLE*>::const_iterator pos=N.begin(); pos!=N.end(); pos++)	{		if(rule.Satisfy(**pos))			m+=(**pos).weight;	}	if(expected_error)	{//		double err=error_rate((int)(m+0.5),(int)(p+0.5));		return (p+1)/(double)(m+nClasses);	}	else	{		return (double)p/m;	}}void DataSource::EvaluateRule(const RULE& rule,int& sup,double& conf,double& lift){	int m=0,p=0;	//m is the number of examples satisfying rule, among which p is the number of examples correctly classified	for(int i=0;i<m_nTuples;i++)	{		if(rule.Satisfy(m_Tuples[i]))		{			m++;			if(rule.GetClass()==m_Tuples[i].cls) p++;		}	}	sup=p;	conf=(double)p/m;	lift= conf / (m_nTupleInClass[rule.GetClass()]/(double)m_nTuples);}void DataSource::GenAllLiterals(std::list<LITERAL>& llist){	llist.clear();	for(int iAttr=0;iAttr<m_Descript.GetNumAttr();iAttr++)	{				for(int iVal=0;iVal<m_Descript.GetNumValues(iAttr);iVal++)		{			LITERAL lit;			lit.Set(iAttr,iVal);			llist.push_back(lit);		}	}}double DataSource::EvaluateLiteral(const LITERAL& lit,const std::list<TUPLE*>& P,const std::list<TUPLE*>& N) const{//	Using weight	double nP=0,nN=0,nP_=0,nN_=0;	for(std::list<TUPLE*>::const_iterator pos=P.begin(); pos!=P.end(); pos++)	{		double weight=(**pos).weight;		nP+=weight;		if(lit.Satisfy(**pos)) 			nP_+=weight;	}	for(std::list<TUPLE*>::const_iterator pos=N.begin();pos!=N.end(); pos++)	{		double weight=(**pos).weight;		nN+=weight;		if(lit.Satisfy(**pos))			nN_+=weight;	}	double old_entropy=-log((double)nP/(nP+nN))/LOG2;	double new_entropy=-log((double)nP_/(nP_+nN_))/LOG2;	return nP_*(old_entropy-new_entropy);}double DataSource::EvaluateLiteral(double nP,double nN,double nP_,double nN_) const{	if(nP_<0.1) return 0.0;	double old_entropy=-log((double)nP/(nP+nN))/LOG2;	double new_entropy=-log((double)nP_/(nP_+nN_))/LOG2;	return nP_*(old_entropy-new_entropy);}void DataSource::GenRulesFOIL(int iClass,RULESET& Rules){	/*		information needed to be maintained during the rule finding process:		total_aggr,AP,total_weight.	*/	ResetAllWeights();	AGGREGATES	total_aggr;	total_aggr.Initialize(m_Descript);	std::list<TUPLE*> AP,AN;	int nAttr=m_Descript.num_attr;	for(int i=0;i<m_nTuples;i++)	{		int isP=0;		if(m_Tuples[i].cls==iClass)		{			isP=1;			AP.push_back(&(m_Tuples[i]));		}		else		{			AN.push_back(&(m_Tuples[i]));		}		for(int j=0;j<m_Descript.num_attr;j++)		{			total_aggr.m_Data[j][m_Tuples[i].attr[j]][isP]++;		}	}	double ori_total_weight=AP.size();	double total_weight=ori_total_weight;	RULESET temp_ruleset;		while(total_weight>ori_total_weight/20)	{		//begin searching for a rule		int i;		std::list<TUPLE*>::iterator pos,old_pos;		/*			information needed to be maintained during the literal finding process:			aggr,P,N,nP,nN,R.		*/		/*			initialization			aggr:aggragtes for each literal			P,N:positives and negatives in searching for the rule			nP,nN:total weights of P and N		*/		AGGREGATES aggr;		std::list<TUPLE*> P,N;		double nP=0,nN=0;		RULE R;		if(m_RuleStack.empty())		{			aggr=total_aggr;			for(pos=AP.begin(); pos!=AP.end(); pos++)				P.push_back(*pos);			for(pos=AN.begin(); pos!=AN.end(); pos++)				N.push_back(*pos);						nP=total_weight;			nN=N.size();			R.SetClass(iClass);		}		else		{			aggr.Initialize(m_Descript);			R=m_RuleStack.front();			m_RuleStack.pop_front();			//calculate P and N according to R			for(pos=AP.begin();pos!=AP.end();pos++)			{				TUPLE* pTuple=*pos;				if(R.Satisfy(*pTuple))				{					P.push_back(pTuple);					nP+=pTuple->weight;				}			}			for(pos=AN.begin();pos!=AN.end();pos++)			{				TUPLE* pTuple=*pos;				if(R.Satisfy(*pTuple))				{					N.push_back(pTuple);				}			}			nN=N.size();			//calculate aggr according to P and N			for(pos=P.begin();pos!=P.end();pos++)			{				TUPLE* pTuple=*pos;				for(int j=0;j<m_Descript.num_attr;j++)				{					aggr.m_Data[j][pTuple->attr[j]][1]+=pTuple->weight;				}			}			for(pos=N.begin();pos!=N.end();pos++)			{				TUPLE* pTuple=*pos;				for(int j=0;j<m_Descript.num_attr;j++)				{					aggr.m_Data[j][pTuple->attr[j]][0]+=pTuple->weight;				}			}		}		//each loop searches for the next literal		while(1)		{			int i,j;			std::list<TUPLE*>::iterator pos,old_pos;			TUPLE* pTuple;			//select best literal			double max_gain=-10000000.0;			LITERAL lit;			for(i=0;i<m_Descript.num_attr;i++)			{				for(j=0;j<m_Descript.num_values[i];j++)				{					double nP_=aggr.m_Data[i][j][1];					double nN_=aggr.m_Data[i][j][0];					double gain=EvaluateLiteral(nP,nN,nP_,nN_);					if(gain>max_gain)					{						max_gain=gain;						lit.Set(i,j);					}				}			}			 			if(max_gain<0.7) break;			bool first_time=true;			for(i=0;i<m_Descript.num_attr;i++)			{				for(j=0;j<m_Descript.num_values[i];j++)				{					double nP_=aggr.m_Data[i][j][1];					double nN_=aggr.m_Data[i][j][0];					double gain=EvaluateLiteral(nP,nN,nP_,nN_);					if(gain>max_gain*0.99)					{						if(first_time)						{							lit.Set(i,j);							first_time=false;							continue;						}						LITERAL new_lit;						new_lit.Set(i,j);						RULE r=R;						r.AddLiteral(new_lit);						m_RuleStack.push_front(r);//						r.WriteTo(stdout);					}				}			}						R.AddLiteral(lit);			//remove all tuples from P and N			std::list<TUPLE*> P_removed,N_removed;			pos=P.begin();			while(pos!=P.end())			{				pTuple=*pos;				if(pTuple->attr[lit.m_Var]!=lit.m_Value)				{					old_pos=pos;					pos++;					P.erase(old_pos);					P_removed.push_back(pTuple);				}
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -