⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 datasource.cpp

📁 关联分类算法采用贪心算法发现高质量分类规则
💻 CPP
📖 第 1 页 / 共 2 页
字号:
				else				{					pos++;				}			}			pos=N.begin();			while(pos!=N.end())			{				pTuple=*pos;				if(pTuple->attr[lit.m_Var]!=lit.m_Value)				{					old_pos=pos;					pos++;					N.erase(old_pos);					N_removed.push_back(pTuple);				}				else				{					pos++;				}			}						//if most of the tuples are removed, we recalculate aggr by tuples remained, 			//otherwise we update aggr by tuples removed.			if(P.size()+N.size()<=P_removed.size()+N_removed.size())			{				aggr.Initialize(m_Descript);				nP=0;				nN=0;				for(pos=P.begin();pos!=P.end();pos++)				{					pTuple=*pos;					nP+=pTuple->weight;					for(i=0;i<nAttr;i++)						aggr.m_Data[i][pTuple->attr[i]][1]+=pTuple->weight;				}				for(pos=N.begin();pos!=N.end();pos++)				{					pTuple=*pos;					nN+=1.0;					for(i=0;i<nAttr;i++)						aggr.m_Data[i][pTuple->attr[i]][0]+=1.0;	//negative example always has weight 1.0				}			}			else			{				for(pos=P_removed.begin();pos!=P_removed.end();pos++)				{					pTuple=*pos;					nP-=pTuple->weight;					for(i=0;i<nAttr;i++)						aggr.m_Data[i][pTuple->attr[i]][1]-=pTuple->weight;				}				for(pos=N_removed.begin();pos!=N_removed.end(); pos++)				{					pTuple=*pos;					nN-=1.0;					for(i=0;i<nAttr;i++)						aggr.m_Data[i][pTuple->attr[i]][0]-=1.0;				}			}		}//end of searching for a rule				if(R.GetSize()>0) temp_ruleset.push_back(R);//		printf("--");//		R.WriteTo(stdout);		if(m_RuleStack.empty())		{			if(temp_ruleset.empty()) break;			for(RULESET::iterator pos1=temp_ruleset.begin();pos1!=temp_ruleset.end();pos1++)			{				R=*pos1;				Rules.push_back(R);				//decrease the weights of all postive tuples satisfying R. Remove it if its weight<0.1.				TUPLE* pTuple;				pos=AP.begin();				while(pos!=AP.end())				{					bool removed=false;					pTuple=*pos;					if(R.Satisfy(*pTuple))					{						double weight_lost;						double old_weight=pTuple->weight;						pTuple->weight*=0.66667;						weight_lost=old_weight-pTuple->weight;												if(pTuple->weight<0)						{							removed=true;							old_pos=pos;							pos++;							AP.erase(old_pos);							weight_lost=old_weight;						}												//update total_weight and total_aggr						total_weight-=weight_lost;						for(i=0;i<nAttr;i++)							total_aggr.m_Data[i][pTuple->attr[i]][1]-=weight_lost;											}					if(!removed) ++pos;				}			}			temp_ruleset.clear();		}	}//end of searching for all rules}int DataSource::GenRulesFOIL(CString outputfilename)//return number of rules{/*	RULESET Rules,Rules2,Rules3;		for(int iClass=0;iClass<m_Descript.GetNumClasses();iClass++)	{		GenRulesFOIL(iClass,Rules);	}	for(POSITION pos=Rules.GetHeadPosition();pos!=NULL;Rules.GetNext(pos))	{		RULE rule=Rules.GetAt(pos);		double gain=EvaluateRule(rule,false);		rule.SetInfoGain(gain);		Rules3.AddRuleByGain(rule);	}	RULE newrule;	for(pos=Rules3.GetHeadPosition();pos!=NULL;Rules3.GetNext(pos))	{		RULE rule=Rules3.GetAt(pos);		double gain=EvaluateRule(rule,false);		rule.SetInfoGain(gain);		if(rule==newrule)		{			newrule.SetInfoGain(newrule.GetInfoGain()+rule.GetInfoGain());		}		else		{			if(newrule.GetSize()>0)				Rules2.AddRuleByGain(newrule);			newrule=rule;		}	}	if(newrule.GetSize()>0)		Rules2.AddRuleByGain(newrule);	Rules2.WriteTo(outputfilename);	return Rules2.size();*/	RULESET Rules,Rules2,Rules3;		for(int iClass=0;iClass<m_Descript.GetNumClasses();iClass++)	{		GenRulesFOIL(iClass,Rules);	}	for(RULESET::iterator pos=Rules.begin();pos!=Rules.end();pos++)	{		RULE rule=*pos;		double gain=EvaluateRule(rule);		rule.SetInfoGain(gain);		Rules3.AddRule(rule);	}	RULE newrule;	for(RULESET::iterator pos=Rules3.begin();pos!=Rules3.end();pos++)	{		RULE rule=*pos;		Rules2.AddRuleByGain(rule);	}	Rules2.WriteTo(outputfilename);	return Rules2.size();}int DataSource::Classify(const TUPLE& tuple,int &iRule) const{/*//	Classify with best single rule	double max_conf=-1.0;	int res_class=0;	for(int i=0;i<m_nRules;i++)	{		if(m_Rules[i].Satisfy(tuple))		{			if(m_Rules[i].GetInfoGain()>max_conf)			{				max_conf=m_Rules[i].GetInfoGain();				res_class=m_Rules[i].GetClass();			}		}	}	return res_class;*///	Vote among rules//	double m=m_nRules/(double)(m_Descript.GetNumClasses());//	double a=1-exp(-4.0/m);//	int nChosen=(int)(2*m*a+0.5);	int i;	double max_conf=-1.0;	int res_class=0;	double* conf=new double[m_Descript.GetNumClasses()];	double* high_conf=new double[m_Descript.GetNumClasses()];	int* i_rule=new int[m_Descript.GetNumClasses()];	int* count=new int[m_Descript.GetNumClasses()];	for(i=0;i<m_Descript.GetNumClasses();i++)	{		conf[i]=0;		high_conf[i]=0;		i_rule[i]=-1;		count[i]=0;	}	for(i=0;i<m_nRules;i++)	{		if(m_Rules[i].Satisfy(tuple))		{			int cls=m_Rules[i].GetClass();			double gain=m_Rules[i].GetInfoGain();			double c=conf[cls];			if(count[cls]<5)			{//				conf[cls]=1-(1-c)*(1-m_Rules[i].GetInfoGain());				conf[cls]+=m_Rules[i].GetInfoGain();				count[cls]++;			}			if(gain>high_conf[cls])			{				high_conf[cls]=gain;				i_rule[cls]=i;			}		}	}	for(i=0;i<m_Descript.GetNumClasses();i++)	{		if(conf[i]>max_conf)		{			res_class=i;			max_conf=conf[i];		}	}	iRule=i_rule[res_class];	delete conf;	delete high_conf;	delete i_rule;	delete count;	return res_class;}double DataSource::Classify(CString outputfilename) const{	int nClasses=m_Descript.GetNumClasses();	int nCorrect=0;	int* nTuples=new int[nClasses];	int* nTruePositive=new int[nClasses];;	int* nFalsePositive=new int[nClasses];	memset(nTuples,0,sizeof(int)*nClasses);	memset(nTruePositive,0,sizeof(int)*nClasses);	memset(nFalsePositive,0,sizeof(int)*nClasses);	FILE* out=fopen(outputfilename,"w");	fprintf(out,"TID  TRUE_LABEL  PRED_LABEL  RULE_IDX  RULE_SCORE\n");	for(int iTuple=0;iTuple<m_nTuples;iTuple++)	{		int label_class=m_Tuples[iTuple].cls;		nTuples[label_class]++;				int iRule=0;		int pred_class=Classify(m_Tuples[iTuple],iRule);		if(pred_class==label_class)		{			nCorrect++;			nTruePositive[label_class]++;		}		else		{			nFalsePositive[pred_class]++;		}		fprintf(out,"%d\t%d\t%d\t%d\t%.3f\n",iTuple,label_class,pred_class,iRule,m_Rules[iRule].GetInfoGain());	}	fprintf(out,"\nAccuracy: %.2f%%\n",100.0*nCorrect/m_nTuples);	printf("Accuracy: %.2f%%\n",100.0*nCorrect/m_nTuples);	for(int iClass=0;iClass<nClasses;iClass++)	{		fprintf(out,"CLASS %d: #Tuples %d precision %.2f%%, recall %.2f%%\n",iClass,nTuples[iClass],100.0*nTruePositive[iClass]/(nTruePositive[iClass]+nFalsePositive[iClass]),100.0*nTruePositive[iClass]/nTuples[iClass]);	}	fclose(out);	delete nTuples;	delete nTruePositive;	delete nFalsePositive;	return nCorrect/(double)m_nTuples;}int compare_rule(const void* elem1,const void* elem2){	RULE** pRule1=(RULE**)elem1;	RULE** pRule2=(RULE**)elem2;	if((*pRule1)->GetInfoGain()>(*pRule2)->GetInfoGain())		return -1;	else return 1;}void DataSource::EvaluateRules(CString outputfile){	FILE* fout=fopen(outputfile,"w");	int sup;	double conf,lift;	int nClasses=m_Descript.GetNumClasses();	for(int iClass=0;iClass<nClasses;iClass++)	{		fprintf(fout,"\nRules for class %d\n",iClass);		for(int iRule=0;iRule<m_nRules;iRule++)		{			if(m_Rules[iRule].GetClass()==iClass)			{				EvaluateRule(m_Rules[iRule],sup,conf,lift);				fprintf(fout,"sup=%d,conf=%.3f,lift=%.3f ",sup,conf,lift);				m_Rules[iRule].WriteTo(fout);			}		}	}	fclose(fout);}void DataSource::IntepretRules(CString rule_semantics_file,CString outputfilename) const{	vector<CString>* Semantics=new vector<CString>[m_Descript.GetNumAttr()];	vector<CString>  Class_labels;	int temp;	FILE* fin=fopen(rule_semantics_file,"r");	fscanf(fin,"%d attributes\n",&temp);	assert(temp==m_Descript.GetNumAttr());		char buf[200];	for(int i=0;i<m_Descript.GetNumAttr();i++)	{		int iAttr,nValue;		fscanf(fin,"\nattribute%d %d %s\n",&iAttr,&nValue,buf);		assert(iAttr==i+1&&nValue==m_Descript.GetNumValues(i));		CString attr_name=buf;		Semantics[i].push_back(attr_name);		for(int j=0;j<nValue;j++)		{			fgets(buf,200,fin);			CString value_name=buf;			int len=value_name.GetLength();			if(value_name.GetAt(len-1)=='\n')				value_name=value_name.Left(len-1);			Semantics[i].push_back(value_name);		}	}	int nClasses;	fscanf(fin,"\nclass_label %d\n",&nClasses);	assert(nClasses==m_Descript.num_classes);	for(int i=0;i<nClasses;i++)	{		fgets(buf,200,fin);		CString class_label=buf;		int len=class_label.GetLength();		if(class_label.GetAt(len-1)=='\n')			class_label=class_label.Left(len-1);		Class_labels.push_back(class_label);	}	fclose(fin);	RULE** pRules=new RULE*[m_nRules];	for(int i=0;i<m_nRules;i++)		pRules[i]=&(m_Rules[i]);	qsort(pRules,m_nRules,sizeof(RULE*),compare_rule);	FILE* fout=fopen(outputfilename,"w");	for(int i=0;i<m_nRules;i++)	{		RULE rule=*pRules[i];		int pos = rule.GetClass();		fprintf(fout,"%s (%.3f) :- ", (const char*) Class_labels[pos], rule.GetInfoGain());		for(int j=0;j<rule.GetSize();j++)		{			LITERAL lit=rule.GetLiteral(j);			CString attr_name=Semantics[lit.GetVar()][0];			int pos=lit.GetValue()+1;			CString value_name=Semantics[lit.GetVar()][pos];			fprintf(fout,"%s:%s, ", (const char*) attr_name, (const char*) value_name);		}		fprintf(fout,"\n");	}	fclose(fout);	delete[] Semantics;	delete pRules;}void DataSource::GenAccrCurve(CString datafile,CString rulefile,CString outputfile){	ReadData(datafile);	ReadRules(rulefile);	int nAllRule=m_nRules;	int nClass=m_Descript.GetNumClasses();	int step=nAllRule/nClass/50;	if (step<1) step = 1;	int maxNumRule=0;	for(int iClass=0;iClass<nClass;iClass++)	{		int count=0;		for(int iRule=0;iRule<nAllRule;iRule++)		{			if(m_Rules[iRule].GetClass()==iClass) count++;		}		if(count>maxNumRule) maxNumRule=count;	}	DataSource dsrc;	dsrc.ReadData(datafile);		FILE* fout=fopen(outputfile,"w");	for(int nRule=1;nRule<maxNumRule;nRule+=step)	{		FILE* frule=fopen("rule.txt","w");		for(int iClass=0;iClass<nClass;iClass++)		{			int count=0;			for(int iRule=0;iRule<nAllRule;iRule++)			{				if(m_Rules[iRule].GetClass()==iClass)				{					m_Rules[iRule].WriteTo(frule);					count++;					if(count==nRule) break;				}			}		}		fclose(frule);		dsrc.ReadRules("rule.txt");		double accr=dsrc.Classify("temp.txt");		fprintf(fout,"%d rules: %.4f\n",nRule,accr);	}	fclose(fout);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -