📄 doclist.cpp

📁 knn和Naive Bayes算法实现,两个实现在一起,是数据挖掘和机器学习中的内容.
💻 CPP
字号:
#include "stdafx.h"

#pragma warning( disable : 4786 )

#include "assert.h"
#include "doclist.h"



// Allocate `size` bytes with malloc().
// If malloc() returns a null pointer, the failure is reported through perror()
// and the process is terminated with exit status 1, so callers never receive
// a null pointer from this function.
void *my_malloc(size_t size)
{
	void *const block = malloc(size);

	if (block == NULL)
	{
		perror("Out of memory!\n");
		exit(1);
	}

	return block;
}


// Compute the inner product of two sparse vectors.
//
// Each vector is an array of WORDITEM entries that is closed by a terminator
// entry whose wnum is 0. The merge below only advances forward, so matching
// entries are found only when both arrays are ordered by increasing wnum.
// Weights of entries that carry the same wnum are multiplied and summed.
//
// If either vector is empty (its first entry already has wnum == 0) the
// function returns 0.
//
// Neither vector is modified. The pointers must not be null.
float sprod_ss(WORDITEM *a, WORDITEM *b) 
{
	// The former `register` specifiers were dropped: the keyword is not a
	// valid storage class in C++17 and never affected the result.
	float sum = 0;
	const WORDITEM *ai = a;
	const WORDITEM *bj = b;

	while (ai->wnum && bj->wnum) 
	{
		if (ai->wnum > bj->wnum) 
		{
			// b is behind: skip the entry that has no partner in a
			bj++;
		}
		else if (ai->wnum < bj->wnum) 
		{
			// a is behind: skip the entry that has no partner in b
			ai++;
		}
		else 
		{
			// same feature number in both vectors
			sum += ai->weight * bj->weight;
			ai++;
			bj++;
		}
	}

	return sum;
}

// Scan a text file once and report its dimensions.
//
// Parameters:
//   file - path of the file to scan; it is opened in text mode for reading.
//   nol  - out: number of lines. A final line that is not closed by '\n' is
//          counted as well.
//   wol  - out: largest number of whitespace characters found on one line
//          (the closing '\n' is itself whitespace and is included).
//   ll   - out: length in characters of the longest line, including its '\n'.
//
// If the file cannot be opened, the error is reported with perror() and the
// process exits with status 1. An end-of-line is written to standard output
// before returning.
void nol_ll(char *file, long& nol, long& wol, long& ll) 
{
	FILE *fl = fopen(file, "r");
	if (fl == NULL)
	{
		perror(file);
		exit(1);
	}

	long current_length = 0;
	long current_wol = 0;
	int ic;

	ll = 0;
	nol = 0;
	wol = 0;

	while ((ic = getc(fl)) != EOF) 
	{
		current_length++;

		// getc() returns the character as an unsigned char value, which is
		// the only kind of value isspace() is defined for; narrowing it to a
		// plain char first could turn it negative.
		if (isspace(ic)) 
		{
			current_wol++;
		}

		if (ic == '\n') 
		{
			nol++;

			if (current_length > ll) 
			{
				ll = current_length;
			}

			if (current_wol > wol) 
			{
				wol = current_wol;
			}

			current_length = 0;
			current_wol = 0;
		}
	}

	// The file may end without a trailing newline. That last line is still
	// returned by line-oriented readers, so it has to be reflected in all
	// three results or buffers sized from them come out too small.
	if (current_length > 0)
	{
		nol++;

		if (current_length > ll) 
		{
			ll = current_length;
		}

		if (current_wol > wol) 
		{
			wol = current_wol;
		}
	}

	std::cout << std::endl;

	fclose(fl);
}

// Parse one text line into a sparse document vector.
//
// Line layout: [doc-id] [label] token token ... [# comment]
// where every token is one of
//   qid:<long>          stored in doc.queryid
//   cost:<double>       stored in doc.costfactor
//   <wnum>:<weight>     appended to doc.content; wnum must be > 0 and
//                       strictly larger than the previous feature's wnum
//
// Parameters:
//   iParseFlag - bit mask. PARSE_ID_FLAG: the line starts with a document id.
//                PARSE_LABEL_FLAG: a label token comes next.
//   line       - NUL-terminated text; MODIFIED in place (cut at the first '#').
//   doc        - receives queryid, costfactor, content, dim_content, DocId
//                and twonorm_sq. doc.content must already point to room for
//                max_words+1 entries (features plus the wnum==0 terminator).
//   label      - receives the label token when PARSE_LABEL_FLAG is set.
//   numwords   - out: number of stored features plus one (the terminator).
//   max_words  - at most this many features are stored; remaining tokens on
//                the line are not read.
//
// Returns 1 when at least one feature was stored. Returns 0 when no feature
// was stored, or when the requested id/label token is missing; in the latter
// case only queryid and costfactor have been written.
//
// A token that cannot be parsed, a feature number <= 0, or features that are
// not in increasing order are reported with printf()/perror() and end the
// process with exit(1).
//
// Without PARSE_ID_FLAG the DocId comes from a function-local static counter
// that starts at 1 and is shared by all calls.
int parse_document(int iParseFlag,char *line, DOC& doc, char *label, long& numwords,long max_words)
{	
	int ret = 1;
	long wpos = 0;
	unsigned long pos;
	long wnum;
	double weight;
	char chDocId[256];
	static long	lLineIndex = 1;
	char featurepair[MAX_DOC_FEATURE],junk[MAX_DOC_FEATURE];

	doc.queryid=0;
	doc.costfactor=1;

	// cut off comments
	for (pos = 0; line[pos]; pos++)
	{
		if (line[pos] == '#') 
		{
			line[pos] = 0;
			break;
		}
	}

	pos = 0;
	if (iParseFlag & PARSE_ID_FLAG)
	{
		// sscanf("%s") skips leading blanks on its own; do the same with pos
		// so the cursor ends up behind the id and not in front of it.
		while (isspace((unsigned char)line[pos])) pos++;

		// the field width keeps an overlong id from overrunning chDocId
		if (sscanf(line+pos,"%255s",chDocId) == EOF) 
			return 0;

		while (line[pos] && !isspace((unsigned char)line[pos])) pos++;
	}

	if (iParseFlag & PARSE_LABEL_FLAG)
	{
		while (isspace((unsigned char)line[pos])) pos++;

		// NOTE(review): the capacity of `label` is not visible here, so this
		// read cannot be bounded; callers must pass a buffer that can hold
		// the longest token of the line.
		if (sscanf(line+pos,"%s",label) == EOF)
			return 0;

		while (line[pos] && !isspace((unsigned char)line[pos])) pos++;
	}

	while (wpos < max_words) 
	{
		// locate the next whitespace-delimited token
		while (isspace((unsigned char)line[pos])) pos++;
		if (!line[pos])
			break;

		const unsigned long start = pos;
		while (line[pos] && !isspace((unsigned char)line[pos])) pos++;
		const unsigned long len = pos - start;

		// A token that does not fit the buffer cannot be a valid pair;
		// reject it instead of writing past the end of featurepair.
		if (len >= sizeof(featurepair))
		{
			printf("Token of %lu characters in LINE: %s\n",len,line);
			perror ("Cannot parse feature/value pair!\n"); 
			exit (1); 
		}

		for (unsigned long k = 0; k < len; k++)
			featurepair[k] = line[start+k];
		featurepair[len] = 0;

		// junk is as large as featurepair and only ever receives a part of
		// it, so the unbounded %s below cannot overflow.
		if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) 
		{
			//it is the query id 
			doc.queryid=(long)wnum;
		}
		else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) 
		{
			//it is the example-dependent cost factor
			doc.costfactor=(double)weight;
		}
		else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) 
		{
			//it is a regular feature
			// wnum 0 is reserved for the terminator entry
			if(wnum<=0) 
			{ 
				printf("Line: %s\n",line);
				perror ("Feature numbers must be larger or equal to 1!\n"); 
				exit (1); 
			}

			if((wpos>0) && ((doc.content[wpos-1]).wnum >= wnum)) 
			{ 
				printf("Line: %s\n",line);
				perror ("Features must be in increasing order!\n"); 
				exit (1); 
			}

			doc.content[wpos].wnum=wnum;
			doc.content[wpos].weight=(float)weight; 

			wpos++;
		}
		else 
		{
			printf("'%s' in LINE: %s\n",featurepair,line);
			perror ("Cannot parse feature/value pair!\n"); 
			exit (1); 
		}
	}

	// a document without any feature is reported as empty
	if (!wpos)
		ret = 0;

	// terminator entry, then the two size fields
	(doc.content[wpos]).wnum=0;
	numwords = wpos+1;
	doc.dim_content = wpos;

	if (iParseFlag & PARSE_ID_FLAG)
	{
		doc.DocId = atol(chDocId);
	}
	else
	{
		doc.DocId = lLineIndex++;
	}

	// inner product of the vector with itself
	doc.twonorm_sq=sprod_ss(doc.content,doc.content);

	return ret;
}

// Load every document of the sparse-vector file strFile into this list.
//
// The file is read twice: nol_ll() scans it to size the buffers, then each
// line is parsed with parse_document() (PARSE_ID_FLAG, so every line starts
// with its document id).
//
// For each non-empty document the method
//   - stores a copy in docs[] (features plus the wnum==0 terminator),
//   - records DocId -> index in mapDocId_Pos,
//   - adds every feature number to setWordId,
//   - appends an SDoc carrying the DocId to vSDoc,
//   - raises MaxWordsCount to the document's last (largest) feature number.
//
// Lines whose first character is '#' are skipped. Lines for which
// parse_document() returns 0 are counted as empty and dropped. Reading stops
// at end of file or at the first empty line.
void CDocList::ReadVector( string strFile )
{
	long lLinelen;
	long max_docs,max_words;

	cout << "Traversing the vector file to get parameter info..." ;
	nol_ll( (char *)strFile.c_str(), max_docs, max_words, lLinelen); // scan size of input file 
	cout << "done." << endl;

	// One spare slot, so that a final line which is returned by getline()
	// but not reflected in max_docs cannot overflow the array.
	docs = (DOC *)my_malloc(sizeof(DOC)*(max_docs+1));         // feature vectors 

	// room for the delimiter and the terminating NUL
	lLinelen += 2;

	char *line;
	DOC doc;
	long lIndex=0,wpos;
	char doc_label[MAX_LABEL_LEN];
	long empty_line = 0;

	cout << "Scanning examples...\n";

	line = (char *)my_malloc( sizeof(char)*lLinelen );

	ifstream ifile(strFile.c_str());

	// scratch vector that parse_document() fills for every line
	doc.content = (WORDITEM *)my_malloc(sizeof(WORDITEM)*(max_words+10));

	SDoc sDoc;

	MaxWordsCount = 0;

	for (;;)
	{
		// Start from an empty buffer so that a read which stores nothing
		// is seen as end of input.
		line[0] = '\0';
		ifile.getline(line,lLinelen);

		if (line[0] == '\0')
			break;

		// Line contains comments. The next line is fetched at the top of
		// the loop; previously `continue` re-tested the same line forever.
		if (line[0] == '#')
			continue;

		int iParseResult = parse_document(PARSE_ID_FLAG, line, doc, doc_label, wpos,max_words);

		//the document is empty, it is removed
		if (!iParseResult)
		{
			empty_line++;
			continue;
		}

		//if wpos=1,the document is empty
		assert(wpos>1);

		// content[wpos-2] is the last real feature, i.e. the largest wnum
		if((wpos>1) && ((doc.content[wpos-2]).wnum > MaxWordsCount)) 
			MaxWordsCount = (doc.content[wpos-2]).wnum;

		//dimcontent: number of valid terms, not including the item with wnum=0
		docs[lIndex].dim_content = wpos-1;
		docs[lIndex].queryid = doc.queryid;
		docs[lIndex].costfactor = doc.costfactor;
		docs[lIndex].content = (WORDITEM *)my_malloc(sizeof(WORDITEM)*(wpos));
		docs[lIndex].DocId = doc.DocId;
		docs[lIndex].twonorm_sq=doc.twonorm_sq;

		mapDocId_Pos[doc.DocId] = lIndex;

		for(long i=0;i<wpos;i++) 
		{ 
			docs[lIndex].content[i]=doc.content[i];

			// the last entry is the terminator, not a word id
			if (i!=wpos-1)
				setWordId.insert((docs[lIndex].content[i]).wnum);
		}

		sDoc.lDocId = doc.DocId;

		vSDoc.push_back(sDoc);

		lIndex++;  
		printf("%ld\r",lIndex);
	}

	cout << lIndex+empty_line << " examples read, " << empty_line << " are empty and removed, " << lIndex << " documents are left." << endl;

	ifile.close();

	free(line);
	free(doc.content);
}

// Parse a single text line (document id followed by feature tokens) into
// test_doc.
//
// Parameters:
//   sLine    - the line to parse. parse_document() cuts the text at the first
//              '#' by writing a NUL into the buffer, so the string's
//              characters may be changed by this call.
//   test_doc - receives the parsed document. Its content pointer is replaced
//              by a freshly allocated array; the previous value is not freed.
//
// Returns the result of parse_document(): 1 when at least one feature was
// stored, 0 otherwise.
//
// Nothing in this function releases test_doc.content, whatever the result.
// NOTE(review): callers presumably free() it when done - confirm.
int ReadDoc( string& sLine, DOC& test_doc )
{
	// parse_document() stores up to MAX_DOC_FEATURE features and then writes
	// a terminator entry behind them, hence one extra element.
	test_doc.content = (WORDITEM *)my_malloc(sizeof(WORDITEM)*(MAX_DOC_FEATURE+1));

	char test_doc_label[MAX_LABEL_LEN];
	long lFeatureTotal;

	return parse_document(PARSE_ID_FLAG, (char*)sLine.c_str(), test_doc, test_doc_label, lFeatureTotal,MAX_DOC_FEATURE);
}

// Read the category file strFile and attach a category to each document.
//
// Every line has the form "<doc id>\t<category id>". For each line the
// category id is added to setCat, and the category set of the matching entry
// in vSDoc (looked up through mapDocId_Pos) is replaced by a set holding
// just that category.
//
// Lines without a tab, and lines whose document id is not present in
// mapDocId_Pos, are skipped; their number is printed at the end. A line with
// an unknown id still contributes its category to setCat.
//
// NOTE(review): each line replaces the document's category set, so if a
// document id occurs on several lines only the last category is kept -
// confirm that this is intended.
void CDocList::ReadDocList( string strFile)
{	
	ifstream ifile((char*)strFile.c_str());	

	cout << "Reading category information...";

	long lSkipped = 0;
	string sLine;
	while (getline( ifile, sLine))	
	{
		const size_t pos = sLine.find("\t");

		// Without a separator both fields used to be parsed from the whole
		// line, which turned the document id into its own category.
		if (pos == string::npos)
		{
			lSkipped++;
			continue;
		}

		const string sDocId = sLine.substr( 0, pos );
		const string sCatId = sLine.substr( pos+1 );

		const int iDocId = atoi( sDocId.c_str() );
		const int iCat = atoi( sCatId.c_str() );

		// inserting an element that is already present is a no-op
		setCat.insert( iCat );

		// operator[] would create an entry with position 0 for an unknown
		// id and the category would land on the first document.
		if ( mapDocId_Pos.find( iDocId )==mapDocId_Pos.end() )
		{
			lSkipped++;
			continue;
		}

		const long lPos = mapDocId_Pos[ iDocId ];

		set<int> setDocCat;
		setDocCat.insert( iCat );
		vSDoc[ lPos ].setDocCat = setDocCat;
	}

	ifile.close();

	if (lSkipped)
		cout << lSkipped << " lines skipped (no tab separator or unknown document id)";

	cout << endl;
}
💿 文件大小 171 K
👤 上传用户 jecksonchen
📂 所属分类人工智能/神经网络
🏷️ 相关标签

#Native #Bayes #knn #算法
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -