⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 doclist.h

📁 knn和Naive Bayes算法实现,两个实现在一起,是数据挖掘和机器学习中的内容.
💻 H
字号:
#ifndef __DOCLIST_H_
#define __DOCLIST_H_

#pragma warning( disable : 4786 )

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>


using namespace std;


// Size limits used by the document reader.
// NOTE(review): the names suggest "max features per document" and "max label
// length"; the units and the code that consumes them are not visible in this
// header - confirm against the users of these macros.
//
// The product is parenthesized so the macro always expands to a single
// operand: without the parentheses `x / MAX_DOC_FEATURE` would expand to
// `x / 400 * 1024` and silently compute the wrong value.
#define MAX_DOC_FEATURE (400*1024)
#define MAX_LABEL_LEN   30

// Bit-flag constants. The ID and LABEL flags occupy separate bits and
// PARSE_BOTH_FLAG is their bitwise OR (0x3).
// NOTE(review): the names suggest they select which parts of an input record
// get parsed; the code that consumes them is not in this header.
enum {
	PARSE_NONE_FLAG  = 0x0,
	PARSE_ID_FLAG    = 0x1,
	PARSE_LABEL_FLAG = 0x2,
	PARSE_BOTH_FLAG  = PARSE_ID_FLAG | PARSE_LABEL_FLAG
};

// One (word number, weight) entry; DOC::content points at a sequence of these.
struct WORDITEM {
	long  wnum;     // word number
	float weight;   // word weight
};

// Per-document record stored in CDocList::vSDoc.
// NOTE(review): field meanings below are inferred from the names only -
// confirm against the code that fills them in.
struct SDoc {
	long          lDocId;       // document id
	std::set<int> setDocCat;    // presumably the categories assigned to the document
	std::set<int> setPredCat;   // presumably the categories predicted for the document
	double        dbPredValue;  // presumably the score attached to the prediction
	bool          bIsEmpty;     // presumably set when the document has no content
};

// One document: bookkeeping fields plus a pointer to its WORDITEM entries.
// The struct holds a raw pointer; who allocates and frees `content` is not
// visible in this header.
struct DOC {
	long      docnum;
	long      DocId;        // keep unchanged
	long      queryid;
	double    costfactor;
	double    twonorm_sq;   // squared Euclidean length

	WORDITEM *content;      // the content/value pairs
	int       dim_content;  // presumably the number of entries in `content` - TODO confirm
};

class CDocList
{
public:
	void ReadVector( string strFile);
	void ReadDocList(string strFile);
protected:
private:
	int MaxWordsCount;
	set<long> setWordId;
	set<int> setCat;
	//map<long,set<int> > mapDoc2SetLabel;
public:
	DOC *docs;
	map<long,long> mapDocId_Pos;//docid->index in docs array
	//vector< set<int> > vSDocCat;
	vector <SDoc> vSDoc;

};

// Free functions declared here; their definitions are not part of this header.
// NOTE(review): the descriptions below are inferred from names and signatures
// only - confirm against the definitions before relying on them.

// Presumably an allocation helper returning a block of `size` bytes; what it
// does on failure is not visible here.
void *my_malloc(size_t size);
// Presumably the dot product of two WORDITEM sequences; how the end of each
// sequence is marked is not visible here.
float sprod_ss(WORDITEM *a, WORDITEM *b);
// Presumably parses one text line into test_doc; the meaning of the int
// return value is not visible here.
int ReadDoc( string& sLine, DOC& test_doc );

// Splits sLine on sSep and stores the integer value of every piece in vId.
//
//   sLine : text to split; expected to begin with a non-separator character.
//   sSep  : separator string; may be longer than one character.
//   vId   : output. Cleared first, then receives one entry per piece. There is
//           always at least one entry, so an empty sLine yields a single 0.
//
// Every piece is converted with atoi, so a non-numeric or empty piece becomes
// 0 and values pass through int before being converted to T.
//
// Kept for backward compatibility: when sLine starts with sSep (or sSep is
// empty) no splitting takes place and the whole line is converted as a single
// piece.
template<typename T> void GetvIdFromStr( std::string& sLine, std::string sSep, std::vector<T>& vId )
{
	vId.clear();

	const std::string::size_type sepLen = sSep.size();
	std::string::size_type tokenStart = 0;
	std::string::size_type sepPos = sLine.find( sSep, tokenStart );

	// Positions stay in size_type and are compared against npos; storing them
	// in an int relied on npos narrowing to a negative value.
	while ( sepPos != std::string::npos && sepPos > 0 ) {
		const std::string token = sLine.substr( tokenStart, sepPos - tokenStart );
		vId.push_back( static_cast<T>( std::atoi( token.c_str() ) ) );

		// Step over the whole separator, not just one character, so that
		// multi-character separators do not leak into the next piece.
		tokenStart = sepPos + sepLen;
		sepPos = sLine.find( sSep, tokenStart );
	}

	// Whatever follows the last separator (or the whole line if none was found).
	const std::string lastToken = sLine.substr( tokenStart );
	vId.push_back( static_cast<T>( std::atoi( lastToken.c_str() ) ) );
}

// Loads an "id -> set of ids" mapping from a tab-separated text file.
//
//   mapIdMap : output; cleared first. For each accepted line the first field
//              is the key and every remaining tab-separated field is inserted
//              into that key's set.
//   strFile  : path of the file to read.
//
// Fields are converted with atoi (via GetvIdFromStr for the tail of the line).
// A line that contains no tab has no ids to attach: it is counted in the
// "without indexid" total and skipped. Progress and the final counts are
// written to cout. If the file cannot be opened the read loop never runs, the
// map stays empty and "0 lines processed" is reported.
template<typename T> void ReadIdSetMap( std::map<T,std::set<T > >& mapIdMap, std::string strFile)
{
	mapIdMap.clear();

	std::cout << "Reading Id Map..." << std::endl;

	std::ifstream ifile( strFile.c_str() );

	std::string sLine;
	int iIndexIdNull = 0;
	int iLineNum = 0;
	while ( std::getline( ifile, sLine ) ) {
		iLineNum++;

		// find() returns an unsigned position, so "not found" has to be tested
		// against npos; a `< 0` test can never be true.
		const std::string::size_type tabPos = sLine.find( '\t' );
		if ( tabPos == std::string::npos ) {
			iIndexIdNull++;
			continue;
		}

		const T key = static_cast<T>( std::atoi( sLine.substr( 0, tabPos ).c_str() ) );

		// Everything after the first tab is the list of ids for this key.
		std::string sLast = sLine.substr( tabPos + 1 );
		std::vector<T> vId;
		GetvIdFromStr( sLast, "\t", vId );

		mapIdMap[ key ].insert( vId.begin(), vId.end() );
	}

	// ifile is closed by its destructor.
	std::cout << iLineNum << " lines processed, " << iIndexIdNull << " without indexid. " << std::endl;
}

#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -