📄 index.h

📁 k-means源码(K均值聚类算法源码)
💻 H
字号:
#ifndef _INDEX_H_#define _INDEX_H_#include <fstream>#include <sstream>#include <vector>#include <cstring>#include "DocumentVector.h"#include "Dictionary.h"#include "HTMLDocument.h"#include "stringtok.h"// TODO: Maybe use "<<" and ">>" operators instead of load and dumpclass Index {private:	Dictionary dictionary;	std::vector<DocumentVector> documents;	char* index_name;	char* dict_name;	char* docs_name;	friend class ConsineSimilarity;	class CosineSimilarity	{	private:		friend class Index;					Index& index;				CosineSimilarity(Index& idx) : index(idx) {}				// dot product calculated using tf-idf		double dot(const DocumentVector &dv1, const DocumentVector &dv2)		{			double dot = 0.0;			int max_index = dv1.get_max_word_index();			int min_index = dv1.get_min_word_index();			if(max_index > dv2.get_max_word_index())				max_index = dv2.get_max_word_index();			if(min_index < dv2.get_min_word_index())				min_index = dv2.get_min_word_index();						for(int i = min_index; i <= max_index; i++)			{				dot += dv1[i] * dv2[i];			}						// double inv_freq;			// for(int i = min_index; i <= max_index; i++)			// {			// 	inv_freq = 1 / index.term_frequency(i);			// 	dot += (dv1[i] * inv_freq) * (dv2[i] * inv_freq);			// }										// double tf1, tf2, idf;			// for(int i = min_index; i <= max_index; i++)			// {				// 	idf = log(index.documents.size() / index.docs_containing(i));			// 	tf1 = dv1[i] / dv1.get_num_words();			// 	tf2 = dv2[i] / dv2.get_num_words();			// 				// 	dot += (tf1 * idf) * (tf2 * idf);			// }			return dot;		}		double length(const DocumentVector& dv)		{			int length = 0;			for(int i = dv.get_min_word_index(); i <= dv.get_max_word_index(); i++)				length += dv[i] * dv[i];			return sqrt(length);		}			public:		double similarity(const DocumentVector& dv1, const DocumentVector& dv2)		{			return dot(dv1, dv2) / (length(dv1) * length(dv2));		}	};public:	Index(char* index_name_) : index_name(index_name_) 	{ 		dict_name = "index/index.dict";		docs_name = "index/index.docs";	} 	const std::vector<DocumentVector>& get_documents() 	{			return documents;	}		const Dictionary& get_dictionary() 	{		return dictionary;	}		int term_frequency(int index)	{		int frequency = 0;		DocumentVector tmp;		for(unsigned int i = 0; i < documents.size(); i++)		{			tmp = documents[i];			frequency += tmp[index];		}		return frequency;	}		int docs_containing(int index)	{		int num_docs = 0;		DocumentVector tmp;		for(unsigned int i = 0; i < documents.size(); i++)		{			tmp = documents[i];						if(tmp[index] != 0)				num_docs++;		}		return num_docs;	}			double similarity(DocumentVector& dv1, DocumentVector& dv2)	{		CosineSimilarity sim(*this);		return sim.similarity(dv1, dv2);	}		void add_document(HTMLDocument& doc) 	{		std::stringstream content;		content << doc;		std::vector<std::string> words;				stringtok (words, content.str());		int word_index = 0;		for (unsigned int i = 0; i < words.size(); ++i)			word_index = dictionary.add_word(words[i]);				DocumentVector dv(dictionary.size()+1, doc.get_filename());		for (unsigned int i = 0; i < words.size(); ++i)		{			word_index = dictionary[words[i]];			dv[word_index]++;		}		 	documents.insert(documents.end(), dv);	}	void load()	{		// load dictionary		std::ifstream fin(dict_name);		fin >> dictionary;		fin.close();		// load documents		int num_docs = 0;		DocumentVector tmp_dv;		fin.open(docs_name);		fin >> num_docs;		for(int i = 0; i < num_docs; i++) 		{			fin >> tmp_dv;			documents.insert(documents.end(), tmp_dv);		}		fin.close();	}	void dump() 	{		// dump dictionary		std::ofstream fout(dict_name);		fout << dictionary;		fout.close();		// dump documents		fout.open(docs_name);		fout << documents.size() << std::endl;		for(unsigned int i = 0; i < documents.size(); i++) 			fout << documents[i] << std::endl;			fout.close();	}};#endif /* _INDEX_H_ */
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -