📄 index.h
字号:
#ifndef _INDEX_H_#define _INDEX_H_#include <fstream>#include <sstream>#include <vector>#include <cstring>#include "DocumentVector.h"#include "Dictionary.h"#include "HTMLDocument.h"#include "stringtok.h"// TODO: Maybe use "<<" and ">>" operators instead of load and dumpclass Index {private: Dictionary dictionary; std::vector<DocumentVector> documents; char* index_name; char* dict_name; char* docs_name; friend class ConsineSimilarity; class CosineSimilarity { private: friend class Index; Index& index; CosineSimilarity(Index& idx) : index(idx) {} // dot product calculated using tf-idf double dot(const DocumentVector &dv1, const DocumentVector &dv2) { double dot = 0.0; int max_index = dv1.get_max_word_index(); int min_index = dv1.get_min_word_index(); if(max_index > dv2.get_max_word_index()) max_index = dv2.get_max_word_index(); if(min_index < dv2.get_min_word_index()) min_index = dv2.get_min_word_index(); for(int i = min_index; i <= max_index; i++) { dot += dv1[i] * dv2[i]; } // double inv_freq; // for(int i = min_index; i <= max_index; i++) // { // inv_freq = 1 / index.term_frequency(i); // dot += (dv1[i] * inv_freq) * (dv2[i] * inv_freq); // } // double tf1, tf2, idf; // for(int i = min_index; i <= max_index; i++) // { // idf = log(index.documents.size() / index.docs_containing(i)); // tf1 = dv1[i] / dv1.get_num_words(); // tf2 = dv2[i] / dv2.get_num_words(); // // dot += (tf1 * idf) * (tf2 * idf); // } return dot; } double length(const DocumentVector& dv) { int length = 0; for(int i = dv.get_min_word_index(); i <= dv.get_max_word_index(); i++) length += dv[i] * dv[i]; return sqrt(length); } public: double similarity(const DocumentVector& dv1, const DocumentVector& dv2) { return dot(dv1, dv2) / (length(dv1) * length(dv2)); } };public: Index(char* index_name_) : index_name(index_name_) { dict_name = "index/index.dict"; docs_name = "index/index.docs"; } const std::vector<DocumentVector>& get_documents() { return documents; } const Dictionary& get_dictionary() { return dictionary; } int term_frequency(int index) { int frequency = 0; DocumentVector tmp; for(unsigned int i = 0; i < documents.size(); i++) { tmp = documents[i]; frequency += tmp[index]; } return frequency; } int docs_containing(int index) { int num_docs = 0; DocumentVector tmp; for(unsigned int i = 0; i < documents.size(); i++) { tmp = documents[i]; if(tmp[index] != 0) num_docs++; } return num_docs; } double similarity(DocumentVector& dv1, DocumentVector& dv2) { CosineSimilarity sim(*this); return sim.similarity(dv1, dv2); } void add_document(HTMLDocument& doc) { std::stringstream content; content << doc; std::vector<std::string> words; stringtok (words, content.str()); int word_index = 0; for (unsigned int i = 0; i < words.size(); ++i) word_index = dictionary.add_word(words[i]); DocumentVector dv(dictionary.size()+1, doc.get_filename()); for (unsigned int i = 0; i < words.size(); ++i) { word_index = dictionary[words[i]]; dv[word_index]++; } documents.insert(documents.end(), dv); } void load() { // load dictionary std::ifstream fin(dict_name); fin >> dictionary; fin.close(); // load documents int num_docs = 0; DocumentVector tmp_dv; fin.open(docs_name); fin >> num_docs; for(int i = 0; i < num_docs; i++) { fin >> tmp_dv; documents.insert(documents.end(), tmp_dv); } fin.close(); } void dump() { // dump dictionary std::ofstream fout(dict_name); fout << dictionary; fout.close(); // dump documents fout.open(docs_name); fout << documents.size() << std::endl; for(unsigned int i = 0; i < documents.size(); i++) fout << documents[i] << std::endl; fout.close(); }};#endif /* _INDEX_H_ */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -