📄 k-mean-clusters.h
字号:
/* Text Clustering Copyright (C) 2004 Debora "Barbara" Donato, Antonio Gulli This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#ifndef KMEANCLUSTERS#define KMEANCLUSTERS 1#include "Clustering.h"#include "docvectors.h"#include "dictionary.h"using namespace std;#include <string>/* * class DocumentDistanceMap * Note: -- used for rapid document (address) update * -- mantain the distance from each document (to the centroid, or whatever) * */ typedef map<document *, double> DOC_DIST_MAP;class DocumentsDistancesMap{ private: DOC_DIST_MAP m; // contain the association document distance (for a cluster)public: typedef std::pair<document *, double> doc_distance_t; // needed to mantain a pair inline bool insert(document * dptr, double d){ //m[dptr] = d; // insert return a pair<iterator, bool> return (m.insert(make_pair(dptr,d))).second; // second is a bool, true if inserted / false o.w. } inline bool find(document * dptr, double& value){ DOC_DIST_MAP::iterator it = m.find(dptr); if (it != m.end()) { value=it->second; return true;} else { return false; } } inline bool find(document * dptr){ DOC_DIST_MAP::iterator it = m.find(dptr); if (it != m.end()) { return true;} else { return false; } } inline bool remove(document * dptr){ DOC_DIST_MAP::iterator it = m.find(dptr); if (it != m.end()) { m.erase(it); return true;} else { return false; } } inline int size() {return m.size(); } inline DOC_DIST_MAP::iterator begin(){ return m.begin();} inline DOC_DIST_MAP::iterator end(){ return m.end();}};/* * class kmClusters used to store the k clusters. * Note: -- a cluster is a centroid and an docdistance obj (a map) * -- clusters can overlap * */ class kmCluster { private: document centroid; // the cluster centroid DocumentsDistancesMap documents; // the indexes of documents in this clusters public: inline void setCentroid(const document& c) { centroid.clear(); centroid = c;} // set centroid inline document& getCentroid(void) { return centroid;} // get centroid inline bool addDocument(document* docPTR) { return documents.insert(docPTR, 0.0); } inline bool addDocument(document* docPTR, double dist) { return documents.insert(docPTR, dist); } inline bool removeDocument(document* docPTR) {return documents.remove(docPTR); } inline bool isThereDocument(document* docPTR) { return documents.find(docPTR); } inline double getDistanceFromCentroid(document * docPTR) { // distance from centroid double dist; documents.find(docPTR, dist); return dist; } inline unsigned int size(void) { return documents.size(); } // size of the cluster void do_centroid(void); // compute the centroid };class KMEANS : public Clustering{ private: dictionary * d; // the dictionary vectorSpace vs; // the vector space document a_doc; // an auxiliar document public: KMEANS() : d(new simpleDictionary) {}; ~KMEANS(); void initDataSet(void) {docID = 0;} /************************************************************ * * loadInDataSet: was abstract it load a string in dataset * ************************************************************/ void loadInDataSet(string& str){ dict_id id_word; if (! (id_word = d->get(str))){ // not in lexicon id_word = d->add(str); // add it } a_doc.addHash(id_word); } /********************************************************************* * * loadInDataSet: was abstract it concludes the loading of a document * ********************************************************************/ void finalizeLoadingDocument(void){ a_doc.HashToVector(); // vectorize the document vs.pushVector(a_doc); // keep vector space populated cout << a_doc; a_doc.clear(); } void do_clustering(void); // lets start to cluster // determines the k hash functions (for shingling, here it is a dummy) void initialize(void) {};};#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -