⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 k-mean-clusters.cpp

📁 聚类分析程序 k-means 编译环境 gcc/stl
💻 CPP
字号:
/*    Text Clustering  Copyright (C) 2004 Debora "Barbara" Donato, Antonio Gulli  This library is free software; you can redistribute it and/or modify it   under the terms of the GNU Lesser General Public License as published by   the Free Software Foundation; either version 2.1 of the License, or   (at your option) any later version.  This library is distributed in the hope that it will be useful, but   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public   License for more details.  You should have received a copy of the GNU Lesser General Public License   along with this library; if not, write to the Free Software Foundation, Inc.,   59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#include "k-mean-clusters.h"class pair_iterator { public:  pair_iterator(DOCUMENT::iterator iter, DOCUMENT::iterator endIter) :     it(iter), end(endIter){};  ~pair_iterator(){};  DOCUMENT::iterator getIterator() const { return it; }  DOCUMENT::iterator endIterator() const { return end; }  bool operator< (const pair_iterator &p) const{ // operator for heap cmp    return   p.it->getCoordinate() < this->it->getCoordinate();  }  friend ostream& operator<<(ostream& os, const pair_iterator& p){        os << " tuple:" << *(p.it)  << endl;    return os;  }private:  DOCUMENT::iterator it;       // the current vector iterator on list  DOCUMENT::iterator end;      // the end of current list};/** * centroid: compute the centroid for cluster *///#define DEBUG_CENTROID 1void kmCluster::do_centroid(){    DOC_DIST_MAP::iterator it;            // an iterator on the cluster  priority_queue<pair_iterator> Q;      // a min-priority que  int last_coordinate, new_coordinate;  // last accessed coor    float feature;                        // a single feature value  tuple t;                              // a generic tuple  DOCUMENT::iterator dit, eit;          // a generic iterator  centroid.clear();                     // reset old centroid  /***************************************************************************/  /*    For each document in the index, a triple is inserted in the heap     */  /***************************************************************************/      for (it = documents.begin(); it != documents.end(); it++){ // for all documents in the cluster#ifdef DEBUG_CENTROID          cerr << "tuple: " << *(it->first->begin())<< endl;#endif    Q.push(pair_iterator(  		  it->first->begin(),             // the head of vector pointed by it->first(the document)		  it->first->end()                // the tail of vector pointed by it->first(the document)  		  ));  }      /***************************************************************************/  /*    Pick the first tuple of the first document in the heap.              */  /***************************************************************************/  last_coordinate = Q.top().getIterator()->getCoordinate(); // extract min from priority Q  feature = Q.top().getIterator()->getValue();              // the current feature#ifdef DEBUG_CENTROID   cout << "last_coordinate:" << last_coordinate << " v:" << feature << endl;#endif   dit = Q.top().getIterator();                           // current position iterator   eit = Q.top().endIterator();                           // end iterator   dit++;                                                  // go to the next#ifdef DEBUG_CENTROID           cout << "next tuple " << *dit << endl;#endif     Q.pop();                                             // pop from the list           if (dit != eit){                                     // not reached the end of the list       Q.push(pair_iterator(			    dit,                          // the next tuple iterator			    eit                           // the end of this list		     ));     }     /***************************************************************************/   /*    For each other triple in the heap:                                   */   /*        if (same coordinate) accumulate feature values                   */   /*          else add the current coordinate to the centroid                */   /***************************************************************************/     while (!Q.empty()){                              if ( (new_coordinate = Q.top().getIterator()->getCoordinate()) 	    == last_coordinate){ // same old coordinate	 feature += Q.top().getIterator()->getValue();           // accumulate value#ifdef DEBUG_CENTROID	 cout << "c:" <<  Q.top().getIterator()->getCoordinate() << " v:"<< Q.top().getIterator()->getValue() <<endl;	 cout << "\tsame last_coordinate:" << last_coordinate << " accumulated feature:" << feature << endl;#endif       } else {                                               // a change in coordinate	 centroid.add(t.setTuple(last_coordinate,             // create the new tuple;				 feature / 				 (float) size()));            // a new centroid's coordinate;#ifdef DEBUG_CENTROID	 cout << "\tadd to centroid " << t << endl;	 //	 cout << "Centroid is now" << centroid;#endif	 last_coordinate = new_coordinate;                    // extract min from priority Q	 feature = Q.top().getIterator()->getValue();       }       dit = Q.top().getIterator();                           // current position iterator       eit = Q.top().endIterator();                           // end iterator       dit++;         // go to the next#ifdef DEBUG_CENTROID       cout << "next tuple " << *dit << endl;#endif       Q.pop();                                               // pop from the list        if (dit != eit){                                       // not reached the end of the list	 Q.push(pair_iterator(			      dit,                            // the next tuple iterator			      eit                             // the end of this list			      ));       }     }     centroid.add(t.setTuple(last_coordinate,             // create the new tuple;			     feature / 			     (float) size()));            // a new centroid's coordinate;}void KMEANS::do_clustering(void){  double threshold = 3;                      // FIXME just a test  unsigned int num_starting_clusters = 3;     // FIXME just a test  unsigned int th_moved = 1;                  // FIXME just a test    unsigned int nCluster;                      // number of current cluster  unsigned int nMoved;                        // number of clusters moved  unsigned int nIteration;                    // number of iteration reached  double dist;                                // the distance  bool isThereDocument;                       // the document is in the cluster  d->printDictionary();  cout << "vector space size: " << vs.size() << endl;  vector<kmCluster> clusters;                 // a collection of clusters  kmCluster * a_cluster;                      // a cluster ptr  document * dPTR;                            // a generic pointer  vector<vect_coordinate> k = vs.select_k_random_vectors(num_starting_clusters);  for (vector<vect_coordinate>::iterator it = k.begin(); it != k.end(); it++){    a_cluster = new kmCluster();              // create new cluster    a_cluster->setCentroid(vs.at(*it));       // set the centroid    clusters.push_back(*a_cluster);           // insert in the clusters' set    cout << "K:" << *it << endl;    cout << "Number of clusters: " << clusters.size();  }  bool convergence = false;  while (!convergence) {        nIteration = 0;                                                 // number of iteration done    nMoved = 0;                                                     // no document movement    for(vector<document>::iterator docIT=vs.begin(); docIT != vs.end(); docIT++){ // for each document            cout << "document: " << *docIT;          dPTR = &*docIT;                                               // from iterator to ptr      nCluster = 0;                                                 // current cluster is zero      for (vector<kmCluster>::iterator cluster=clusters.begin();    // for each cluster;	   cluster != clusters.end(); cluster++){ 	  	  if ((dist = docIT->distance(cluster->getCentroid())) < threshold){ 	    if ((isThereDocument = cluster->addDocument(dPTR, dist)) == true){  // assign to this cluste	      cout << "\t Document was added to cluster" << nCluster << endl;	      nMoved++;	    } 	    else { 	      cout << "Document is already in cluster " << nCluster << endl; 	    }	  } else {	    if ((isThereDocument = cluster->removeDocument(dPTR)) == true){  // go	      cout << "\t Document was removed from cluster" << nCluster << endl;            	      	      nMoved++;	    } else {	      cout << "Document fits well in " << nCluster << endl; 	    }	  } 	nCluster++;      } // for each cluster    } //for each document    cout << "***> nMoved: " << nMoved << endl;                                if (nMoved < th_moved){                                         // a "little" movement                        convergence = true;                                           // means convergence        } else {                                                        // too movement      nCluster = 0;                                                 // means we have to continue      for (vector<kmCluster>::iterator cluster=clusters.begin();   	   cluster != clusters.end(); cluster++){                   // foreach cluster	cluster->do_centroid();                                     //  re-update centroid      	cout << " updated centroid for cluster " << nCluster << cluster->getCentroid() << endl;	nCluster++;      }    }    nIteration++;                                                   // another iteration (count from 1)    cout << "number Iteration " << nIteration << endl;  } // convergence}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -