📄 k-mean-clusters.cpp
字号:
/* Text Clustering Copyright (C) 2004 Debora "Barbara" Donato, Antonio Gulli This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#include "k-mean-clusters.h"class pair_iterator { public: pair_iterator(DOCUMENT::iterator iter, DOCUMENT::iterator endIter) : it(iter), end(endIter){}; ~pair_iterator(){}; DOCUMENT::iterator getIterator() const { return it; } DOCUMENT::iterator endIterator() const { return end; } bool operator< (const pair_iterator &p) const{ // operator for heap cmp return p.it->getCoordinate() < this->it->getCoordinate(); } friend ostream& operator<<(ostream& os, const pair_iterator& p){ os << " tuple:" << *(p.it) << endl; return os; }private: DOCUMENT::iterator it; // the current vector iterator on list DOCUMENT::iterator end; // the end of current list};/** * centroid: compute the centroid for cluster *///#define DEBUG_CENTROID 1void kmCluster::do_centroid(){ DOC_DIST_MAP::iterator it; // an iterator on the cluster priority_queue<pair_iterator> Q; // a min-priority que int last_coordinate, new_coordinate; // last accessed coor float feature; // a single feature value tuple t; // a generic tuple DOCUMENT::iterator dit, eit; // a generic iterator centroid.clear(); // reset old centroid /***************************************************************************/ /* For each document in the index, a triple is inserted in the heap */ /***************************************************************************/ for (it = documents.begin(); it != documents.end(); it++){ // for all documents in the cluster#ifdef DEBUG_CENTROID cerr << "tuple: " << *(it->first->begin())<< endl;#endif Q.push(pair_iterator( it->first->begin(), // the head of vector pointed by it->first(the document) it->first->end() // the tail of vector pointed by it->first(the document) )); } /***************************************************************************/ /* Pick the first tuple of the first document in the heap. */ /***************************************************************************/ last_coordinate = Q.top().getIterator()->getCoordinate(); // extract min from priority Q feature = Q.top().getIterator()->getValue(); // the current feature#ifdef DEBUG_CENTROID cout << "last_coordinate:" << last_coordinate << " v:" << feature << endl;#endif dit = Q.top().getIterator(); // current position iterator eit = Q.top().endIterator(); // end iterator dit++; // go to the next#ifdef DEBUG_CENTROID cout << "next tuple " << *dit << endl;#endif Q.pop(); // pop from the list if (dit != eit){ // not reached the end of the list Q.push(pair_iterator( dit, // the next tuple iterator eit // the end of this list )); } /***************************************************************************/ /* For each other triple in the heap: */ /* if (same coordinate) accumulate feature values */ /* else add the current coordinate to the centroid */ /***************************************************************************/ while (!Q.empty()){ if ( (new_coordinate = Q.top().getIterator()->getCoordinate()) == last_coordinate){ // same old coordinate feature += Q.top().getIterator()->getValue(); // accumulate value#ifdef DEBUG_CENTROID cout << "c:" << Q.top().getIterator()->getCoordinate() << " v:"<< Q.top().getIterator()->getValue() <<endl; cout << "\tsame last_coordinate:" << last_coordinate << " accumulated feature:" << feature << endl;#endif } else { // a change in coordinate centroid.add(t.setTuple(last_coordinate, // create the new tuple; feature / (float) size())); // a new centroid's coordinate;#ifdef DEBUG_CENTROID cout << "\tadd to centroid " << t << endl; // cout << "Centroid is now" << centroid;#endif last_coordinate = new_coordinate; // extract min from priority Q feature = Q.top().getIterator()->getValue(); } dit = Q.top().getIterator(); // current position iterator eit = Q.top().endIterator(); // end iterator dit++; // go to the next#ifdef DEBUG_CENTROID cout << "next tuple " << *dit << endl;#endif Q.pop(); // pop from the list if (dit != eit){ // not reached the end of the list Q.push(pair_iterator( dit, // the next tuple iterator eit // the end of this list )); } } centroid.add(t.setTuple(last_coordinate, // create the new tuple; feature / (float) size())); // a new centroid's coordinate;}void KMEANS::do_clustering(void){ double threshold = 3; // FIXME just a test unsigned int num_starting_clusters = 3; // FIXME just a test unsigned int th_moved = 1; // FIXME just a test unsigned int nCluster; // number of current cluster unsigned int nMoved; // number of clusters moved unsigned int nIteration; // number of iteration reached double dist; // the distance bool isThereDocument; // the document is in the cluster d->printDictionary(); cout << "vector space size: " << vs.size() << endl; vector<kmCluster> clusters; // a collection of clusters kmCluster * a_cluster; // a cluster ptr document * dPTR; // a generic pointer vector<vect_coordinate> k = vs.select_k_random_vectors(num_starting_clusters); for (vector<vect_coordinate>::iterator it = k.begin(); it != k.end(); it++){ a_cluster = new kmCluster(); // create new cluster a_cluster->setCentroid(vs.at(*it)); // set the centroid clusters.push_back(*a_cluster); // insert in the clusters' set cout << "K:" << *it << endl; cout << "Number of clusters: " << clusters.size(); } bool convergence = false; while (!convergence) { nIteration = 0; // number of iteration done nMoved = 0; // no document movement for(vector<document>::iterator docIT=vs.begin(); docIT != vs.end(); docIT++){ // for each document cout << "document: " << *docIT; dPTR = &*docIT; // from iterator to ptr nCluster = 0; // current cluster is zero for (vector<kmCluster>::iterator cluster=clusters.begin(); // for each cluster; cluster != clusters.end(); cluster++){ if ((dist = docIT->distance(cluster->getCentroid())) < threshold){ if ((isThereDocument = cluster->addDocument(dPTR, dist)) == true){ // assign to this cluste cout << "\t Document was added to cluster" << nCluster << endl; nMoved++; } else { cout << "Document is already in cluster " << nCluster << endl; } } else { if ((isThereDocument = cluster->removeDocument(dPTR)) == true){ // go cout << "\t Document was removed from cluster" << nCluster << endl; nMoved++; } else { cout << "Document fits well in " << nCluster << endl; } } nCluster++; } // for each cluster } //for each document cout << "***> nMoved: " << nMoved << endl; if (nMoved < th_moved){ // a "little" movement convergence = true; // means convergence } else { // too movement nCluster = 0; // means we have to continue for (vector<kmCluster>::iterator cluster=clusters.begin(); cluster != clusters.end(); cluster++){ // foreach cluster cluster->do_centroid(); // re-update centroid cout << " updated centroid for cluster " << nCluster << cluster->getCentroid() << endl; nCluster++; } } nIteration++; // another iteration (count from 1) cout << "number Iteration " << nIteration << endl; } // convergence}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -