📄 docvectors.cpp
字号:
/* Text Clustering Copyright (C) 2004 Debora "Barbara" Donato, Antonio Gulli This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#include "docvectors.h" /* * class used for for heap opearation */class triple { public: triple(DOCUMENT::iterator iter, DOCUMENT::iterator endIter, tuple * tp) : it(iter), end(endIter), tuplePTR(tp){}; ~triple(){}; inline tuple * getTuple() const { return tuplePTR; } DOCUMENT::iterator getIterator() const { return it; } DOCUMENT::iterator endIterator() const { return end; } bool operator< (const triple &a) const{ // operator for heap cmp return a.tuplePTR->getCoordinate() < this->tuplePTR->getCoordinate(); } friend ostream& operator<<(ostream& os, const triple& t){ os << " tuple:" << *(t.tuplePTR) << endl; return os; }private: DOCUMENT::iterator it; // the current vector iterator on list DOCUMENT::iterator end; // the end of current list tuple * tuplePTR; // a pointer to the associated tuple}; /** * distance: compute the distance between two documents * @param document_y, document to which we are computing distance * @return The distance */double document::distance(document &document_y){ DOCUMENT::iterator itFirstDoc = begin(); DOCUMENT::iterator itSecondDoc = document_y.begin(); float diff = 0.0; while (itFirstDoc != end() && itSecondDoc != document_y.end()){ #ifdef DEBUG_VECTOR cout << "tuple(doc_x):" << *itFirstDoc << endl; cout << "tuple(doc_y):" << *itSecondDoc << endl;#endif if (itFirstDoc->getCoordinate() < itSecondDoc->getCoordinate()){ diff += (itFirstDoc->getValue() * itFirstDoc->getValue()); itFirstDoc++; } else if (itFirstDoc->getCoordinate() > itSecondDoc->getCoordinate()){ diff += (itSecondDoc->getValue() * itSecondDoc->getValue()); itSecondDoc++; } else { diff += (itFirstDoc->getValue() - itSecondDoc->getValue()) * (itFirstDoc->getValue() - itSecondDoc->getValue()); itFirstDoc++; itSecondDoc++; } } while (itSecondDoc != document_y.end()){ diff += (itSecondDoc->getValue() * itSecondDoc->getValue()); itSecondDoc++; } while(itFirstDoc != end()){ diff += (itFirstDoc->getValue() * itFirstDoc->getValue()); itFirstDoc++; } diff = sqrt(diff); return diff;} /** * centroid: compute the distance between two documents * @param vector<int> indexes, the indexes of vectors * @param DOCUMENT The vector of centroid return */void vectorSpace::centroid(vector<int> indexes, document ¢roid){ vector<int>::iterator it; // an iterator on indexes priority_queue<triple> Q; // a min-priority que int last_coordinate, new_coordinate; // last accessed coor float feature; // a single feature value tuple t; // a generic tuple DOCUMENT::iterator dit, eit; // a generic iterator /***************************************************************************/ /* For each document in the index, a triple is inserted in the heap */ /***************************************************************************/ for (it = indexes.begin(); it != indexes.end(); it++){ // for all documents#ifdef DEBUG_VECTOR cout << "tuple: " << *(vs.at(*it).begin()->getTuple()) << endl;#endif Q.push(triple( vs.at(*it).begin(), // the head of vector pointed by *it vs.at(*it).end(), // the tail of vector pointed by *it vs.at(*it).begin()->getTuple() // the pointer to the tuple for this vector )); } /***************************************************************************/ /* Pick the first tuple of the first document in the heap. */ /***************************************************************************/ last_coordinate = Q.top().getTuple()->getCoordinate(); // extract min from priority Q feature = Q.top().getTuple()->getValue(); // the current feature#ifdef DEBUG_VECTOR cout << "last_coordinate:" << last_coordinate << " v:" << feature << endl;#endif dit = Q.top().getIterator(); // current position iterator eit = Q.top().endIterator(); // end iterator dit++; // go to the next#ifdef DEBUG_VECTOR cout << "next tuple " << *dit << endl;#endif Q.pop(); // pop from the list if (dit != eit){ // not reached the end of the list Q.push(triple( dit, // the next tuple iterator eit, // the end of this list dit->getTuple() // the pointer to the tuple )); } /***************************************************************************/ /* For each other triple in the heap: */ /* if (same coordinate) accumulate feature values */ /* else add the current coordinate to the centroid */ /***************************************************************************/ while (!Q.empty()){ if ( (new_coordinate = Q.top().getTuple()->getCoordinate()) == last_coordinate){ // same old coordinate feature += Q.top().getTuple()->getValue(); // accumulate value#ifdef DEBUG_VECTOR cout << "c:" << Q.top().getTuple()->getCoordinate() << " v:"<< Q.top().getTuple()->getValue() <<endl; cout << "\tsame last_coordinate:" << last_coordinate << " accumulated feature:" << feature << endl;#endif } else { // a change in coordinate centroid.add(t.setTuple(last_coordinate, // create the new tuple; feature / (float) indexes.size())); // a new centroid's coordinate;#ifdef DEBUG_VECTOR cout << "\tadd to centroid " << t << endl;#endif last_coordinate = new_coordinate; // extract min from priority Q feature = Q.top().getTuple()->getValue(); } dit = Q.top().getIterator(); // current position iterator eit = Q.top().endIterator(); // end iterator dit++; // go to the next#ifdef DEBUG_VECTOR cout << "next tuple " << *dit << endl;#endif Q.pop(); // pop from the list if (dit != eit){ // not reached the end of the list Q.push(triple( dit, // the next tuple iterator eit, // the end of this list dit->getTuple() // the pointer to the tuple )); } } centroid.add(t.setTuple(last_coordinate, // create the new tuple; feature / (float) indexes.size())); // a new centroid's coordinate;} /** * select_k_random_vector: * @param unsigned int k, number of vectors * @return The array of k indexes */vector<vect_coordinate> vectorSpace::select_k_random_vectors(unsigned int k){ vector<vect_coordinate> v; vect_coordinate n = vs.size(); vect_coordinate i, r; map <vect_coordinate, bool> generated; map <vect_coordinate, bool>::iterator it; bool ok; double randNum; if (k > n){ // vector space is too small. for (i = 0; i < n; i++){ v.push_back(i); } return v; } srand(time(NULL)); for (i = n - k +1; i <= n ; i++){ ok = false; do { randNum = rand() / (RAND_MAX + 1.0); r = (int) (n * randNum); // generate a random number in [0,n) //cout << "for vs.size()=" << n << " and k=" << k << " generated r=" << r << " rand=" << randNum; it = generated.find(r); // already generated ? if (it == generated.end()){ // not! generated.insert(make_pair(r, true)); // now yes v.push_back(r); // save this coordinate //cout << " saved"; ok = true; } cout << endl; } while (!ok); // try another } return v;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -