📄 shingle-clustering.cpp
字号:
/* Text Clustering Copyright (C) 2004 Debora "Barbara" Donato, Antonio Gulli Modified by Debojyoti Dutta, 2005 This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#include "shingle-clustering.h"/********************************* SHINGLES *******************************/ /* * Below we derive from virtual function for this specific algo * */ /* * initDataSet(void) * * Called for initing the dataset (by the reader) * */void SHINGLES::initDataSet(void) { docID = 0;} /* * loadInDataSet(string& str) * * Called for any string we receive (by the reader) * * @string, the string of document * *//* ====================================================================== */void SHINGLES::loadInDataSet(string& str){ string shingle; shingleWindow[currentPointer] = str; // insert the string in the shingleWindow currentPointer = ((currentPointer+1) % SHINGLE_TEXT_WINDOW); // increment shingleWindow's pointer numWords++; // another word seen numWordsThisDocument++; if (numWordsThisDocument >= SHINGLE_TEXT_WINDOW){ // can i Create the shingle? shingle = generateShingle(docID); numShingles++; }}/* =================================================================== */ /* * finalizeLoadingDocument(void) * * Called at the end of any document (by the reader) * * */void SHINGLES::finalizeLoadingDocument(void){ // after creation of all shingles of this document, call min-wise linear minWiseLinear(PRIME_NUMBER_P, NUMBER_OF_HASH_FUNCTIONS); // all done with this document docID++; // increment docID numWordsThisDocument = 0; // numWordsThisDocument reset /* cout << endl << "Doc id " << docID << "read. On to NEXT DOCUMENT" << endl<< endl; */}/* ====================================================================== */void SHINGLES::initialize(void){ /* a = new unsigned int[NUMBER_OF_HASH_FUNCTIONS]; b = new unsigned int[NUMBER_OF_HASH_FUNCTIONS]; */ // first decide the k hash functions for random permutations for(unsigned int i = 0 ; i < NUMBER_OF_HASH_FUNCTIONS; i++){ unsigned int tmp = (unsigned int) ( (double)(PRIME_NUMBER_P-1) * rand() / ( RAND_MAX+1.0)); a.push_back(tmp); tmp = (unsigned int) ( (double)(PRIME_NUMBER_P-1) * rand() / ( RAND_MAX+1.0)); b.push_back(tmp); }}/* ====================================================================== *//* * do_clustering(void) * * Called to start clustering (by the reader) * * */void SHINGLES::do_clustering(void){ /* cout << "The store" << endl; // get the sampled shingles cout << shingleBitStore; // print it */ documentID doc1, doc2; // two documents sharing enough shingles DocDocStore dds; // this count <doc, doc, number shingles in common> shingleBitStore.sortStore(); // cout << "Sorted the store" << endl; // now show it sorted // cout << shingleBitStore; shingleBitStore.analyzeSameFingerprints(dds); // and count consecutive documents UnionFind uf(docID); // to maintain the clusters for (DOC_DOC_STORE::iterator it = dds.begin(); it != dds.end(); it++){ if (it->second >= numShingleCommonTh) { // more than this ShingleCommon Th dds.getDocDoc(it->first, doc1, doc2); // cout << "(" << doc1 << "," << doc2 << ") above threshold value:" << it->first << endl; if (uf.connect(doc1, doc2) == true){ // use a union find to mantain the clusters //cout << doc1 <<" and " << doc2 << " go in the same cluster" << endl; } } } uf.printCluster(); // dds.DumpDOC_DOC();}/* ================================================================= *//* * This generate the shingles for the tmp store * */ string SHINGLES::generateShingle(unsigned long docID) { string result; unsigned int len; // len in bit of shingle SHINGLE_BITMAP *tmp ; // a tmp bitmap SHINGLE_BITMAP *output ; // a bitmap ptr // move +1 in circular way //unsigned int forwardPointer=((currentPointer + 1) % SHINGLE_TEXT_WINDOW); result.append(shingleWindow[currentPointer]); if (SHINGLE_TEXT_WINDOW > 1) { unsigned int forwardPointer = (currentPointer+1) % SHINGLE_TEXT_WINDOW; while (forwardPointer != currentPointer){ result.append(shingleWindow[forwardPointer], 0, MAX_CHAR_LENGHT_FOR_A_WORD ); // MUST TRUNCATE forwardPointer = ((forwardPointer+1) % SHINGLE_TEXT_WINDOW); } } // generate the bitmap for this shingle output = new SHINGLE_BITMAP (); // this one goes in the bitStore len = 0; for (string::iterator it = result.begin(); it != result.end(); it++){ tmp = new SHINGLE_BITMAP ((int) (*it));#ifdef DEBUG_SHINGLES cout << *it << "=" << (int) (*it); cout << ' ' << tmp->template to_string<char, char_traits<char>, allocator<char> >() << endl;#endif *output <<= 8; // shift of a char *output |= *tmp; // append using or delete tmp; len += 8; } // Modified by Debojyoti Dutta (ddutta@usc.edu) if (SHINGLE_BITMAP_SIZE > len) *output <<= (SHINGLE_BITMAP_SIZE - len); // this is the "magic" random permutation else *output <<= (len - SHINGLE_BITMAP_SIZE); // ok, shingle bitmap has been generated. now, add it to the collection // of shingle bitmaps for this document output->setDocID(docID); // remember the docID which generated it shingleBitMapStoreTMP.push_back(output); return result; }/* ================================================================== */ /* * modulus this function takes a shingle and a large prime number p * and computes the modulus in a block manner * * @aShingle, the shingle * * @result is stored in shingleBitStore */void SHINGLES::minWiseLinear (unsigned int p, unsigned int k){ unsigned int j, block_output, powerTwo, numBits; unsigned int block_size = (unsigned int) ceil(log((double) p) / log((double)2)); numericFingerPrint result, resultI, minResult[k]; // initialize for (unsigned int i = 0; i < k; i++) minResult[k] = LONG_MAX; // for each shingle, first permute and find the finger print for (vector<SHINGLE_BITMAP *>::iterator it = shingleBitMapStoreTMP.begin(); it != shingleBitMapStoreTMP.end(); it++) { SHINGLE_BITMAP * currBitMap = (*it); /******FIRST STEP: compute P1 = X mod p ************************/ j = 1; result = block_output = 0; for(unsigned int i=0; i<SHINGLE_BITMAP_SIZE; i++){ if (i % block_size == 0) { // sum partial results in modulo result = ( (result + block_output) % p); j = 1; // reset the bit position block_output = 0; // and the block output } // ????? block_output += (currBitMap->test(i) ? 1 : 0) * j; j *=2; } result = ( (result + block_output) % p); /***** SECOND STEP: generate 0 < a_i <= p-1, 0 < b_i <= p-1, ****/ /***** : this should be done for k round ****/ /***** a_i mod p = a_i, b_i mod p = b_i ****/ /***** (aX+b) mod p = ((a mod p)(X mod p) + b mod p) mod p = (a (X mod p) + b) mod p ****/ /** this takes at most 2 log p bits **/ /***** h(X) = min (h_1(X), .... h_k(X)) where h_i(X) = a_i X + b_i mod p *****/ srand(time(NULL)); // generate k hash function for(unsigned int numHash = 0 ; numHash < k; numHash++){ /**************** computing P2 = a (P1) **********************/ resultI = result; numBits = 0; for (powerTwo = 1; powerTwo <= a[numHash] ; powerTwo *= 2){ numBits++; } powerTwo /= 2; resultI <<= numBits; // do the shift // cout << "a=" << a << " b=" << b; // cout << " numBits=" << numBits; // cout << " remaining=" << a - powerTwo; // cout << " resultI=" << resultI << endl;; for (unsigned int howManyToSum = 0; howManyToSum < a[numHash] - powerTwo; howManyToSum++){ resultI += result; // sum the remaining } /************ computing P3 = (P2 + b) mod p) ******************/ resultI = (resultI + b[numHash]) % p; // cout << " (aX+b) mod p resultI=" << resultI << endl;; /********* now preserve the minimum for each hash *****************/ if (resultI < minResult[numHash]){ minResult[numHash] = resultI; } } } // output is stored in minResult vector<SHINGLE_BITMAP *>::iterator it = shingleBitMapStoreTMP.begin(); unsigned long docID = (*it)->getDocID(); cout << "sketch (integer repre) of doc "<< docID <<": " ; for (unsigned int i = 0; i < k; i++) { cout << minResult[i] << " "; SHINGLE_FINGERPRINT* outFingerPrint=new SHINGLE_FINGERPRINT(minResult[i]); outFingerPrint->setDocID(docID); shingleBitStore.push_back(outFingerPrint); } cout << endl; shingleBitMapStoreTMP.clear(); return;}/**************************** DocDocStore *************************/void DocDocStore::Tokenize(const string& str, vector<string>& tokens, const string& delimiters = " "){ // Skip delimiters at beginning. string::size_type lastPos = str.find_first_not_of(delimiters, 0); // Find first "non-delimiter". string::size_type pos = str.find_first_of(delimiters, lastPos); while (string::npos != pos || string::npos != lastPos) { // Found a token, add it to the vector. tokens.push_back(str.substr(lastPos, pos - lastPos)); // Skip delimiters. Note the "not_of" lastPos = str.find_first_not_of(delimiters, pos); // Find next "non-delimiter" pos = str.find_first_of(delimiters, lastPos); }}/* ====================================================================== */void DocDocStore::getDocDoc(const string& docdocS, documentID& doc1, documentID& doc2){ vector<string> tokens; Tokenize(docdocS, tokens); std::istringstream is0(tokens[0]); is0 >> doc1; std::istringstream is1(tokens[1]); is1 >> doc2; ///cout << "T[0]=" << doc1 << " T[1]=" << doc2 << "\n";}/* ====================================================================== */void DocDocStore::IncrementDOC_DOC(documentID docID1, documentID docID2){ string docIDs1, docIDs2; docIDs1 = LongToString(docID1); docIDs2 = LongToString(docID2); docIDs1.append(" "); docIDs1.append(docIDs2); // cout << "Inserting " << docIDs1; docdoc.increment(docIDs1); // and save //cout << "Done incrementing " << endl; }/****************** ShingleBitSetStore *****************************/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -