⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 shingle-clustering.cpp

📁 聚类分析程序 k-means 编译环境 gcc/stl
💻 CPP
字号:
/*    Text Clustering  Copyright (C) 2004 Debora "Barbara" Donato, Antonio Gulli  Modified by Debojyoti Dutta, 2005     This library is free software; you can redistribute it and/or modify it   under the terms of the GNU Lesser General Public License as published by   the Free Software Foundation; either version 2.1 of the License, or   (at your option) any later version.  This library is distributed in the hope that it will be useful, but   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public   License for more details.  You should have received a copy of the GNU Lesser General Public License   along with this library; if not, write to the Free Software Foundation, Inc.,   59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#include "shingle-clustering.h"/********************************* SHINGLES *******************************/  /*   * Below we derive from virtual function for this specific algo   *   */  /*   * initDataSet(void)   *   *   Called for initing the dataset (by the reader)   *   */void SHINGLES::initDataSet(void) { docID = 0;}  /*   * loadInDataSet(string& str)    *   *     Called for any string we receive (by the reader)   *   *     @string, the string of document   *   *//* ====================================================================== */void SHINGLES::loadInDataSet(string& str){  string shingle;  shingleWindow[currentPointer] = str;                     // insert the string in the shingleWindow  currentPointer = ((currentPointer+1) % SHINGLE_TEXT_WINDOW);    // increment shingleWindow's pointer  numWords++;                                              // another word seen  numWordsThisDocument++;  if (numWordsThisDocument >= SHINGLE_TEXT_WINDOW){          // can i Create the shingle?    shingle = generateShingle(docID);    numShingles++;  }}/* =================================================================== */  /*   * finalizeLoadingDocument(void)   *   *     Called at the end of any document (by the reader)   *   *   */void SHINGLES::finalizeLoadingDocument(void){  // after creation of all shingles of this document, call min-wise linear  minWiseLinear(PRIME_NUMBER_P, NUMBER_OF_HASH_FUNCTIONS);      // all done with this document  docID++;                                  // increment docID  numWordsThisDocument = 0;                 // numWordsThisDocument reset  /*  cout << endl << "Doc id " << docID << "read. On to NEXT DOCUMENT"        << endl<< endl;  */}/* ====================================================================== */void SHINGLES::initialize(void){  /*  a = new unsigned int[NUMBER_OF_HASH_FUNCTIONS];  b = new unsigned int[NUMBER_OF_HASH_FUNCTIONS];  */  // first decide the k hash functions for random permutations  for(unsigned int i = 0 ; i < NUMBER_OF_HASH_FUNCTIONS; i++){                 unsigned int tmp = (unsigned int) ( (double)(PRIME_NUMBER_P-1) * rand() /			   ( RAND_MAX+1.0));      a.push_back(tmp);    tmp = (unsigned int) ( (double)(PRIME_NUMBER_P-1) * rand() /			   ( RAND_MAX+1.0));      b.push_back(tmp);  }}/* ====================================================================== *//*   * do_clustering(void)   *   *     Called to start clustering (by the reader)   *   *   */void SHINGLES::do_clustering(void){  /*  cout << "The store" << endl;            // get the sampled shingles  cout << shingleBitStore;                // print it  */  documentID doc1, doc2;                    // two documents sharing enough shingles  DocDocStore dds;                          // this count <doc, doc, number shingles in common>    shingleBitStore.sortStore();  // cout << "Sorted the store" << endl;                 // now show it sorted  // cout << shingleBitStore;  shingleBitStore.analyzeSameFingerprints(dds);   // and count consecutive documents  UnionFind uf(docID);                     // to maintain the clusters  for (DOC_DOC_STORE::iterator it = dds.begin(); it != dds.end(); it++){        if (it->second >= numShingleCommonTh) {        // more than this ShingleCommon Th            dds.getDocDoc(it->first, doc1, doc2);      // cout << "(" << doc1 << "," << doc2 << ") above threshold value:"  << it->first << endl;      if (uf.connect(doc1, doc2) == true){           	// use a union find to mantain the clusters	//cout << doc1 <<" and " << doc2 << " go in the same cluster" << endl;      }    }  }  uf.printCluster();  //  dds.DumpDOC_DOC();}/* ================================================================= *//* * This generate the shingles for the tmp store * */ string SHINGLES::generateShingle(unsigned long docID) {   string result;  unsigned int len;                          // len in bit of shingle  SHINGLE_BITMAP *tmp ;                      // a tmp bitmap    SHINGLE_BITMAP *output ;                   // a bitmap ptr      // move +1 in circular way      //unsigned int forwardPointer=((currentPointer + 1) %  SHINGLE_TEXT_WINDOW);   result.append(shingleWindow[currentPointer]);    if (SHINGLE_TEXT_WINDOW > 1) {    unsigned int forwardPointer = (currentPointer+1) % SHINGLE_TEXT_WINDOW;    while (forwardPointer != currentPointer){            result.append(shingleWindow[forwardPointer], 0,  		    MAX_CHAR_LENGHT_FOR_A_WORD );     // MUST TRUNCATE            forwardPointer = ((forwardPointer+1) %  SHINGLE_TEXT_WINDOW);    }  }   // generate the bitmap for this shingle  output = new SHINGLE_BITMAP ();   // this one goes in the bitStore    len = 0;  for (string::iterator it = result.begin(); it != result.end(); it++){        tmp = new SHINGLE_BITMAP ((int) (*it));#ifdef DEBUG_SHINGLES    cout  <<  *it <<  "=" << (int) (*it);    cout << ' ' << tmp->template to_string<char, char_traits<char>, allocator<char> >() << endl;#endif    *output <<= 8;                  // shift of a char    *output |= *tmp;                // append using or    delete tmp;    len += 8;  }  // Modified by Debojyoti Dutta (ddutta@usc.edu)  if (SHINGLE_BITMAP_SIZE > len)    *output <<= (SHINGLE_BITMAP_SIZE - len);    // this is the "magic" random permutation  else    *output <<= (len - SHINGLE_BITMAP_SIZE);    // ok, shingle bitmap has been generated. now, add it to the collection   // of shingle bitmaps for this document  output->setDocID(docID);        // remember the docID which generated it  shingleBitMapStoreTMP.push_back(output);    return result;   }/* ================================================================== */  /*   * modulus this function takes a shingle and a large prime number p    * and computes the modulus in a block manner   *     * @aShingle, the shingle   *    * @result is stored in shingleBitStore   */void SHINGLES::minWiseLinear (unsigned int p, unsigned int k){  unsigned int j, block_output, powerTwo, numBits;  unsigned int block_size = (unsigned int) ceil(log((double) p) /						log((double)2));  numericFingerPrint result, resultI,  minResult[k];   // initialize  for (unsigned int i = 0; i < k; i++)    minResult[k] = LONG_MAX;  // for each shingle, first permute and find the finger print  for (vector<SHINGLE_BITMAP *>::iterator it = shingleBitMapStoreTMP.begin();       it != shingleBitMapStoreTMP.end(); it++) {        SHINGLE_BITMAP * currBitMap = (*it);    /******FIRST STEP: compute P1 = X mod p ************************/    j = 1;    result = block_output = 0;    for(unsigned int i=0;  i<SHINGLE_BITMAP_SIZE; i++){      if (i % block_size == 0) {      	// sum partial results in modulo	result = ( (result + block_output) %  p);   	j = 1;                                      // reset the bit position	block_output = 0;                           // and the block output      }      // ?????      block_output += (currBitMap->test(i) ? 1 : 0) * j;      j *=2;    }    result = ( (result + block_output) %  p);        /***** SECOND STEP: generate 0 < a_i <= p-1, 0 < b_i <= p-1,  ****/    /*****             : this should be done for k round          ****/    /*****             a_i mod p = a_i, b_i mod p  = b_i          ****/    /***** (aX+b) mod p = ((a mod p)(X mod p) + b mod p) mod p 	                = (a (X mod p) + b) mod p                 ****/    /**                  this takes at most 2 log p bits            **/    /*****  h(X) = min (h_1(X), .... h_k(X)) 	    where h_i(X) = a_i X + b_i mod p        *****/    srand(time(NULL));    // generate k hash function    for(unsigned int numHash = 0 ; numHash < k; numHash++){                   /**************** computing P2 = a (P1) **********************/      resultI = result;      numBits = 0;      for (powerTwo = 1; powerTwo <= a[numHash] ; powerTwo *= 2){ numBits++; }      powerTwo /= 2;      resultI <<= numBits;                            // do the shift      // cout << "a=" << a << " b=" << b;      // cout << " numBits=" << numBits;          // cout << " remaining=" << a - powerTwo;      // cout << " resultI=" << resultI << endl;;      for (unsigned int howManyToSum = 0; howManyToSum < a[numHash] - powerTwo;	   howManyToSum++){	resultI += result;                            // sum the remaining         }       /************ computing P3 = (P2 + b) mod p) ******************/       resultI = (resultI + b[numHash]) % p;      // cout << " (aX+b) mod p resultI=" << resultI << endl;;      /********* now preserve the minimum for each hash  *****************/      if (resultI < minResult[numHash]){      minResult[numHash] = resultI;      }    }  }  // output is stored in minResult  vector<SHINGLE_BITMAP *>::iterator it = shingleBitMapStoreTMP.begin();  unsigned long docID = (*it)->getDocID();  cout << "sketch (integer repre) of doc "<< docID <<":  " ;  for (unsigned int i = 0; i < k; i++) {    cout << minResult[i] << " ";    SHINGLE_FINGERPRINT* outFingerPrint=new SHINGLE_FINGERPRINT(minResult[i]);    outFingerPrint->setDocID(docID);    shingleBitStore.push_back(outFingerPrint);  }  cout << endl;  shingleBitMapStoreTMP.clear();  return;}/**************************** DocDocStore *************************/void DocDocStore::Tokenize(const string& str,			   vector<string>& tokens,			   const string& delimiters = " "){  // Skip delimiters at beginning.  string::size_type lastPos = str.find_first_not_of(delimiters, 0);  // Find first "non-delimiter".    string::size_type pos     = str.find_first_of(delimiters, lastPos);        while (string::npos != pos || string::npos != lastPos)      {        // Found a token, add it to the vector.        tokens.push_back(str.substr(lastPos, pos - lastPos));        // Skip delimiters.  Note the "not_of"        lastPos = str.find_first_not_of(delimiters, pos);        // Find next "non-delimiter"        pos = str.find_first_of(delimiters, lastPos);      }}/* ====================================================================== */void DocDocStore::getDocDoc(const string& docdocS, documentID& doc1, documentID& doc2){    vector<string> tokens;  Tokenize(docdocS, tokens);  std::istringstream is0(tokens[0]);  is0 >> doc1;  std::istringstream is1(tokens[1]);  is1 >> doc2;  ///cout << "T[0]=" << doc1 << " T[1]=" << doc2 << "\n";}/* ====================================================================== */void DocDocStore::IncrementDOC_DOC(documentID docID1, documentID docID2){  string docIDs1, docIDs2;  docIDs1 = LongToString(docID1);  docIDs2 = LongToString(docID2);  docIDs1.append(" ");  docIDs1.append(docIDs2);  // cout << "Inserting " << docIDs1;   docdoc.increment(docIDs1);                                // and save  //cout << "Done incrementing " << endl;  }/****************** ShingleBitSetStore *****************************/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -