⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 shingle-clustering.h

📁 聚类分析程序 k-means 编译环境 gcc/stl
💻 H
字号:
/*    Text Clustering  Copyright (C) 2004 Debora "Barbara" Donato, Antonio Gulli  This library is free software; you can redistribute it and/or modify it   under the terms of the GNU Lesser General Public License as published by   the Free Software Foundation; either version 2.1 of the License, or   (at your option) any later version.  This library is distributed in the hope that it will be useful, but   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY   or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public   License for more details.  You should have received a copy of the GNU Lesser General Public License   along with this library; if not, write to the Free Software Foundation, Inc.,   59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#ifndef SHINGLECLUSTERS#define SHINGLECLUSTERS 1//define DEBUG_SHINGLES 1#include <string>#include <bitset>                          // for storing Rabin-Finger prints#include <algorithm>#include <iostream>#include <sstream>#include "Clustering.h"#include "string_hash.h"#include "union-find.h"#include "util.h"#include "types.h"using namespace std;/****** This is the prime mumber for min-wise linear permutations *******/// originally 4#define SHINGLE_TEXT_WINDOW 1                                          // lenght of a shingle in num words #define MAX_CHAR_LENGHT_FOR_A_WORD 10                                             // a word > 10 should be truncated#define SHINGLE_BITMAP_SIZE (SHINGLE_TEXT_WINDOW * MAX_CHAR_LENGHT_FOR_A_WORD * 8)  // the size of shingle as bitmap                                                                                  // usually 8 * 10 * 7  = 560 bit//#define PRIME_NUMBER_P 290017                                                    // prime number p -> 18 bit#define PRIME_NUMBER_P 2000003                                                 // prime number p -> 21 bit//#define PRIME_NUMBER_P 4000037                                                 // prime number p -> 22 bit//#define PRIME_NUMBER_P 5990081                                                 // prime number p -> 23 bit//#define PRIME_NUMBER_P 9990017                                                 // prime number p -> 24 bit//#define PRIME_NUMBER_P 29990011                                                // prime number p -> 25 bit    //#define PRIME_NUMBER_P 49990069                                                // prime number p -> 26 bit  //#define PRIME_NUMBER_P 99990001                                                // prime number p -> 27 bit  //#define PRIME_NUMBER_P 199990013                                               // prime number p -> 28 bit//#define PRIME_NUMBER_P 1999999973                                              // prime number p -> 31 bit//((unsigned int)log(double(PRIME_NUMBER_P)))     // number of bit#define SHINGLE_FINGERPRINT_SIZE 21#define NUMBER_OF_HASH_FUNCTIONS  5                                              // number of h_i used #define NUM_SHINGLES_COMMON_TH 2                                                 // number shingle common th#define SHINGLE_BITMAP ShingleBitSet<SHINGLE_BITMAP_SIZE>                         // a shingle bitmap                                                                                  // used for (a x + b ) mod p#define SHINGLE_FINGERPRINT ShingleBitSet<SHINGLE_FINGERPRINT_SIZE>              // a shingle bitmap/****************************** DOC_DOC ************************************/// // Mantains <doc, doc, number of common shingles>#define DOC_DOC_STORE HASHMclass DocDocStore{ private:  stringHash docdoc;                                       // doc x doc hash for counting common shingles  string LongToString(documentID doc){                     // convert the long (documentID) to a string        std::ostringstream os; os << doc; return os.str();  }   /*   * used for string tokaenization from within the hash   *   *  @str, the string   *  @vector, tokens   *  @string, delimiters (usually ' ')   *   */  void DocDocStore::Tokenize(const string& str,			     vector<string>& tokens,			     const string& delimiters); public:  /*   * IncrementDOC_DOC (incremente <doc, doc, shingle_common_count>   *   *  @dicID1, document ID 1   *  @dicID2, document ID 2   *   */  void IncrementDOC_DOC(documentID docID1, documentID docID2);  /*   * dumpDOC_DOC   *   */  void DumpDOC_DOC(void){ docdoc.dumpAllValues();}    /*   * getDocDoc(string& docdocS, documentID& doc1, documentID& doc2);   *   *  given a string parserize it and estract (doc1, doc2)   *  @dicID1, document ID 1   *  @dicID2, document ID 2      *   */  void getDocDoc(const string& docdocS, documentID& doc1, documentID& doc2);  inline DOC_DOC_STORE::iterator begin(){ return docdoc.begin(); }  inline DOC_DOC_STORE::iterator end(){ return docdoc.end(); }};/******************* ShingleBitSet ************************************///// Maintains a single bitset (which rappresents the shingle and its fingerprint//template<unsigned long s>class ShingleBitSet: public bitset<s>{     // used to create a signature derive form bitset stl                                            // and to save doc_doc association private:  unsigned long docID;                     // the document id this shingle comes from   public:  ShingleBitSet(){};  ShingleBitSet(unsigned long value): bitset<s>(value){};  ~ShingleBitSet(){};  void setDocID(unsigned long id) { docID = id; }  unsigned long getDocID(void) { return docID; }  unsigned long getDocID(void) const { return docID; }  /************               modified by nvdd              ***************/  /** move this to class SHINGLES, so it can operate on all shingles    of a document, instead of just one shingle    *   *   * long long minWiseLinear (unsigned int p, unsigned int k)   *   * @p, the (large) prime number (shingle's fingerprint is log p bit)      * @k, how many round for h(X) = min (h_1(X), .... h_k(X)) where   * h_i(X) = a_i X + b_i mod p      *   * @return, a 64 bit number which is then used for create a fingerprint   *          (usually the size of fingeprint is less than 64 bit, i.e. 22 or 40)      unsigned long long minWiseLinear (unsigned int p, unsigned int k);  */  ShingleBitSet operator= (const ShingleBitSet<s>& rightShingle){   // assign a bitset    //    cout << "Entro qui per assegnare rightIS:" << endl;    // cout << rightShingle; cout << endl << "Left is: ";    //cout << *this; cout << endl;    if (this == &rightShingle)      return (*this);    this->reset();    for(unsigned int i=0 ; i<this->size(); i++){      if (rightShingle.test(i)){	this->set(i);      }    }    return (*this);  } //====================================================================  bool operator< (const ShingleBitSet<s>& otherShingle) const{         // compare a bitset        for(unsigned int i=s-1 ; i>0; i--){                // this MUST follow the bit order            //      cout << i << ' ';       // if (this->test(i)){ cout << " 1"; }       // else { cout << " 0"; }      // cout << " ";      // if (otherShingle.test(i)){ cout << " 1"; }       // else { cout << " 0"; }      // cout << endl;            if (this->test(i) < otherShingle.test(i)) {return true;}      if (this->test(i) > otherShingle.test(i)) {return false;}    }    return false;  }   //====================================================================  bool operator== (const ShingleBitSet<s>& otherShingle) const{     // compare a bitset        for(unsigned int i=s-1 ; i>0; i--){          // this MUST follow the bit order            if (this->test(i) < otherShingle.test(i)) {return false;}      if (this->test(i) > otherShingle.test(i)) {return false;}    }    return true;  }}; //====================================================================namespace std{           // this is used for comparison with dereferencetemplate <class T>  struct less<T *>{  // see http://www.devx.com/tips/Tip/20013        bool operator()(T const* p1, T const* p2){            if (!p1) 	return true;      if (!p2)	return false;            //      cout << "Doing sort" << endl; cout << *p1; cout << endl;      //cout << *p2; cout << endl;      return *p1 < *p2;    }  };}/********************************* ShingleBitSetStore ********************/////// The store where all the shingles (fingerprints) are locatedtemplate <class T>class ShingleBitSetStore{ private:  vector<T *> sStore;   // the Store of  shingle as bitset public:  inline typename vector<T *>::iterator begin() { return sStore.begin();}  inline typename vector<T *>::iterator end() { return sStore.end();}  inline void push_back(T * sp){ sStore.push_back(sp); }  inline void clear(){ sStore.clear();}    /*   * Sort the Store (which contains pointer, using dereference);   */  inline void sortStore(void){ sort(sStore.begin(), sStore.end(), less<T *>()); } //====================================================================  /**   *   * analyze same fingerprints                                 ;   */  void analyzeSameFingerprints(DocDocStore& dds){        typename vector<T *>::iterator it1, it2, it3, it4, it_end;        it1 = sStore.begin();    it2 = it1; it2++;    unsigned int gap;        cout << "Counting same shingle " << endl;    while(it1 != sStore.end() && it2 != sStore.end()){            gap = 0;      // Modified by Debojyoti Dutta (ddutta@usc.edu)      // We need to change the order of evaluation of the while condition      while ( (it2 != sStore.end()) && (**it2 == **it1) )	{ it2++; gap++;} // find the largest interval of equal shingle         if (gap > 0){                                              // found a sequence	it_end = it2-1;  // end of the same shingle interval	//cout << "gap=" << gap << " " << **it1 << " " << **it_end << endl;	for (it3 = it1; it3 != it2; it3++){                         // quadratic in the number of shingles 	  for (it4 = it3+1; it4 != it2; it4++){                       // with the same fingerprint	    /*	      cout << "**SAME shingle for: d=" << (*it3)->getDocID()  << " d=" << (*it4)->getDocID() << endl;	      cout << "Incrdocdoc " <<  (*it3)->getDocID() << " "  << (*it4)->getDocID() << endl;	    */	      dds.IncrementDOC_DOC( (*it3)->getDocID(), (*it4)->getDocID());	  }	}      }      it1 = it2;	                           // move after the interval      // Modified by Debojyoti Dutta (ddutta@usc.edu)      // We need to check if it2 has reached the end      if (it2 != sStore.end()){	it2++;                                     // and the second one after me	      }      else{	break;      }          }      } //====================================================================  /**   * output operator   *   */  friend ostream& operator<<(ostream& os, ShingleBitSetStore& sStore){    unsigned int i = 1;    os << " the store\n";    for (typename vector<T *>::iterator it = sStore.begin(); it != sStore.end(); it++){      os << i++ << ". ";      os << (**it); os << " from document ";      os << (*it)->getDocID(); os << endl;    }    return os;  }};/************************ THE SHINGLE CLUSTERING ALGO ***********************/////// We derive from abstract class Clustering (algo)class SHINGLES : public Clustering{ private:  string shingleWindow[SHINGLE_TEXT_WINDOW];// the shingle as text   ShingleBitSetStore<SHINGLE_FINGERPRINT> shingleBitStore;         // the store of the minwise linear shingles of all documents  ShingleBitSetStore<SHINGLE_BITMAP> shingleBitMapStoreTMP;   // this is a temp accumulator of bitmaps of all shingles of a document  unsigned int currentPointer;              // pointer to current word  unsigned int numWords;                    // numWords seen so far  unsigned int numWordsThisDocument;        //  unsigned int numShingles;                 // numShingles generated so far  unsigned int numSampled;                  // number of shingles sampled  unsigned int numShingleCommonTh;          // # common shingles to cluster  vector<unsigned int> a;                   // for hash functions  vector<unsigned int> b;                   // for hash functions   /*   * This generate the shingles for the tmp store   *   */   string generateShingle(unsigned long docID); public:  SHINGLES(): currentPointer(0), numWords(0), numWordsThisDocument(0),    numShingles(0), numSampled(0), numShingleCommonTh(NUM_SHINGLES_COMMON_TH) {};  ~SHINGLES() {}  inline unsigned int getNumShingleSampled(){ return numSampled; }  /*   * Below we derive from virtual function for this specific algo   *   */  /*   *   * long long minWiseLinear (unsigned int p, unsigned int k)   *   * @p, the (large) prime number (shingle's fingerprint is log p bit)   * @k, how many round for  h(X) = min (h_1(X), .... h_k(X))                              where h_i(X) = a_i X + b_i mod p   *   * output is stored in shingleBitStoreTMP   */  void minWiseLinear (unsigned int p, unsigned int k);  /*   * initDataSet(void)   *   *   Called for initing the dataset (by the reader)   *   */  void initDataSet(void);  /*   * loadInDataSet(string& str)    *   *     Called for any string we receive (by the reader)   *   *     @string, the string of document   *   */    void loadInDataSet(string& str);     /*   * finalizeLoadingDocument(void)   *   *     Called at the end of any document (by the reader)   *   *   */  void finalizeLoadingDocument(void);  // determines the k hash functions  void initialize(void);   /*   * do_clustering(void)   *   *     Called to start clustering (by the reader)   *   *   */  void do_clustering(void);};#endif 

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -