📄 shingle-clustering.h
字号:
/* Text Clustering Copyright (C) 2004 Debora "Barbara" Donato, Antonio Gulli This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */#ifndef SHINGLECLUSTERS#define SHINGLECLUSTERS 1//define DEBUG_SHINGLES 1#include <string>#include <bitset> // for storing Rabin-Finger prints#include <algorithm>#include <iostream>#include <sstream>#include "Clustering.h"#include "string_hash.h"#include "union-find.h"#include "util.h"#include "types.h"using namespace std;/****** This is the prime mumber for min-wise linear permutations *******/// originally 4#define SHINGLE_TEXT_WINDOW 1 // lenght of a shingle in num words #define MAX_CHAR_LENGHT_FOR_A_WORD 10 // a word > 10 should be truncated#define SHINGLE_BITMAP_SIZE (SHINGLE_TEXT_WINDOW * MAX_CHAR_LENGHT_FOR_A_WORD * 8) // the size of shingle as bitmap // usually 8 * 10 * 7 = 560 bit//#define PRIME_NUMBER_P 290017 // prime number p -> 18 bit#define PRIME_NUMBER_P 2000003 // prime number p -> 21 bit//#define PRIME_NUMBER_P 4000037 // prime number p -> 22 bit//#define PRIME_NUMBER_P 5990081 // prime number p -> 23 bit//#define PRIME_NUMBER_P 9990017 // prime number p -> 24 bit//#define PRIME_NUMBER_P 29990011 // prime number p -> 25 bit //#define PRIME_NUMBER_P 49990069 // prime number p -> 26 bit //#define PRIME_NUMBER_P 99990001 // prime number p -> 27 bit //#define PRIME_NUMBER_P 199990013 // prime number p -> 28 bit//#define PRIME_NUMBER_P 1999999973 // prime number p -> 31 bit//((unsigned int)log(double(PRIME_NUMBER_P))) // number of bit#define SHINGLE_FINGERPRINT_SIZE 21#define NUMBER_OF_HASH_FUNCTIONS 5 // number of h_i used #define NUM_SHINGLES_COMMON_TH 2 // number shingle common th#define SHINGLE_BITMAP ShingleBitSet<SHINGLE_BITMAP_SIZE> // a shingle bitmap // used for (a x + b ) mod p#define SHINGLE_FINGERPRINT ShingleBitSet<SHINGLE_FINGERPRINT_SIZE> // a shingle bitmap/****************************** DOC_DOC ************************************/// // Mantains <doc, doc, number of common shingles>#define DOC_DOC_STORE HASHMclass DocDocStore{ private: stringHash docdoc; // doc x doc hash for counting common shingles string LongToString(documentID doc){ // convert the long (documentID) to a string std::ostringstream os; os << doc; return os.str(); } /* * used for string tokaenization from within the hash * * @str, the string * @vector, tokens * @string, delimiters (usually ' ') * */ void DocDocStore::Tokenize(const string& str, vector<string>& tokens, const string& delimiters); public: /* * IncrementDOC_DOC (incremente <doc, doc, shingle_common_count> * * @dicID1, document ID 1 * @dicID2, document ID 2 * */ void IncrementDOC_DOC(documentID docID1, documentID docID2); /* * dumpDOC_DOC * */ void DumpDOC_DOC(void){ docdoc.dumpAllValues();} /* * getDocDoc(string& docdocS, documentID& doc1, documentID& doc2); * * given a string parserize it and estract (doc1, doc2) * @dicID1, document ID 1 * @dicID2, document ID 2 * */ void getDocDoc(const string& docdocS, documentID& doc1, documentID& doc2); inline DOC_DOC_STORE::iterator begin(){ return docdoc.begin(); } inline DOC_DOC_STORE::iterator end(){ return docdoc.end(); }};/******************* ShingleBitSet ************************************///// Maintains a single bitset (which rappresents the shingle and its fingerprint//template<unsigned long s>class ShingleBitSet: public bitset<s>{ // used to create a signature derive form bitset stl // and to save doc_doc association private: unsigned long docID; // the document id this shingle comes from public: ShingleBitSet(){}; ShingleBitSet(unsigned long value): bitset<s>(value){}; ~ShingleBitSet(){}; void setDocID(unsigned long id) { docID = id; } unsigned long getDocID(void) { return docID; } unsigned long getDocID(void) const { return docID; } /************ modified by nvdd ***************/ /** move this to class SHINGLES, so it can operate on all shingles of a document, instead of just one shingle * * * long long minWiseLinear (unsigned int p, unsigned int k) * * @p, the (large) prime number (shingle's fingerprint is log p bit) * @k, how many round for h(X) = min (h_1(X), .... h_k(X)) where * h_i(X) = a_i X + b_i mod p * * @return, a 64 bit number which is then used for create a fingerprint * (usually the size of fingeprint is less than 64 bit, i.e. 22 or 40) unsigned long long minWiseLinear (unsigned int p, unsigned int k); */ ShingleBitSet operator= (const ShingleBitSet<s>& rightShingle){ // assign a bitset // cout << "Entro qui per assegnare rightIS:" << endl; // cout << rightShingle; cout << endl << "Left is: "; //cout << *this; cout << endl; if (this == &rightShingle) return (*this); this->reset(); for(unsigned int i=0 ; i<this->size(); i++){ if (rightShingle.test(i)){ this->set(i); } } return (*this); } //==================================================================== bool operator< (const ShingleBitSet<s>& otherShingle) const{ // compare a bitset for(unsigned int i=s-1 ; i>0; i--){ // this MUST follow the bit order // cout << i << ' '; // if (this->test(i)){ cout << " 1"; } // else { cout << " 0"; } // cout << " "; // if (otherShingle.test(i)){ cout << " 1"; } // else { cout << " 0"; } // cout << endl; if (this->test(i) < otherShingle.test(i)) {return true;} if (this->test(i) > otherShingle.test(i)) {return false;} } return false; } //==================================================================== bool operator== (const ShingleBitSet<s>& otherShingle) const{ // compare a bitset for(unsigned int i=s-1 ; i>0; i--){ // this MUST follow the bit order if (this->test(i) < otherShingle.test(i)) {return false;} if (this->test(i) > otherShingle.test(i)) {return false;} } return true; }}; //====================================================================namespace std{ // this is used for comparison with dereferencetemplate <class T> struct less<T *>{ // see http://www.devx.com/tips/Tip/20013 bool operator()(T const* p1, T const* p2){ if (!p1) return true; if (!p2) return false; // cout << "Doing sort" << endl; cout << *p1; cout << endl; //cout << *p2; cout << endl; return *p1 < *p2; } };}/********************************* ShingleBitSetStore ********************/////// The store where all the shingles (fingerprints) are locatedtemplate <class T>class ShingleBitSetStore{ private: vector<T *> sStore; // the Store of shingle as bitset public: inline typename vector<T *>::iterator begin() { return sStore.begin();} inline typename vector<T *>::iterator end() { return sStore.end();} inline void push_back(T * sp){ sStore.push_back(sp); } inline void clear(){ sStore.clear();} /* * Sort the Store (which contains pointer, using dereference); */ inline void sortStore(void){ sort(sStore.begin(), sStore.end(), less<T *>()); } //==================================================================== /** * * analyze same fingerprints ; */ void analyzeSameFingerprints(DocDocStore& dds){ typename vector<T *>::iterator it1, it2, it3, it4, it_end; it1 = sStore.begin(); it2 = it1; it2++; unsigned int gap; cout << "Counting same shingle " << endl; while(it1 != sStore.end() && it2 != sStore.end()){ gap = 0; // Modified by Debojyoti Dutta (ddutta@usc.edu) // We need to change the order of evaluation of the while condition while ( (it2 != sStore.end()) && (**it2 == **it1) ) { it2++; gap++;} // find the largest interval of equal shingle if (gap > 0){ // found a sequence it_end = it2-1; // end of the same shingle interval //cout << "gap=" << gap << " " << **it1 << " " << **it_end << endl; for (it3 = it1; it3 != it2; it3++){ // quadratic in the number of shingles for (it4 = it3+1; it4 != it2; it4++){ // with the same fingerprint /* cout << "**SAME shingle for: d=" << (*it3)->getDocID() << " d=" << (*it4)->getDocID() << endl; cout << "Incrdocdoc " << (*it3)->getDocID() << " " << (*it4)->getDocID() << endl; */ dds.IncrementDOC_DOC( (*it3)->getDocID(), (*it4)->getDocID()); } } } it1 = it2; // move after the interval // Modified by Debojyoti Dutta (ddutta@usc.edu) // We need to check if it2 has reached the end if (it2 != sStore.end()){ it2++; // and the second one after me } else{ break; } } } //==================================================================== /** * output operator * */ friend ostream& operator<<(ostream& os, ShingleBitSetStore& sStore){ unsigned int i = 1; os << " the store\n"; for (typename vector<T *>::iterator it = sStore.begin(); it != sStore.end(); it++){ os << i++ << ". "; os << (**it); os << " from document "; os << (*it)->getDocID(); os << endl; } return os; }};/************************ THE SHINGLE CLUSTERING ALGO ***********************/////// We derive from abstract class Clustering (algo)class SHINGLES : public Clustering{ private: string shingleWindow[SHINGLE_TEXT_WINDOW];// the shingle as text ShingleBitSetStore<SHINGLE_FINGERPRINT> shingleBitStore; // the store of the minwise linear shingles of all documents ShingleBitSetStore<SHINGLE_BITMAP> shingleBitMapStoreTMP; // this is a temp accumulator of bitmaps of all shingles of a document unsigned int currentPointer; // pointer to current word unsigned int numWords; // numWords seen so far unsigned int numWordsThisDocument; // unsigned int numShingles; // numShingles generated so far unsigned int numSampled; // number of shingles sampled unsigned int numShingleCommonTh; // # common shingles to cluster vector<unsigned int> a; // for hash functions vector<unsigned int> b; // for hash functions /* * This generate the shingles for the tmp store * */ string generateShingle(unsigned long docID); public: SHINGLES(): currentPointer(0), numWords(0), numWordsThisDocument(0), numShingles(0), numSampled(0), numShingleCommonTh(NUM_SHINGLES_COMMON_TH) {}; ~SHINGLES() {} inline unsigned int getNumShingleSampled(){ return numSampled; } /* * Below we derive from virtual function for this specific algo * */ /* * * long long minWiseLinear (unsigned int p, unsigned int k) * * @p, the (large) prime number (shingle's fingerprint is log p bit) * @k, how many round for h(X) = min (h_1(X), .... h_k(X)) where h_i(X) = a_i X + b_i mod p * * output is stored in shingleBitStoreTMP */ void minWiseLinear (unsigned int p, unsigned int k); /* * initDataSet(void) * * Called for initing the dataset (by the reader) * */ void initDataSet(void); /* * loadInDataSet(string& str) * * Called for any string we receive (by the reader) * * @string, the string of document * */ void loadInDataSet(string& str); /* * finalizeLoadingDocument(void) * * Called at the end of any document (by the reader) * * */ void finalizeLoadingDocument(void); // determines the k hash functions void initialize(void); /* * do_clustering(void) * * Called to start clustering (by the reader) * * */ void do_clustering(void);};#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -