⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dctree.cc

📁 一个增量文本聚类的算法。 参考文献: Wai-chiu Wong, Ada Wai-chee Fu, Incremental Document Clustering for Web Page Cl
💻 CC
📖 第 1 页 / 共 2 页
字号:
#include <stdio.h>#include <stdlib.h>#include <ctype.h>#include <string.h>#include <iostream.h>#include <time.h>#include <math.h>#include "../MyLib/MyLib.h"#include "../MyLib/DC.h"struct aNode{    DC **dcObj;    int isLeaf,numChild;    struct aNode **child;};typedef struct aNode node;FILE *topicFile;int numDoc,minChild,maxChild,numFeat,numTopic,t3;int *N3,*T,height;char **feature,**topic;double simThres,t1,t2,*F;double sim0(DC *dc1,DC *dc2){    double s,w1,w2,prod,norm1,norm2;    prod = 0.0;    norm1 = 0.0;    norm2 = 0.0;    for(int i=0;i<numFeat;i++){	w1 = ((double) dc1->W[i])/((double) dc1->N);	w2 = ((double) dc2->W[i])/((double) dc2->N);	prod = prod+(w1*w2);	norm1 = norm1+(w1*w1);	norm2 = norm2+(w2*w2);    }    s = prod/sqrt(norm1*norm2);    return s;}double sim2(DC *dc1,DC *dc2){    double s,dist,w1,w2;    dist = 0.0;    for(int i=0;i<numFeat;i++){	w1 = ((double) dc1->W[i])/((double) dc1->N);	w2 = ((double) dc2->W[i])/((double) dc2->N);	dist = dist + fabs(w1-w2);    }    s = 1.0-(dist/((double) numFeat));    return s;}double sim1(DC *dc1,DC *dc2){    double s,dist,w1,w2,diff;    dist = 0.0;    for(int i=0;i<numFeat;i++){	w1 = ((double) dc1->W[i])/((double) dc1->N);	w2 = ((double) dc2->W[i])/((double) dc2->N);	diff = w1-w2;	dist = dist+(diff*diff);    }    s = 1.0-(sqrt(dist)/((double) numFeat));    return s;}double closest(node *node,DC *dc,int *tEnt){    double s,maxSim;    maxSim = -1.0;    *tEnt = 0;    for(int i=0;i<node->numChild;i++){	s = sim2(node->dcObj[i],dc);	if(maxSim < s){	    *tEnt = i;	    maxSim = s;	}    }    return maxSim;}node *splitNode(node **root,node *retNode){    	int nc1,nc2,NC1,NC2;	node *newNode;    	nc1 = (maxChild+1)/2;    	nc2 = maxChild - nc1 + 1;    /* find the first elements of two groups    */    DC *tmpDC;    double s,minSim;    int seed1,seed2;    seed1 = 0;    seed2 = 1;    minSim = 1.1;    tmpDC = new DC(-1,NULL,numFeat,NULL);    for(int i=0;i<retNode->numChild;i++){	tmpDC->merge(retNode->dcObj[i]);    }    for(int i=0;i<maxChild;i++){	for(int j=i+1;j<=maxChild;j++){	    if(j < maxChild){		s = sim2((* root)->dcObj[i],(* root)->dcObj[j]);	    }else{		s = sim2((* root)->dcObj[i],tmpDC);	    }	    if(s < minSim){		seed1 = i;		seed2 = j;		minSim = s;	    }else if(s == minSim && ((i+j)%2 == 1)){		seed1 = i;		seed2 = j;	    }	}    }    /* assign the first elements to the two groups    */    node **group1,**group2;    DC **gDC1,**gDC2;    int M;    M = maxChild-minChild+1;    group1 = new (node *)[M];    gDC1 = new (DC *)[M];    if(seed1 < maxChild){	group1[0] = (* root)->child[seed1];//	(* root)->child[seed1] = NULL;	gDC1[0] = (* root)->dcObj[seed1];//	(* root)->dcObj[seed1] = NULL;    }else{	group1[0] = retNode;	gDC1[0] = tmpDC;    }    NC1 = 1;    group2 = new (node *)[M];    gDC2 = new (DC *)[M];    if(seed2 < maxChild){	group2[0] = (* root)->child[seed2];//	(* root)->child[seed2] = NULL;	gDC2[0] = (* root)->dcObj[seed2];//	(* root)->dcObj[seed2] = NULL;    }else{	group2[0] = retNode;	gDC2[0] = tmpDC;    }    NC2 = 1;    /* assign rest entries to the groups    */    int assQ[maxChild-1],grpQ[maxChild+1],k;    double simQ[maxChild+1],tmpSim;    grpQ[seed1] = 0;    simQ[seed1] = 0.0;    grpQ[seed2] = 0;    simQ[seed2] = 0.0;    k = 0;    for(int i=0;i<=maxChild;i++){	if(i != seed1 && i != seed2){	    if(i < maxChild){	      if(seed1 < maxChild){		tmpSim = sim2((* root)->dcObj[i],(* root)->dcObj[seed1]);	      }else{		tmpSim = sim2((* root)->dcObj[i],tmpDC);	      }		grpQ[i] = 2;	      if(seed2 < maxChild){		simQ[i] = sim2((* root)->dcObj[i],(* root)->dcObj[seed2]);	      }else{		simQ[i] = sim2((* root)->dcObj[i],tmpDC);	      }		if(tmpSim > simQ[i]){		    simQ[i] = tmpSim;		    grpQ[i] = 1;		}		int j=0;		while(j<k){		    if(simQ[assQ[j]] < simQ[i]){			for(int l=k;l>j;l--){			    assQ[l] = assQ[l-1];			}			assQ[j] = i;			j = k+1;		    }else{			j++;		    }		}			if(j == k){		    assQ[k] = i; 		}	    }else{		tmpSim = sim2(tmpDC,(* root)->dcObj[seed1]);		grpQ[i] = 2;		simQ[i] = sim2(tmpDC,(* root)->dcObj[seed2]);		if(tmpSim > simQ[i]){		    simQ[i] = tmpSim;		    grpQ[i] = 1;		}		int j=0;		while(j<k){		    if(simQ[assQ[j]] < simQ[i]){			for(int l=k;l>j;l--){			    assQ[l] = assQ[l-1];			}			assQ[j] = i;			j = k+1;		    }else{			j++;		    }		}			if(j == k){		    assQ[k] = i; 		}	    }	    k++;	}    }    int r = 0;    while(NC1 < M && NC2 < M && r < maxChild-1){	if(grpQ[assQ[r]] == 1){	    if(assQ[r] < maxChild){	    	group1[NC1] = (* root)->child[assQ[r]];		gDC1[NC1] = (* root)->dcObj[assQ[r]];	    }else{	    	group1[NC1] = retNode;		gDC1[NC1] = tmpDC;	    }	    NC1++;	}else{	    if(assQ[r] < maxChild){	    	group2[NC2] = (* root)->child[assQ[r]];		gDC2[NC2] = (* root)->dcObj[assQ[r]];	    }else{	    	group2[NC2] = retNode;		gDC2[NC2] = tmpDC;	    }	    NC2++;	}	r++;    }    while(r < maxChild-1){	if(NC2 == M){	    if(assQ[r] < maxChild){                group1[NC1] = (* root)->child[assQ[r]];                gDC1[NC1] = (* root)->dcObj[assQ[r]];            }else{                group1[NC1] = retNode;                gDC1[NC1] = tmpDC;            }            NC1++;	}else{	    if(assQ[r] < maxChild){                group2[NC2] = (* root)->child[assQ[r]];                gDC2[NC2] = (* root)->dcObj[assQ[r]];            }else{                group2[NC2] = retNode;                gDC2[NC2] = tmpDC;            }            NC2++;        }	r++;    }/*printf("tmpDC: ");    dNode *dt;    dt = tmpDC->docList;    while(dt != NULL){    	printf("%d ",dt->ID);	dt = dt->next;    }    printf("\n");for(int i=0;i<maxChild-1;i++){    printf("%d ",assQ[i]);}    printf("\n");for(int i=0;i<=maxChild;i++){    printf("%d ",grpQ[i]);}    printf("\n");printf("group 1: seed = %d",seed1);for(int i=0;i<NC1;i++){    dNode *dl;    dl = gDC1[i]->docList;    printf("dc%d( ",i);    while(dl != NULL){    	printf("%d ",dl->ID);	dl = dl->next;    }    printf(") ");}printf("\n");printf("group 2: seed = %d",seed2);for(int i=0;i<NC2;i++){    dNode *dl;    dl = gDC2[i]->docList;    printf("dc%d( ",i);    while(dl != NULL){    	printf("%d ",dl->ID);	dl = dl->next;    }    printf(") ");}printf("\n");*/        newNode = (node *)malloc(sizeof(node));        newNode->isLeaf = (* root)->isLeaf;        newNode->numChild = NC2;        newNode->child = new (node *)[maxChild];        newNode->dcObj = new (DC *)[maxChild];    	for(int i=0;i<NC2;i++){	    newNode->child[i] = group2[i];	    newNode->dcObj[i] = gDC2[i];    	}        for(int i=NC2;i<maxChild;i++){       	    newNode->child[i] = NULL;       	    newNode->dcObj[i] = NULL;        }    	(* root)->numChild = NC1;    	for(int i=0;i<NC1;i++){  	    (* root)->child[i] = group1[i];  	    (* root)->dcObj[i] = gDC1[i];	}	for(int i=NC1;i<maxChild;i++){  	    (* root)->child[i] = NULL;  	    (* root)->dcObj[i] = NULL;	}	return newNode;}/*node *splitLeaf(node **root,DC *ent){	    int nc1,nc2;	    nc1 = (maxChild+1)/2;	    nc2 = maxChild - nc1 + 1;	    node *newNode;	    newNode = (node *)malloc(sizeof(node));	    newNode->isLeaf = (* root)->isLeaf;            newNode->numChild = nc2;            newNode->child = new (node *)[maxChild];            newNode->dcObj = new (DC *)[maxChild];            for(int i=0;i<maxChild;i++){            	newNode->child[i] = NULL;            	newNode->dcObj[i] = NULL;            }	    newNode->dcObj[0] = ent;	    for(int i=0;i<nc2-1;i++){		newNode->dcObj[i+1] = (* root)->dcObj[nc1+i];	  	(* root)->dcObj[nc1+i] = NULL;	    }	    (* root)->numChild = nc1;	    return newNode; }*/node *insert(node **root,DC *ent){    double sim;    int targetEnt;    sim = closest((* root),ent,&targetEnt);        if((* root)->isLeaf == 1){	if(sim >= simThres){	    (* root)->dcObj[targetEnt]->merge(ent);	    return NULL;	}else if((* root)->numChild < maxChild){	    (* root)->dcObj[(* root)->numChild] = ent;	    ((* root)->numChild)++;	    return NULL;	}else{	    /* splitting the leaf node	    */	    node *tmpNode;            tmpNode = (node *)malloc(sizeof(node));            tmpNode->isLeaf = 1;            tmpNode->numChild = 1;            tmpNode->child = new (node *)[maxChild];            tmpNode->dcObj = new (DC *)[maxChild];            for(int i=0;i<maxChild;i++){                tmpNode->child[i] = NULL;                tmpNode->dcObj[i] = NULL;            }            tmpNode->dcObj[0] = ent;	    return splitNode(root,tmpNode);	}    }else{	node *retNode;	if(sim >= t1){	    (* root)->dcObj[targetEnt]->merge(ent);	    retNode = insert(&((* root)->child[targetEnt]),ent);	}else if((* root)->numChild < maxChild){	    node *tmpNode;            tmpNode = (node *)malloc(sizeof(node));            tmpNode->isLeaf = 1;            tmpNode->numChild = 1;            tmpNode->child = new (node *)[maxChild];            tmpNode->dcObj = new (DC *)[maxChild];            for(int i=0;i<maxChild;i++){                tmpNode->child[i] = NULL;                tmpNode->dcObj[i] = NULL;            }            tmpNode->dcObj[0] = ent;	    (* root)->child[(* root)->numChild] = tmpNode;	    (* root)->dcObj[(* root)->numChild] = new DC(-1,NULL,numFeat,NULL);;	    (* root)->dcObj[(* root)->numChild]->merge(ent);	    ((* root)->numChild)++;	    retNode = NULL;	}else{            retNode = (node *)malloc(sizeof(node));            retNode->isLeaf = 1;            retNode->numChild = 1;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -