⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 dctree.cc

📁 一个增量文本聚类的算法。 参考文献: Wai-chiu Wong, Ada Wai-chee Fu, Incremental Document Clustering for Web Page Classification
💻 CC
📖 第 1 页 / 共 2 页
字号:
            retNode->child = new (node *)[maxChild];            retNode->dcObj = new (DC *)[maxChild];            for(int i=0;i<maxChild;i++){                retNode->child[i] = NULL;                retNode->dcObj[i] = NULL;            }            retNode->dcObj[0] = ent;	}	if(retNode != NULL){	    (* root)->dcObj[targetEnt] = new DC(-1,NULL,numFeat,NULL);	    for(int i=0;i<(* root)->child[targetEnt]->numChild;i++){	        (* root)->dcObj[targetEnt]->merge((* root)->child[targetEnt]->dcObj[i]);	    }	    /* there is splitting in the child node	    */	    int nc;	    nc = (* root)->numChild;	    if(nc < maxChild){		(* root)->child[nc] = retNode;		(* root)->dcObj[nc] = new DC(-1,NULL,numFeat,NULL);		for(int i=0;i<retNode->numChild;i++){		    (* root)->dcObj[nc]->merge(retNode->dcObj[i]);		}		((* root)->numChild)++;	 	(* root)->isLeaf = 0;		return NULL;	    }else{		/* splitting the nonleaf node 		*/		return splitNode(root,retNode);	    }	}else{	    return NULL;	}    }}int showTree(node *root,int lvl,int *pc,int *ncl){    int cover,N1,N2;    double tmpF,P,R;    if(lvl > height){	height = lvl;    }    if(root->isLeaf != 1){	int nd=0;    	for(int i=0;i<root->numChild;i++){	    nd = nd + root->dcObj[i]->N;	}      	for(int j=0;j<lvl;j++){	    printf("  ");	}    	printf("level=%d numChild=%d numDoc=%d\n",lvl,root->numChild,nd);	cover = 0;    	for(int i=0;i<root->numChild;i++){	    int p,maxp,maxTopic;	    double stat,maxStat;      	  	    for(int j=0;j<lvl;j++){	    	printf("  ");	    }	    printf("l%d child %d: numDoc=%d\n",lvl,i,root->dcObj[i]->N);	    for(int j=0;j<lvl;j++){	    	printf("  ");	    }	    printf("[ ");	    int flag=0;	    for(int j=0;j<numFeat;j++){ 		double df;		df = ((double) root->dcObj[i]->W[j])/((double) root->dcObj[i]->N);		if(df > t2){		    printf("%s:%.2lf ",feature[j],df);		    flag = 1;		}	    }	    printf("]\n");	    maxStat = -1.0;	    maxTopic = 0;	    for(int j=0;j<numTopic;j++){		p = root->dcObj[i]->showStat(topic[j]);		stat = ((double) 
p)/((double) root->dcObj[i]->N);		if(stat > maxStat){		    maxStat = stat;		    maxTopic = j;		    maxp = p;		}	    }	    if(flag == 1 && (root->dcObj[i]->N) > t3){		N1 = maxp;		N2 = root->dcObj[i]->N;		P = maxStat;		R = ((double) N1)/((double) N3[maxTopic]);		tmpF = (2.0*P*R)/(P+R);		if(tmpF > F[maxTopic]){		    F[maxTopic] = tmpF;		    T[maxTopic] = maxp;		}	    	for(int j=0;j<lvl;j++){	    	    printf("  ");	    	}	        printf("[ %s:%.2lf ]\n",topic[maxTopic],maxStat);/*	    	for(int j=0;j<lvl;j++){	    	    printf("  ");	    	}	    	printf("[ ");		for(int j=0;j<numFeat;j++){ 		  double df;		  df = ((double) root->dcObj[i]->W[j])/((double) root->dcObj[i]->N);		  if(df > t2){		    printf("%s:%.2lf ",feature[j],df);		  }	  	}		printf("]\n");*/		cover = cover + root->dcObj[i]->N;		*pc = *pc + maxp;		(*ncl)++;	    }else if(root->dcObj[i]->N > t3){	     	cover = cover + showTree(root->child[i],lvl+1,pc,ncl);	    }else{		cover = 0;	    }    	}    }else{	int nd=0;    	for(int i=0;i<root->numChild;i++){	    nd = nd + root->dcObj[i]->N;	}	for(int j=0;j<lvl;j++){	    printf("  ");	}    	printf("level=%d numChild=%d numDoc=%d\n",lvl,root->numChild,nd);	cover = 0;	for(int i=0;i<root->numChild;i++){	    int p,maxp,maxTopic;	    double stat,maxStat;	    for(int j=0;j<lvl;j++){	    	printf("  ");	    }	    printf("[ ");	    int flag=0;	    for(int j=0;j<numFeat;j++){ 		double df;		df = ((double) root->dcObj[i]->W[j])/((double) root->dcObj[i]->N);		if(df > t2){		    printf("%s:%.2lf ",feature[j],df);		    flag = 1;		}	    }	    printf("]\n");	    maxStat = -1.0;	    maxTopic = 0;	    for(int j=0;j<lvl;j++){	    	printf("  ");	    }	    printf("DC %d: numDoc=%d\n",i,root->dcObj[i]->N);	    for(int j=0;j<numTopic;j++){		p = root->dcObj[i]->showStat(topic[j]);		stat = ((double) p)/((double) root->dcObj[i]->N);		if(stat > maxStat){		    maxStat = stat;		    maxTopic = j;		    maxp = p;		}	    }	    if(flag == 1 && root->dcObj[i]->N > t3){		N1 = maxp;		N2 = root->dcObj[i]->N;		P 
= maxStat;		R = ((double) N1)/((double) N3[maxTopic]);		tmpF = (2.0*P*R)/(P+R);		if(tmpF > F[maxTopic]){		    F[maxTopic] = tmpF;		}	    	for(int j=0;j<lvl;j++){	    	    printf("  ");	    	}		printf("[ %s:%.2lf ]\n",topic[maxTopic],maxStat);		cover = cover + root->dcObj[i]->N;		*pc = *pc + maxp;		(*ncl)++;/*	    	for(int j=0;j<lvl;j++){	    	    printf("  ");	    	}	    	printf("[ ");		for(int j=0;j<numFeat;j++){ 		  double df;		  df = ((double) root->dcObj[i]->W[j])/((double) root->dcObj[i]->N);		  if(df > t2){		    printf("%s:%.2lf ",feature[j],df);		  }	  	}		printf("]\n");*/	    }	    dNode *dl;	    dl = root->dcObj[i]->docList;	    while(dl != NULL){	    	for(int j=0;j<lvl+1;j++){	    	    printf("  ");	    	}		printf("Doc %d %s\n",dl->ID,dl->label);		dl = dl->next;	    }	}    }    return cover;}int printTree(node *root,int lvl,int *h){    int numClust;    if(root->isLeaf != 1){    	printf("level=%d numChild=%d\n",lvl,root->numChild);    	for(int i=0;i<root->numChild;i++){	  if(root->dcObj[i]->N >= 20 && root->dcObj[i]->N <=100){	    printf("child %d: numDoc=%d\n",i,root->dcObj[i]->N);	    if(lvl > 0){		for(int j=0;j<numFeat;j++){ 		  double t;		  t = ((double) root->dcObj[i]->W[j])/((double) root->dcObj[i]->N);		  if(t > 0.65){		    printf("%s:%d ",feature[j],root->dcObj[i]->W[j]);		  }	  	}		printf("\n");	    }	  }    	}	numClust = 0;    	for(int i=0;i<root->numChild;i++){	    numClust = numClust+printTree(root->child[i],lvl+1,h);	}    }else{//    	printf("level=%d numChild=%d\n",lvl,root->numChild);	*h = lvl;        numClust = root->numChild;/*	for(int i=0;i<root->numChild;i++){	    printf("DC %d: numDoc=%d\n",i,root->dcObj[i]->N);	    dNode *dl;	    dl = root->dcObj[i]->docList;	    while(dl != NULL){		printf("  Doc %d label=%s\n",dl->ID,dl->label);		dl = dl->next;	    }	}*/    }    return numClust;}int main(int argc,char **argv){    FILE *input,*featFile;    clock_t e_start,e_end,e_diff;    double e_time;    int coverage,pCover,numClust;    // time_t 
r_start,r_end,r_time;    /* checking input format */    if(argc!=8){        cerr << "Usage: " << argv[0] << " inputFile minChild maxChild simThreshold t1 t2 t3\n";        exit(1);    }    /* open input and output file     */    if((input=fopen(argv[1],"r")) == NULL){        cerr << "Cannot open the input file: " << argv[1] << endl;        exit(1);    }    if((featFile=fopen("./feature","r")) == NULL){        cerr << "Cannot open the feature file: feature\n";        exit(1);    }    if((topicFile=fopen("./topic","r")) == NULL){        cerr << "Cannot open the topic file: topic\n";        exit(1);    }    /* handle input parameters     */    fscanf(input,"%d %d\n",&numDoc,&numFeat);    fscanf(topicFile,"%d\n",&numTopic);    minChild = atoi(argv[2]);    maxChild = atoi(argv[3]);    simThres = atof(argv[4]);    t1 = atof(argv[5]);    t2 = atof(argv[6]);    t3 = atoi(argv[7]);    node *dctree,*retNode;    DC *newDC,*dc[numDoc];    char label[MaxLabelLen];    int featVect[numFeat];    topic = new (char *)[numTopic];    for(int i=0;i<numTopic;i++){	topic[i] = new char[MaxLabelLen];	fscanf(topicFile,"%s",topic[i]);	    }    feature = new (char *)[numFeat];    for(int i=0;i<numFeat;i++){	feature[i] = new char[MaxLabelLen];	fscanf(featFile,"%s",feature[i]);	    }    N3 = new int[numTopic];    T = new int[numTopic];    F = new double[numTopic];    for(int i=0;i<numTopic;i++){	N3[i] = 0;	T[i] = 0;	F[i] = 0.0;    }    /* initilaize the root node of the tree     */    dctree = (node *)malloc(sizeof(node));    dctree->isLeaf = 1;    dctree->numChild = 0;    dctree->child = new (node *)[maxChild];    dctree->dcObj = new (DC *)[maxChild];    for(int i=0;i<maxChild;i++){	dctree->child[i] = NULL;	dctree->dcObj[i] = NULL;    }    e_start = clock();    for(int i=0;i<numDoc;i++){	/* get data from the input file	*/	for(int j=0;j<numFeat;j++){	    fscanf(input,"%d",&featVect[j]);	}	fscanf(input,"%s",label);	newDC = new DC(i,featVect,numFeat,label);	for(int j=0;j<numTopic;j++){	    
if(strcmp(topic[j],label) == 0){		(N3[j])++;	    }	}	retNode = insert(&dctree,newDC);	/* split the root node	*/	if(retNode != NULL){	    node *newRoot;	    newRoot = (node *)malloc(sizeof(node));	    newRoot->isLeaf = 0;            newRoot->numChild = 2;            newRoot->child = new (node *)[maxChild];            newRoot->dcObj = new (DC *)[maxChild];            for(int j=0;j<maxChild;j++){                newRoot->child[j] = NULL;                newRoot->dcObj[j] = NULL;            }	    newRoot->child[0] = dctree;	    newRoot->dcObj[0] = new DC(-1,NULL,numFeat,NULL);	    for(int j=0;j<dctree->numChild;j++){	        newRoot->dcObj[0]->merge(dctree->dcObj[j]);	    }	    newRoot->child[1] = retNode;	    newRoot->dcObj[1] = new DC(-1,NULL,numFeat,NULL);	    for(int j=0;j<retNode->numChild;j++){	        newRoot->dcObj[1]->merge(retNode->dcObj[j]);	    }	    dctree = newRoot;	}    }    e_end = clock();    e_diff = e_end - e_start;    e_time = ((double) e_diff)/((double) CLOCKS_PER_SEC);      pCover = 0;    numClust = 0;    height = 0;    coverage = showTree(dctree,0,&pCover,&numClust);    printf("Time is %.2lf sec\n",e_time);    printf("Height is %d\n",height);    printf("numCluster is %d\n",numClust);    printf("coverage is %d\n",coverage);    printf("p_coverage is %d\n",pCover);        int B=0;    double A =0.0;    for(int i=0;i<numTopic;i++){	A = A + ((double) N3[i])*(F[i]);	B = B + N3[i];    }    printf("Overall F-measure is %.4lf\n",A/((double) B));    fclose(input);    return 1;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -