⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cluster.c

📁 层次聚类算法中的cure算法
💻 C
📖 第 1 页 / 共 2 页
字号:
/* * cluster.c -- cluster main program * * $Log: cluster.c,v $ * Revision 1.23  1993/02/03  07:43:07  stolcke * added code vector output for cluster * * Revision 1.22  1993/01/20  10:28:02  stolcke * faster O(n^2) clustering * * Revision 1.21  1993/01/16  01:44:34  stolcke * use triangular distance matrix (save 1/2 the memory) * * Revision 1.20  1993/01/16  00:27:07  stolcke * make dflag printing more efficient * * Revision 1.19  1993/01/16  00:03:30  stolcke * avoid unnecessary reallocation of patterns * skip non-root distance entries more efficiently * * Revision 1.18  1992/02/12  07:13:13  stolcke * -E option added * * Revision 1.17  91/07/15  12:56:00  stolcke * width option added  *  * Revision 1.16  91/07/14  01:10:54  stolcke * curses support added * graphing routines moved to separate file *  * Revision 1.15  91/07/10  21:26:54  stolcke * saber-cleaned and bad bug in allocation macro fixed *  * Revision 1.14  91/04/20  16:17:59  stolcke * second release (beta) *  * Revision 1.13  91/04/18  19:03:39  stolcke * some more error checks *  * Revision 1.12  91/04/18  18:28:21  stolcke * general cleanup and partial rewrite *  * Revision 1.11  91/04/18  13:29:29  stolcke * merged pca into cluster * * Revision 1.10  91/04/12  02:09:48  stolcke * name quoting added * * Revision 1.9  90/12/06  21:47:46  stolcke * fixed averaging with D/C values * * Revision 1.8  90/12/06  14:32:52  stolcke * support for don't care values added * fixed bug when total_distance = 0 * * Revision 1.7  90/11/12  20:31:22  stolcke * two initialization fixed * * Revision 1.6  90/11/12  18:43:26  stolcke * workaround to avoid old Sun compiler bug * * Revision 1.5  90/11/12  16:40:34  stolcke * added metric selection and scaling * * Revision 1.4  90/11/11  18:14:27  stolcke * -n flag no longer needed * * Revision 1.3  90/11/11  02:30:14  stolcke * no more seek needed, can use in pipe now * * Revision 1.2  90/11/10  23:44:02  stolcke * added new features (see manpage) * */#if !defined(lint) && !defined(SABER)static char rcsid[] = "$Header: /usr/src/local/conn/cluster/RCS/cluster.c,v 1.23 1993/02/03 07:43:07 stolcke Exp $";#endif				/* not lint */#include <stdio.h>#include <string.h>#include <math.h>#include "alloc.h"#include "error.h"#include "cluster.h"#define NONE (-2)#define BUFSIZE 256#ifndef SCALE#define SCALE "_SCALE_"#endif#ifndef DONTCARE#define DONTCARE "D/C"#endif#ifndef MAXFLOAT#define     MAXFLOAT        ((float)3.40282346638528860e+38)#endifstatic FLOAT   distance();static FLOAT   root();static FLOAT   cure_distance();static void    merge();static void    repInTree();static void print_rep_points();static char buffer[BUFSIZE];	/* temporary string read buffer */#define PCA	"pca"char   *program;		/* program name */int     vflag = 0;		/* print explanatory messages */static int pflag = 0;		/* do PCA instead of HCA */static int dflag = 0;		/* print distances */static int tflag = 0;		/* print tree in ASCII */static int Tflag = 0;		/* display tree using curses */static int gflag = 0;		/* print tree in graph(1) format */static int bflag = 0;		/* print tree in graph(1) format, with breaks */static int Bflag = 0;		/* print patterns as bit vectors */static int sflag = 0;		/* suppress scaling  */int	Eflag = 0;		/* print eigenvalues */static int width = 0;		/* width of ASCII tree representation */static int norm = 2;		/* which l-norm to use for distance metric */static int csize = 0; 		/* number of clusters */static int rsize = 10; 		/* number of rep points in a cluster */static float alpha = 0.3;	/* shrinking factor alpha */static usage(){    if (!pflag)	fprintf(stderr,	   "usage: %s [-dtTgbBsv] [-w width] [-n norm] [-k no-cluster] [-r rep-size] [-a alpha] [vectorfile [namesfile]]\n", program);    else	fprintf(stderr,		"usage: %s [-Esv] [-e eigenbase] [-c pcs] [vectorfile [namesfile]]\n", PCA);    exit(2);}main(argc, argv)    int     argc;    char   *argv[];{    FILE   *fp, *pfile;    char   *efile = NULL;    char   *comps = NULL;    FLOAT **pattern = NULL;    int     lpat, npat;    char  **name = NULL;    int     opt;    extern char *optarg;    extern int optind;    int *pinfo, i;    char fname[80];    /* who are we ? */    if ((program = strrchr(argv[0], '/')) == NULL)	program = argv[0];    else	program += 1;    if (strcmp(program, PCA) == 0)	pflag = 1;    while ((opt = getopt(argc, argv, "pdtTgbBvsn:a:r:k:e:c:w:E")) != -1)	switch (opt) {	case 'p':	    pflag = 1;	    break;	case 'd':	    if (pflag)		usage();	    dflag = 1;	    break;	case 't':	    if (pflag)		usage();	    tflag = 1;	    break;	case 'T':	    if (pflag)		usage();	    Tflag = 1;	    break;	case 'g':	    if (pflag)		usage();	    gflag = 1;	    break;	case 'b':	    if (pflag)		usage();	    bflag = 1;	    gflag = 1;	    break;	case 'B':	    if (pflag)		usage();	    Bflag = 1;	    break;	case 'v':	    vflag = 1;	    break;	case 'n':	    if (pflag)		usage();	    if (sscanf(optarg, "%d", &norm) != 1 || norm < 0)		usage();	    break;	case 'k':	    if (pflag)		usage();	    if (sscanf(optarg, "%d", &csize) != 1 || csize < 0)		usage();	    break;	case 'a':	    if (pflag)		usage();	    if (sscanf(optarg, "%f", &alpha) != 1 || alpha < 0 || alpha > 1.0)		usage();	    break;	case 'r':	    if (pflag)		usage();	    if (sscanf(optarg, "%d", &rsize) != 1 || rsize < 0)		usage();	    break;	case 'e':	    if (!pflag)		usage();	    efile = optarg;	    break;	case 'c':	    if (!pflag)		usage();	    comps = optarg;	    break;	case 's':	    sflag = 1;	    break;	case 'w':	    if (pflag)		usage();	    if (sscanf(optarg, "%d", &width) != 1 || width <= 0)		usage();	    break;	case 'E':	    if (!pflag)		usage();	    Eflag = 1;	    break;	case '?':	    usage();	    break;	}    if (!(pflag || dflag || tflag || Tflag || gflag || Bflag))	dflag = tflag = vflag = 1;	/* default behavior */    if (optind + 2 < argc)	usage();    if (!(optind < argc) || !strcmp(argv[optind], "-"))	fp = stdin;    else	IfErr(fp = fopen(argv[optind], "r")) {	    fprintf(stderr, "%s: cannot open file %s\n", argv[0], argv[optind]);	    exit(1);	}    IfErr(read_pattern(fp, &pattern, &lpat, &npat, &name)) {	fprintf(stderr, "%s: %s: cannot read pattern\n", program, ERR_MSG);	exit(1);    }    if (vflag)	fprintf(stderr, "read %d patterns:  size = %d\n", npat, lpat);    if (optind + 1 < argc) {	IfErr (name = new_array_of(npat, char *)) {;	    fprintf(stderr, "%s: not enough core for name array\n", program);	    exit(1);	}	if (!strcmp(argv[optind + 1], "-"))	    fp = stdin;	else	    IfErr(fp = fopen(argv[optind + 1], "r")) {		fprintf(stderr, "%s: cannot open file %s\n",			program, argv[optind + 1]);		exit(1);	    }	IfErr(read_names(fp, name, npat)) {	    fprintf(stderr, "%s: %s: cannot read names\n", program, ERR_MSG);	    exit(1);	}    }    IfErr(pinfo = new_array_of(npat, int)) {            fprintf(stderr, "%s: not enough core\n", program);            exit(1);    }    sprintf(fname, "%s-partition", argv[optind]);    IfErr(pfile = fopen(fname, "w")) {            fprintf(stderr, "%s: cannot open file %s\n", argv[0], argv[optind]);            exit(1);    }    cluster(pattern, name, lpat, npat, pinfo);    for (i=0; i<npat; ++i)        fprintf(pfile, "%d\n", pinfo[i]);    free(pinfo);    fclose(pfile);    exit(0);}/* skip blanks and next end of line */skip_blanks(fp)    FILE   *fp;{    char    c;    while ((c = getc(fp)) == ' ' || c == '\t');    if (c != '\n')	ungetc(c, fp);    return c;}read_names(fp, name, npat)    FILE   *fp;    char  **name;    int     npat;{    register int i;    for (i = 0; i < npat; i++) {	char    c = skip_blanks(fp);	if (c == '\"') {	    getc(fp);	    fgets(buffer, sizeof(buffer), fp);	    buffer[strlen(buffer) - 1] = '\0';	}	else {	    IfEOF(fscanf(fp, "%s", buffer))		Erreturn("not enough names");	    skip_blanks(fp);	}	IfErr(name[i] = new_string(buffer))	    Erreturn("not enough core");    }    return MY_OK;}voidprint_names(tree, name)    BiTree *tree;    char  **name;{    if (tree->leaf != LEAF) {	if (name)	    printf(" %s", name[tree->leaf]);	else	    printf(" %d", tree->leaf);    }    else {	print_names(tree->r_tree, name);	print_names(tree->l_tree, name);    }}voidprint_k_cluster(tree, name, pinfo, cno)    BiTree *tree;    char  **name;    int   *pinfo;    int   cno;{    if (tree->leaf != LEAF) {/*        if (name)            printf("%d : %s\n", tree->leaf, name[tree->leaf]);        else            printf("%d : %d\n", tree->leaf, tree->leaf);*/        pinfo[tree->leaf] = cno;    }    else {        print_k_cluster(tree->r_tree, name, pinfo, cno);        print_k_cluster(tree->l_tree, name, pinfo, cno);    }}find_nnb(items, which, lpat, index, ndist)    BiTree *items;	/* node array */    int which;		/* index of node to find nearest neightbor for */    int lpat;		/* pattern length */    int *index;		/* returns: index of nnb (-1 if none) */    FLOAT *ndist;	/* returns: distance to nnb */{    int i;    FLOAT dist, min_dist;    int min_index;    if (items[which].root == FALSE) {	*index = NONE;	return;    }    min_index = NONE;    min_dist = 0.0;    /*     * find minimum distance neighbor -- to avoid duplication     * only pairs with 1st index < 2nd index are considered     */    for (i = 0; i < which; i++) {	if (items[i].root == FALSE || items[i].pruned == TRUE)	    continue;		dist = cure_distance(&items[which], &items[i], lpat);	if (min_index == NONE || dist < min_dist) {	    min_index = i;	    min_dist = dist;	}    }    *index = min_index;    if (min_index >= 0)	*ndist = min_dist;    return;}#define FirstPruneRatio 0.3333#define SecondPruneMulti 3#define FirstCut 3#define SecondCut 10cluster(pattern, name, lpat, npat, pinfo)    FLOAT **pattern;    char  **name;    int     lpat, npat;    int   *pinfo;{    register int i, j, k, nodes = npat, cno;    BiTree *item = new_array_of(npat, BiTree);    /*     * for each data point or cluster center, we keep the index of the nearest     * neighbor, as well as the distance to it.     */    int *nnb_index = new_array_of(npat, int);    FLOAT *nnb_dist = new_array_of(npat, FLOAT);    if (item == NULL || nnb_index == NULL || nnb_dist == NULL) {	    fprintf(stderr, "%s: not enough core\n", program);	    exit(1);    }    /*     * initialize leaf nodes     */    for (i = 0; i < npat; i++) {	item[i].pat = pattern[i];	IfErr (item[i].mean = new_array_of(lpat, FLOAT)) {	    fprintf(stderr, "%s: not enough core\n", program);	    exit(1);	}	memcpy(item[i].mean, item[i].pat, lpat*sizeof(FLOAT));	IfErr (item[i].rep = new_array_of(rsize, FLOAT *)) {	    fprintf(stderr, "%s: not enough core\n", program);	    exit(1);	}	for (j=0; j < rsize; ++j) {		IfErr (item[i].rep[j] = new_array_of(lpat, FLOAT)) {	    		fprintf(stderr, "%s: not enough core\n", program);	    		exit(1);		}	}	memcpy(item[i].rep[0], item[i].pat, lpat*sizeof(FLOAT));	item[i].root = TRUE;	item[i].size = 1;	item[i].leaf = i;	item[i].distance = 0.0;	item[i].l_tree = item[i].r_tree = NULL;    }    /*     * initialize nearest neighbors     */    for (i = 0; i < npat; i++) {	find_nnb(item, i, lpat, &nnb_index[i], &nnb_dist[i]);/*printf("%d : %d %f %f %f\n", i, nnb_index[i], nnb_dist[i], item[i].rep[0][0], item[i].rep[0][1]);*/    }    /*     * cluster until done     */    while (nodes > csize) {	BiTree  *newitem;	FLOAT   dist, min_dist;	int     pair1, pair2;		/*	 * find minimum distance pair	 */	pair1 = NONE;	min_dist = 0.0;	for (i = 0; i < npat; i++) {	    if (item[i].root == FALSE || item[i].pruned == TRUE)		continue;	    if (nnb_index[i] != NONE &&		(pair1 == NONE || nnb_dist[i] < min_dist)) {		pair1 = i;		pair2 = nnb_index[i];		min_dist = nnb_dist[i];	    }	}	if (pair1 == NONE)	    break;		/* analysis finished */	min_dist = root(min_dist);	if (dflag) {	    printf("minimum distance = %f\t(", (float)min_dist);	    print_names (&item[pair1], name);	    printf(" )\t(");	    print_names (&item[pair2], name);	    printf(" )\n");

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -