📄 cluster.c
字号:
/* * cluster.c -- cluster main program * * $Log: cluster.c,v $ * Revision 1.23 1993/02/03 07:43:07 stolcke * added code vector output for cluster * * Revision 1.22 1993/01/20 10:28:02 stolcke * faster O(n^2) clustering * * Revision 1.21 1993/01/16 01:44:34 stolcke * use triangular distance matrix (save 1/2 the memory) * * Revision 1.20 1993/01/16 00:27:07 stolcke * make dflag printing more efficient * * Revision 1.19 1993/01/16 00:03:30 stolcke * avoid unnecessary reallocation of patterns * skip non-root distance entries more efficiently * * Revision 1.18 1992/02/12 07:13:13 stolcke * -E option added * * Revision 1.17 91/07/15 12:56:00 stolcke * width option added * * Revision 1.16 91/07/14 01:10:54 stolcke * curses support added * graphing routines moved to separate file * * Revision 1.15 91/07/10 21:26:54 stolcke * saber-cleaned and bad bug in allocation macro fixed * * Revision 1.14 91/04/20 16:17:59 stolcke * second release (beta) * * Revision 1.13 91/04/18 19:03:39 stolcke * some more error checks * * Revision 1.12 91/04/18 18:28:21 stolcke * general cleanup and partial rewrite * * Revision 1.11 91/04/18 13:29:29 stolcke * merged pca into cluster * * Revision 1.10 91/04/12 02:09:48 stolcke * name quoting added * * Revision 1.9 90/12/06 21:47:46 stolcke * fixed averaging with D/C values * * Revision 1.8 90/12/06 14:32:52 stolcke * support for don't care values added * fixed bug when total_distance = 0 * * Revision 1.7 90/11/12 20:31:22 stolcke * two initialization fixed * * Revision 1.6 90/11/12 18:43:26 stolcke * workaround to avoid old Sun compiler bug * * Revision 1.5 90/11/12 16:40:34 stolcke * added metric selection and scaling * * Revision 1.4 90/11/11 18:14:27 stolcke * -n flag no longer needed * * Revision 1.3 90/11/11 02:30:14 stolcke * no more seek needed, can use in pipe now * * Revision 1.2 90/11/10 23:44:02 stolcke * added new features (see manpage) * */#if !defined(lint) && !defined(SABER)static char rcsid[] = "$Header: /usr/src/local/conn/cluster/RCS/cluster.c,v 1.23 1993/02/03 07:43:07 stolcke Exp $";#endif /* not lint */#include <stdio.h>#include <string.h>#include <math.h>#include "alloc.h"#include "error.h"#include "cluster.h"#define NONE (-2)#define BUFSIZE 256#ifndef SCALE#define SCALE "_SCALE_"#endif#ifndef DONTCARE#define DONTCARE "D/C"#endif#ifndef MAXFLOAT#define MAXFLOAT ((float)3.40282346638528860e+38)#endifstatic FLOAT distance();static FLOAT root();static FLOAT cure_distance();static void merge();static void repInTree();static void print_rep_points();static char buffer[BUFSIZE]; /* temporary string read buffer */#define PCA "pca"char *program; /* program name */int vflag = 0; /* print explanatory messages */static int pflag = 0; /* do PCA instead of HCA */static int dflag = 0; /* print distances */static int tflag = 0; /* print tree in ASCII */static int Tflag = 0; /* display tree using curses */static int gflag = 0; /* print tree in graph(1) format */static int bflag = 0; /* print tree in graph(1) format, with breaks */static int Bflag = 0; /* print patterns as bit vectors */static int sflag = 0; /* suppress scaling */int Eflag = 0; /* print eigenvalues */static int width = 0; /* width of ASCII tree representation */static int norm = 2; /* which l-norm to use for distance metric */static int csize = 0; /* number of clusters */static int rsize = 10; /* number of rep points in a cluster */static float alpha = 0.3; /* shrinking factor alpha */static usage(){ if (!pflag) fprintf(stderr, "usage: %s [-dtTgbBsv] [-w width] [-n norm] [-k no-cluster] [-r rep-size] [-a alpha] [vectorfile [namesfile]]\n", program); else fprintf(stderr, "usage: %s [-Esv] [-e eigenbase] [-c pcs] [vectorfile [namesfile]]\n", PCA); exit(2);}main(argc, argv) int argc; char *argv[];{ FILE *fp, *pfile; char *efile = NULL; char *comps = NULL; FLOAT **pattern = NULL; int lpat, npat; char **name = NULL; int opt; extern char *optarg; extern int optind; int *pinfo, i; char fname[80]; /* who are we ? */ if ((program = strrchr(argv[0], '/')) == NULL) program = argv[0]; else program += 1; if (strcmp(program, PCA) == 0) pflag = 1; while ((opt = getopt(argc, argv, "pdtTgbBvsn:a:r:k:e:c:w:E")) != -1) switch (opt) { case 'p': pflag = 1; break; case 'd': if (pflag) usage(); dflag = 1; break; case 't': if (pflag) usage(); tflag = 1; break; case 'T': if (pflag) usage(); Tflag = 1; break; case 'g': if (pflag) usage(); gflag = 1; break; case 'b': if (pflag) usage(); bflag = 1; gflag = 1; break; case 'B': if (pflag) usage(); Bflag = 1; break; case 'v': vflag = 1; break; case 'n': if (pflag) usage(); if (sscanf(optarg, "%d", &norm) != 1 || norm < 0) usage(); break; case 'k': if (pflag) usage(); if (sscanf(optarg, "%d", &csize) != 1 || csize < 0) usage(); break; case 'a': if (pflag) usage(); if (sscanf(optarg, "%f", &alpha) != 1 || alpha < 0 || alpha > 1.0) usage(); break; case 'r': if (pflag) usage(); if (sscanf(optarg, "%d", &rsize) != 1 || rsize < 0) usage(); break; case 'e': if (!pflag) usage(); efile = optarg; break; case 'c': if (!pflag) usage(); comps = optarg; break; case 's': sflag = 1; break; case 'w': if (pflag) usage(); if (sscanf(optarg, "%d", &width) != 1 || width <= 0) usage(); break; case 'E': if (!pflag) usage(); Eflag = 1; break; case '?': usage(); break; } if (!(pflag || dflag || tflag || Tflag || gflag || Bflag)) dflag = tflag = vflag = 1; /* default behavior */ if (optind + 2 < argc) usage(); if (!(optind < argc) || !strcmp(argv[optind], "-")) fp = stdin; else IfErr(fp = fopen(argv[optind], "r")) { fprintf(stderr, "%s: cannot open file %s\n", argv[0], argv[optind]); exit(1); } IfErr(read_pattern(fp, &pattern, &lpat, &npat, &name)) { fprintf(stderr, "%s: %s: cannot read pattern\n", program, ERR_MSG); exit(1); } if (vflag) fprintf(stderr, "read %d patterns: size = %d\n", npat, lpat); if (optind + 1 < argc) { IfErr (name = new_array_of(npat, char *)) {; fprintf(stderr, "%s: not enough core for name array\n", program); exit(1); } if (!strcmp(argv[optind + 1], "-")) fp = stdin; else IfErr(fp = fopen(argv[optind + 1], "r")) { fprintf(stderr, "%s: cannot open file %s\n", program, argv[optind + 1]); exit(1); } IfErr(read_names(fp, name, npat)) { fprintf(stderr, "%s: %s: cannot read names\n", program, ERR_MSG); exit(1); } } IfErr(pinfo = new_array_of(npat, int)) { fprintf(stderr, "%s: not enough core\n", program); exit(1); } sprintf(fname, "%s-partition", argv[optind]); IfErr(pfile = fopen(fname, "w")) { fprintf(stderr, "%s: cannot open file %s\n", argv[0], argv[optind]); exit(1); } cluster(pattern, name, lpat, npat, pinfo); for (i=0; i<npat; ++i) fprintf(pfile, "%d\n", pinfo[i]); free(pinfo); fclose(pfile); exit(0);}/* skip blanks and next end of line */skip_blanks(fp) FILE *fp;{ char c; while ((c = getc(fp)) == ' ' || c == '\t'); if (c != '\n') ungetc(c, fp); return c;}read_names(fp, name, npat) FILE *fp; char **name; int npat;{ register int i; for (i = 0; i < npat; i++) { char c = skip_blanks(fp); if (c == '\"') { getc(fp); fgets(buffer, sizeof(buffer), fp); buffer[strlen(buffer) - 1] = '\0'; } else { IfEOF(fscanf(fp, "%s", buffer)) Erreturn("not enough names"); skip_blanks(fp); } IfErr(name[i] = new_string(buffer)) Erreturn("not enough core"); } return MY_OK;}voidprint_names(tree, name) BiTree *tree; char **name;{ if (tree->leaf != LEAF) { if (name) printf(" %s", name[tree->leaf]); else printf(" %d", tree->leaf); } else { print_names(tree->r_tree, name); print_names(tree->l_tree, name); }}voidprint_k_cluster(tree, name, pinfo, cno) BiTree *tree; char **name; int *pinfo; int cno;{ if (tree->leaf != LEAF) {/* if (name) printf("%d : %s\n", tree->leaf, name[tree->leaf]); else printf("%d : %d\n", tree->leaf, tree->leaf);*/ pinfo[tree->leaf] = cno; } else { print_k_cluster(tree->r_tree, name, pinfo, cno); print_k_cluster(tree->l_tree, name, pinfo, cno); }}find_nnb(items, which, lpat, index, ndist) BiTree *items; /* node array */ int which; /* index of node to find nearest neightbor for */ int lpat; /* pattern length */ int *index; /* returns: index of nnb (-1 if none) */ FLOAT *ndist; /* returns: distance to nnb */{ int i; FLOAT dist, min_dist; int min_index; if (items[which].root == FALSE) { *index = NONE; return; } min_index = NONE; min_dist = 0.0; /* * find minimum distance neighbor -- to avoid duplication * only pairs with 1st index < 2nd index are considered */ for (i = 0; i < which; i++) { if (items[i].root == FALSE || items[i].pruned == TRUE) continue; dist = cure_distance(&items[which], &items[i], lpat); if (min_index == NONE || dist < min_dist) { min_index = i; min_dist = dist; } } *index = min_index; if (min_index >= 0) *ndist = min_dist; return;}#define FirstPruneRatio 0.3333#define SecondPruneMulti 3#define FirstCut 3#define SecondCut 10cluster(pattern, name, lpat, npat, pinfo) FLOAT **pattern; char **name; int lpat, npat; int *pinfo;{ register int i, j, k, nodes = npat, cno; BiTree *item = new_array_of(npat, BiTree); /* * for each data point or cluster center, we keep the index of the nearest * neighbor, as well as the distance to it. */ int *nnb_index = new_array_of(npat, int); FLOAT *nnb_dist = new_array_of(npat, FLOAT); if (item == NULL || nnb_index == NULL || nnb_dist == NULL) { fprintf(stderr, "%s: not enough core\n", program); exit(1); } /* * initialize leaf nodes */ for (i = 0; i < npat; i++) { item[i].pat = pattern[i]; IfErr (item[i].mean = new_array_of(lpat, FLOAT)) { fprintf(stderr, "%s: not enough core\n", program); exit(1); } memcpy(item[i].mean, item[i].pat, lpat*sizeof(FLOAT)); IfErr (item[i].rep = new_array_of(rsize, FLOAT *)) { fprintf(stderr, "%s: not enough core\n", program); exit(1); } for (j=0; j < rsize; ++j) { IfErr (item[i].rep[j] = new_array_of(lpat, FLOAT)) { fprintf(stderr, "%s: not enough core\n", program); exit(1); } } memcpy(item[i].rep[0], item[i].pat, lpat*sizeof(FLOAT)); item[i].root = TRUE; item[i].size = 1; item[i].leaf = i; item[i].distance = 0.0; item[i].l_tree = item[i].r_tree = NULL; } /* * initialize nearest neighbors */ for (i = 0; i < npat; i++) { find_nnb(item, i, lpat, &nnb_index[i], &nnb_dist[i]);/*printf("%d : %d %f %f %f\n", i, nnb_index[i], nnb_dist[i], item[i].rep[0][0], item[i].rep[0][1]);*/ } /* * cluster until done */ while (nodes > csize) { BiTree *newitem; FLOAT dist, min_dist; int pair1, pair2; /* * find minimum distance pair */ pair1 = NONE; min_dist = 0.0; for (i = 0; i < npat; i++) { if (item[i].root == FALSE || item[i].pruned == TRUE) continue; if (nnb_index[i] != NONE && (pair1 == NONE || nnb_dist[i] < min_dist)) { pair1 = i; pair2 = nnb_index[i]; min_dist = nnb_dist[i]; } } if (pair1 == NONE) break; /* analysis finished */ min_dist = root(min_dist); if (dflag) { printf("minimum distance = %f\t(", (float)min_dist); print_names (&item[pair1], name); printf(" )\t("); print_names (&item[pair2], name); printf(" )\n");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -