📄 cluster.c
字号:
//////////////////////////////////////////////////////////////////////////////////////////////
// 聚类处理函数
//////////////////////////////////////////////////////////////////////////////////////////////
#include <io.h>#include <ctype.h>#include <errno.h>#include <math.h>#include <stdarg.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#define BUFSIZE 2048char buffer [BUFSIZE];
char *no_mem_buffer;
#define MIN(A, B) ((A) < (B) ? (A) : (B))#define MAX(A, B) ((A) > (B) ? (A) : (B))typedef struct{ int used; int n_items; int cl1; int cl2; int cluster; float f;} CLUSTER;CLUSTER *cl;char **labels;
int *LDNum;
float **diff;
int size;int input_line = 0;int sorted = 1;char *programname;char Out_of_memory [] = "Out of memory";
/*********************************************************************************
* 其它功能函数
*********************************************************************************/
void errit (char const *format, ...)
{
va_list list;
fprintf (stderr, "\nError %s: ", programname);
va_start (list, format);
vfprintf (stderr, format, list);
fprintf (stderr, "\n\n");
exit (1);
}
char const *SortCluster (int i)
{
int n;
char const *s1, *s2;
if (cl [i].cl1 < size) s1 = labels [cl [i].cl1];
else s1 = SortCluster (cl [i].cl1);
if (cl [i].cl2 < size) s2 = labels [cl [i].cl2];
else s2 = SortCluster (cl [i].cl2);
if (strcmp (s1, s2) > 0) {
n = cl [i].cl1;
cl [i].cl1 = cl [i].cl2;
cl [i].cl2 = n;
return s2;
}
else return s1;
}
void *s_malloc (size_t size)
{
void *p;
p = malloc (size);
if (! p) {
free (no_mem_buffer);
errit (Out_of_memory);
}
return p;
}
char *s_strdup (const char *s)
{
char *p = strdup (s);
if (! p) {
free (no_mem_buffer);
errit (Out_of_memory);
}
return p;
}
int getline (FILE *fp, int required, char const *filename)
{
int i;
for (;;) {
/* 从文件中读一字符串 */
if (fgets (buffer, BUFSIZE, fp) == NULL) {
if (required) errit ("Unexpected end of file in \"%s\"", filename);
else return 0;
}
input_line++;
i = strlen (buffer);
while (i && isspace (buffer [i - 1])) buffer [--i] = '\0';
i = 0;
while (buffer [i] && isspace (buffer [i])) i++;
if (buffer [i] == '#') continue;
if (buffer [i]) {
memmove (buffer, (buffer + i), strlen (buffer + i) + 1);
return 1;
}
}
}
char a_s [] = "Single";
char a_c [] = "Complete";
char a_g [] = "Group";
char a_a [] = "Average";
void syntax ()
{
fprintf (stderr, "\nSyntax: %s -s|-c|-g|-a [-u] [difference table file]\n\n"
"Clustering algorithm:\n"
" -s : %s\n"
" -c : %s\n"
" -g : %s\n"
" -a : %s\n"
" -u : unsorted\n\n",
programname,
a_s,
a_c,
a_g,
a_a);
exit (0);
}
/*********************************************************************************
* Cluster 距离的计算方法
*********************************************************************************/
// 1: Single Link (Nearest Neighbor) 最小距离
void update_s (int i)
{
int j, cl1, cl2;
cl1 = cl [i].cl1;
cl2 = cl [i].cl2;
/* 重新计算此合并类与其它类的距离值 ---- 取其中的最小值 */
diff [i][i] = 0.0;
for (j = 0; j < i; j++) {
// 选择两个合并聚类中最小的距离
diff [i][j] = diff [j][i] = MIN (diff [cl1][j], diff [cl2][j]);
}
/* 把合并的两个子类的类下标赋值为合并后的新类编码 */
for (j = 0; j < i; j++) {
if (cl [j].cluster == cl [i].cl1 || cl [j].cluster == cl [i].cl2 ) cl [j].cluster = i;
}
}
// 2: Complete Link 最大距离
void update_c (int i)
{
int j, cl1, cl2;
cl1 = cl [i].cl1;
cl2 = cl [i].cl2;
diff [i][i] = 0.0;
for (j = 0; j < i; j++) {
// 选择两个合并聚类中最大的距离
diff [i][j] = diff [j][i] = MAX (diff [cl1][j], diff [cl2][j]);
}
for (j = 0; j < i; j++) {
if (cl [j].cluster == cl [i].cl1 || cl [j].cluster == cl [i].cl2 ) cl [j].cluster = i;
}
}
// 3: Group Average 平均距离
void update_g (int i)
{
int j, cl1, cl2;
cl1 = cl [i].cl1;
cl2 = cl [i].cl2;
diff [i][i] = 0.0;
for (j = 0; j < i; j++) {
diff[j][i]=diff[i][j]=( ((float) cl [cl1].n_items) * diff [j][cl1] + ((float) cl [cl2].n_items) * diff [j][cl2] )
/ ((float) (cl [cl1].n_items + cl [cl2].n_items));
}
for (j = 0; j < i; j++) {
if (cl [j].cluster == cl [i].cl1 || cl [j].cluster == cl [i].cl2 ) cl [j].cluster = i;
}
}
// 4: Equal Weighted Average 平均值距离
void update_a (int i)
{
int j, cl1, cl2;
cl1 = cl [i].cl1;
cl2 = cl [i].cl2;
diff [i][i] = 0.0;
for (j = 0; j < i; j++) {
diff [j][i] = diff [i][j] = (float)( (diff [j][cl1] + diff [j][cl2]) * 0.5 );
}
for (j = 0; j < i; j++) {
if (cl [j].cluster == cl [i].cl1 || cl [j].cluster == cl [i].cl2 ) cl [j].cluster = i;
}
}
//////////////////////////////////////////////////////////////////////////////////////////////////////
// 主函数
//////////////////////////////////////////////////////////////////////////////////////////////////////
int main (int argc, char *argv []){ FILE *fp;
char *infile, *name_update;
int i,j,k;
int cl1,cl2; float d;
int TotalNum;
void (*update_function)(int); no_mem_buffer = (char *) malloc (1024); programname = argv[0]; // 判定距离计算函数 ////////////////////////////////////////////////
update_function = NULL; while (argc > 1 && argv [1][0] == '-') { if ( (!strcmp (argv [1], "-s")) ) { update_function = update_s; name_update = a_s; } else if (! strcmp (argv [1], "-c")) { update_function = update_c; name_update = a_c; } else if (! strcmp (argv [1], "-g")) { update_function = update_g; name_update = a_g; } else if (! strcmp (argv [1], "-a")) { update_function = update_a; name_update = a_a; } else if (! strcmp (argv [1], "-u")) { sorted = 0; } else
syntax ();
/* 读下一个命令行参数 */
argc--; argv++; }
// 如果距离计算函数为空,显示命令行格式出错 /////////////////////////////////
if (! update_function) syntax ();
// 打开数据文件 //////////////////////////////////////////////////////////// switch (argc) { case 1: if (isatty (fileno (stdin))) syntax (); fp = stdin; infile = "<stdin>"; break; case 2: infile = argv [1]; fp = fopen (infile, "r"); if (! fp) {
errit ("Opening file \"%s\": %s", infile, strerror (errno));
}
break; default: syntax (); }
// 读取数据条总数量 //////////////////////////////////////////////////////
getline (fp, 1, infile);
sscanf (buffer, "%d", &TotalNum);
// 读取数据条数量 ////////////////////////////////////////////////////////
getline (fp, 1, infile); if (sscanf (buffer, "%d", &size) != 1) { errit ("file \"%s\", line %i\nTable size expected", infile, input_line);
}
//printf(" # Labels_Num = %d \n",size);
// 读取 Label 名称 && 每个 Label 含有的数据实例 //////////////////////////
labels = (char **) s_malloc (size * sizeof (char *));
LDNum = (int *) calloc((size+1),sizeof(int)); for (i = 0; i < size; i++) { getline (fp, 1, infile);
labels [i] = s_strdup (buffer);
//printf("# labels[%d] = %s \n",i,labels [i]);
getline (fp, 1, infile);
LDNum[i] = (int)(atoi(&buffer[0]));
// 读取每个正常度的数据实例数目
}
// 读取Label之间的距离值 ///////////////////////////////////////////////// // 分配空间
diff = (float **) s_malloc ((2 * size - 1) * sizeof (float *)); for (i = 0; i < 2 * size - 1; i++) { diff [i] = (float *) s_malloc ((2 * size - 1) * sizeof (float));
}
// 距离取值矩阵
for (i = 0; i < size; i++) { diff [i][i] = 0.0; for (j = 0; j < i; j++) { getline (fp, 1, infile); if (sscanf (buffer, "%f", &d) != 1) { errit ("file \"%s\", line %i\nValue expected", infile, input_line);
}
diff [j][i] = diff [i][j] = d;
//printf("# diff[%d][%d] = %f \n",i,j,diff [i][j]);
} }
// 关闭数据文件 ////////////////////////////////////////////////////////// if (fp != stdin) fclose (fp); cl = (CLUSTER *) s_malloc ((2 * size - 1) * sizeof (CLUSTER)); // 每个数据实例一个聚类,记录下各个聚类的情况
for (i = 0; i < size; i++) { cl [i].used = 0; cl [i].n_items = 1; // ??? cl [i].cl1 = -1;
cl [i].cl2 = -1; cl [i].f = 0.0; cl [i].cluster = i; /* Label 的下标 */ }
// 进行聚类分析处理 /////////////////////////////////////////// for (i = size; i < (2 * size - 1) ; i++) { cl [i].used = 0; d = 1000000.00;
// 找出所有 used =0 的类别之间的最小距离 diff [j][k] /////
for (j = 0; j < i; j++) { if (! cl [j].used) { for (k = 0; k < j; k++) { if ((! cl [k].used) && (diff [j][k] < d)) { cl1 = j; cl2 = k; d = diff [j][k]; }
}
}
}
cl [i].n_items = cl [cl1].n_items + cl [cl2].n_items; cl [cl1].used = cl [cl2].used = 1; cl [i].cl1 = cl1; /* Cluster 1 的下标值 */ cl [i].cl2 = cl2; /* Cluster 2 的下标值 */ cl [i].f = d; /* 距离值 */ cl [i].cluster = i; update_function (i); /* i 是当前最后处理聚类的下标 */ } if (sorted) SortCluster (2 * size - 2);
// 数据输出 ///////////////////////////////////////////////////////////////
printf ("%d\n",TotalNum);
for (i = size; i < 2 * size - 1; i++) { printf ("%i %f\n", i, cl [i].f); /* < size 是具体的Label */
if (cl [i].cl1 < size) {
printf ("L %s\n", labels [cl [i].cl1]);
printf ("%d\n",LDNum[cl[i].cl1]); // 输出 label 中含有的数据实例数目
} /* > size 是一个类 */
else {
printf ("C %i\n", cl [i].cl1);
}
if (cl [i].cl2 < size) {
printf ("L %s\n", labels[cl [i].cl2]);
printf ("%d\n",LDNum[cl[i].cl2]); // 输出 label 中含有的数据实例数目
} else {
printf ("C %i\n", cl [i].cl2);
} }
return 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -