📄 scluster.c
字号:
/*******************************************************************************
author : Bernhard Knab
filename : ghmm/ghmm/scluster.c
created : TIME: 15:47:54 DATE: Tue 16. November 1999
$Id: scluster.c,v 1.19 2003/12/19 15:06:17 cic99 Exp $
Copyright (C) 1998-2001, ZAIK/ZPR, Universit鋞 zu K鰈n
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*******************************************************************************/
#ifdef WIN32
# include "win_config.h"
#endif
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include <stdio.h>
#undef HAVE_LIBPTHREAD
#ifdef HAVE_LIBPTHREAD
# include <pthread.h>
#endif /* HAVE_LIBPTHREAD */
#include <float.h>
#include <math.h>
#include <time.h>
#include "mprintf.h"
#include "mes.h"
#include "scluster.h"
#include "smodel.h"
#include "sreestimate.h"
#include "sfoba.h"
#include "rng.h"
#include "sequence.h"
#include "const.h"
#include "matrix.h"
#include "vector.h"
#include "smap_classify.h"
#ifdef HAVE_LIBPTHREAD
/* switch for parallel mode: (1) = sequential, (0) = parallel */
# define POUT 0
/* number of parallel threads */
# define THREADS 4
#else
# define POUT 1
#endif /* HAVE_LIBPTHREAD */
#define POUT 1
/*============================================================================*/
int scluster_hmm(char* argv[]) {
# define CUR_PROC "scluster_hmm"
char *seq_file = argv[1], *smo_file = argv[2], *out_filename = argv[3];
int labels = atoi(argv[4]);
int res = -1, i, iter = 0, sqd_number, idummy;
sequence_d_t *sqd = NULL;
sequence_d_t **sqd_vec = NULL; /* only temp. pointer */
long j, changes = 1;
long *oldlabel, *smo_changed;
double log_p, log_apo;
double **all_log_p = NULL; /* for storing all log_p */
FILE *outfile = NULL;
char *str;
char filename[1024];
scluster_t cl;
double eps_bw, q;
int max_iter_bw;
/* sreestimate_baum_welch needs this structure (introduced for parallel mode) */
smosqd_t *cs;
#if POUT == 0
int *return_value;
pthread_t *tid;
int perror;
pthread_attr_t Attribute;
#endif /* POUT */
cl.smo = NULL;
cl.smo_seq = NULL;
cl.seq_counter = NULL;
cl.smo_Z_MD = NULL;
cl.smo_Z_MAW = NULL;
/* -----------------initialization -------------------------*/
fprintf(stdout, "Clustering seqs. \"%s\"\nwith ", seq_file);
if (CLASSIFY == 0) fprintf(stdout, "MD-Classification\n");
else fprintf(stdout, "MAW-Classification\n");
fflush(stdout);
/*-----------open output file----------------------------------*/
sprintf(filename, "%s%s", out_filename, ".cl");
if(!(outfile = mes_fopen(filename, "wt"))) {mes_proc(); goto STOP;}
/*--------------Memory allocation and data reading----------------------------*/
/* 1 sequence array and initial models */
scluster_print_header(outfile, argv);
/*--- memory alloc and read data ----------------------------*/
sqd_vec = sequence_d_read(seq_file, &sqd_number);
if (!sqd_vec) {mes_proc(); goto STOP;}
sqd = sqd_vec[0];
cl.smo = smodel_read(smo_file, &cl.smo_number);
if (!cl.smo) {mes_proc(); goto STOP;}
/* if labels == 0 || labels == 1: no startlabels needed. */
/* if `labels` = 3: random start labels */
if (labels == 3) {
fprintf(outfile, "(random start partition)n");
fprintf(stdout, "(Random start partition...)\n");
scluster_random_labels(sqd, cl.smo_number);
}
if(!m_calloc(oldlabel, sqd->seq_number)) {mes_proc(); goto STOP;}
for (i = 0; i < sqd->seq_number; i++)
oldlabel[i] = (-1);
if(!m_calloc(cl.smo_seq, cl.smo_number)) {mes_proc(); goto STOP;}
if(!m_calloc(cl.seq_counter, cl.smo_number)) {mes_proc(); goto STOP;}
if(!m_calloc(cl.smo_Z_MD, cl.smo_number)) {mes_proc(); goto STOP;}
if(!m_calloc(cl.smo_Z_MAW, cl.smo_number)) {mes_proc(); goto STOP;}
all_log_p = matrix_d_alloc(cl.smo_number, (int) sqd->seq_number);
if (!all_log_p) {mes_proc(); goto STOP;}
/*if (smodel_check_compatibility(cl.smo, cl.smo_number)) {
mes_proc(); goto STOP;
}*/
if(!m_calloc(smo_changed, cl.smo_number)) {mes_proc(); goto STOP;}
for (i = 0; i < cl.smo_number; i++) {
cl.smo_seq[i] = NULL;
smo_changed[i] = 1;
}
/* uninformative model prior */
if (CLASSIFY == 1)
for (i = 0; i < cl.smo_number; i++)
cl.smo[i]->prior = 1/(double) cl.smo_number;
/*--------for parallel mode --------------------*/
#if POUT == 0
/* id for threads */
if(!m_calloc(tid, cl.smo_number)) {mes_proc(); goto STOP;}
#endif /* POUT */
/* data structure for threads */
if(!m_calloc(cs, cl.smo_number)) {mes_proc(); goto STOP;}
for (i = 0; i < cl.smo_number; i++)
cs[i].smo = cl.smo[i];
/* return values for each thread */
#if POUT == 0
if(!m_calloc(return_value, cl.smo_number)) {mes_proc(); goto STOP;}
#endif /* POUT */
#ifdef HAVE_LIBPTHREAD
pthread_attr_init(&Attribute);
pthread_attr_setscope(&Attribute, PTHREAD_SCOPE_SYSTEM);
#endif /* HAVE_LIBPTHREAD */
/* eps_bw: stopping criterion in baum-welch
max_iter_bw: max. number of baum-welch iterations */
eps_bw = 0.0; /* not used */
q = 0.1; /* ??? */
max_iter_bw = 20; /*MAX_ITER_BW; */
/*------------------------main loop-------------------------------------------*/
/* do it until no sequence changes model;
attention: error function values are not used directly as a stopping criterion */
while(changes > 0 || eps_bw > EPS_ITER_BW || max_iter_bw < MAX_ITER_BW) {
iter++;
/* reset error functions and counters */
for (i = 0; i < cl.smo_number; i++) {
cl.seq_counter[i] = 0;
cl.smo_Z_MD[i] = 0.0;
cl.smo_Z_MAW[i] = 0.0;
}
/* set pointer of cs to sqd-field and matrix of all_logp values */
for (i = 0; i < cl.smo_number; i++) {
cs[i].sqd = sqd; /* all seqs. ! */
cs[i].logp = all_log_p[i];
}
/* ------------calculate logp for all seqs. and all models --------------*/
#if POUT
/* sequential version */
for (i = 0; i < cl.smo_number; i++) {
if (!smo_changed[i]) continue;
scluster_prob(&cs[i]);
}
#else
/* parallel version */
i = 0;
while (i < cl.smo_number) {
for (j = i; j < m_min(cl.smo_number,i+THREADS); j++) {
if (!smo_changed[j]) continue;
if ((perror = pthread_create(&tid[j],
&Attribute,
(void*(*)(void*))scluster_prob,
(void*)&cs[j]))){
str = mprintf(NULL,0,
"pthread_create returns false (step %d, smo[%d])\n",iter,j);
mes_prot(str); m_free(str); goto STOP;
}
}
for (j = i; j < m_min(cl.smo_number,i+THREADS); j++) {
if (smo_changed[j]) pthread_join(tid[j], NULL);
}
i = j;
}
#endif
if (iter == 1 && labels > 1) {
/* use sequence labels as start labels */
fprintf(outfile, "(sequences with start labels!)\n");
fprintf(stdout, "(sequences with start labels!)\n");
}
/*----------- classification, sequence labeling and error function -----------*/
if (iter == 1 && labels == 1) {
for (i = 0; i < cl.smo_number; i++) {
cl.seq_counter[i] = sqd->seq_number;
}
}
else {
for (j = 0; j < sqd->seq_number; j++) {
if (iter > 1 || labels == 0)
/* classification: set seq_label to ID of best_model */
sqd->seq_label[j] = scluster_best_model(&cl, j, all_log_p, &log_p);
if (sqd->seq_label[j] == -1 || sqd->seq_label[j] >= cl.smo_number) {
/* no model fits! What to do? hack: use arbitrary model ! */
str = mprintf(NULL, 0, "Warning: seq. %ld, ID %.0f: scluster_best_model returns %d\n",
j, sqd->seq_id[j], sqd->seq_label[j]);
mes_prot(str); m_free(str);
sqd->seq_label[j] = j % cl.smo_number;
/* goto STOP; */
}
cl.seq_counter[sqd->seq_label[j]]++;
/* add to error function value with seq. weights */
/* 1. Z_MD */
cl.smo_Z_MD[sqd->seq_label[j]] += sqd->seq_w[j] * all_log_p[sqd->seq_label[j]][j];
/* 2. Z_MAW */
if (CLASSIFY == 1) {
idummy = scluster_log_aposteriori(&cl, sqd, j, &log_apo);
if (idummy == -1) {
str = mprintf(NULL, 0 , "Warn: no model fits to Seq %10.0f, use PENALTY_LOGP\n", sqd->seq_id[j]);
mes_prot(str); m_free(str);
cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * PENALTY_LOGP;
continue;
}
if (idummy != sqd->seq_label[j]) {
printf("Warn: best_model = %ld; best_bayes(logapo) = %d\n",
sqd->seq_label[j], idummy);
}
cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * log_apo;
}
} /* for (j = 0; j < sqd->seq_number; j++) */
} /* else */
if (!(iter == 1 && labels == 1)) {
if (scluster_avoid_empty_smodel(sqd, &cl)== -1) {mes_proc(); goto STOP;}
for (i = 0; i < cl.smo_number; i++) smo_changed[i] = 0;
changes = scluster_update_label(oldlabel, sqd->seq_label, sqd->seq_number,
smo_changed);
}
fprintf(outfile, "%ld changes in iteration %d \n", changes, iter);
fprintf(stdout, "\n*** %ld changes in iteration %d ***\n", changes, iter);
/* NEW: If no changes anymore: increase max_iter
( Alternative: change eps ) */
if (changes == 0 && max_iter_bw < MAX_ITER_BW) {
max_iter_bw = MAX_ITER_BW;
changes = 1; /* so that reestimated and assigned again */
for (i = 0; i < cl.smo_number; i++) smo_changed[i] = 1;
fprintf(outfile, "max_iter_bw := %d\n", max_iter_bw);
fprintf(stdout, "*** max_iter_bw := %d ***\n", max_iter_bw);
}
/* -------Reestimate all models with assigned sequences---------------- */
if (changes > 0) {
if (!(iter == 1 && labels == 1))
if (scluster_update(&cl, sqd)) { mes_proc(); goto STOP; }
/* TEST: Write out k-Means-Start-Clusterung */
if (iter == 1 && labels == 2) {
for (i = 0; i < cl.smo_number; i++) {
if (cl.smo_seq[i] != NULL)
sequence_d_print(stdout, cl.smo_seq[i], 0);
}
}
/*if (labels == -1) {
fprintf(outfile, "(only determining the best model for each seq.!)\n");
fprintf(stdout, "(only determining the best model for each seq.!)\n");
break;
}
*/
fprintf(outfile, "\ntotal Prob. before %d.reestimate:\n", iter);
scluster_print_likelihood(outfile, &cl);
for (i = 0; i < cl.smo_number; i++) {
cs[i].smo = cl.smo[i];
if (!(iter == 1 && labels == 1))
cs[i].sqd = cl.smo_seq[i];
cs[i].logp = &cl.smo_Z_MD[i];
cs[i].eps = eps_bw;
cs[i].max_iter = max_iter_bw;
}
#if POUT
/* sequential version */
for (i = 0; i < cl.smo_number; i++) {
printf("SMO %d\n", i);
if (!smo_changed[i]) continue;
if (cs[i].sqd == NULL)
mes(MES_WIN, "cluster %d empty, no reestimate!\n", i);
else if (sreestimate_baum_welch(&cs[i]) == -1) {
str = mprintf(NULL,0,"%d.reestimate false, smo[%d]\n", iter, i);
mes_prot(str); m_free(str);
/* model_print(stdout, cl.mo[i]); */
goto STOP;
}
}
#else
/* parallel version */
i = 0;
while (i < cl.smo_number) {
for (j = i; j < m_min(cl.smo_number, i+THREADS); j++) {
if (!smo_changed[j]) continue;
if (cs[j].sqd == NULL)
mes(MES_WIN, "cluster %d empty, no reestimate!\n", j);
else if ((perror = pthread_create(&tid[j],
&Attribute,
(void*(*)(void*))sreestimate_baum_welch,
(void*)&cs[j]))) {
str = mprintf(NULL,0,
"pthread_create returns false (step %d, smo[%d])\n",
iter, j);
mes_prot(str); m_free(str);
goto STOP;
}
}
for (j = i; j < m_min(cl.smo_number, i+THREADS); j++) {
if (!smo_changed[j]) continue;
if (cs[j].sqd != NULL) {
pthread_join(tid[j], (void**) &return_value[j]);
if (return_value[j] == -1) {
str = mprintf(NULL,0,"%d.reestimate false, smo[%d]\n", iter, j);
mes_prot(str); m_free(str);
goto STOP;
}
}
}
i = j;
}
#endif
/* update model priors */
if (CLASSIFY == 1) {
if (sqd->total_w == 0) {
mes_prot("weights = 0!\n"); goto STOP;
}
for (i = 0; i < cl.smo_number; i++)
cl.smo[i]->prior = cl.smo_seq[i]->total_w / sqd->total_w;
}
fprintf(outfile, "\ntotal Prob. after %d.reestimate:\n", iter);
scluster_print_likelihood(outfile, &cl);
} /* if changes ... end reestimate */
} /* while */
/*------------------- OUTPUT -----------------------------------------------*/
if (scluster_out(&cl, sqd, outfile, argv) == -1)
{ mes_proc(); goto STOP; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -