scluster.c
来自「General Hidden Markov Model Library 一个通用」· C语言 代码 · 共 935 行 · 第 1/2 页
C
935 行
/********************************************************************************* This file is part of the General Hidden Markov Model Library,* GHMM version 0.8_beta1, see http://ghmm.org** Filename: ghmm/ghmm/scluster.c* Authors: Bernhard Knab, Benjamin Georgi** Copyright (C) 1998-2004 Alexander Schliep* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,* Berlin** Contact: schliep@ghmm.org** This library is free software; you can redistribute it and/or* modify it under the terms of the GNU Library General Public* License as published by the Free Software Foundation; either* version 2 of the License, or (at your option) any later version.** This library is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU* Library General Public License for more details.** You should have received a copy of the GNU Library General Public* License along with this library; if not, write to the Free* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*** This file is version $Revision: 1713 $* from $Date: 2006-10-16 16:06:28 +0200 (Mon, 16 Oct 2006) $* last change by $Author: grunau $.********************************************************************************/ #ifdef WIN32# include "win_config.h"#endif #ifdef HAVE_CONFIG_H# include "../config.h"#endif #ifdef GHMM_OBSOLETE #include <stdio.h>#undef HAVE_LIBPTHREAD#ifdef HAVE_LIBPTHREAD# include <pthread.h>#endif /* HAVE_LIBPTHREAD */#include <float.h>#include <math.h>#include <time.h>#include "ghmm.h"#include "mprintf.h"#include "mes.h"#include "scluster.h"#include "smodel.h"#include "sreestimate.h"#include "sfoba.h"#include "rng.h"#include "sequence.h"#include "matrix.h"#include "vector.h"#include "ghmm_internals.h"#include "obsolete.h"#include "smap_classify.h" #ifdef HAVE_LIBPTHREAD/* switch for parallel mode: (1) = sequential, (0) = parallel */ # define POUT 0/* number of parallel threads */ # define THREADS 4#else /* */# define POUT 1#endif /* HAVE_LIBPTHREAD */#define POUT 1 /*============================================================================*/ int ghmm_scluster_hmm (char *argv[]){ # define CUR_PROC "ghmm_scluster_hmm" char *seq_file = argv[1], *smo_file = argv[2], *out_filename = argv[3]; int labels = atoi (argv[4]); int res = -1, i, iter = 0, sqd_number, idummy; ghmm_cseq * sqd = NULL; ghmm_cseq ** sqd_vec = NULL; /* only temp. pointer */ long j, changes = 1; long *oldlabel, *smo_changed; double log_p, log_apo; double **all_log_p = NULL; /* for storing all log_p */ FILE * outfile = NULL; char *str; char filename[1024]; scluster_t cl; double eps_bw, q; int max_iter_bw; /* ghmm_cmodel_baum_welch needs this structure (introduced for parallel mode) */ ghmm_cmodel_baum_welch_context * cs; #if POUT == 0 int *return_value; pthread_t * tid; int perror; pthread_attr_t Attribute; #endif /* POUT */ cl.smo = NULL; cl.smo_seq = NULL; cl.seq_counter = NULL; cl.smo_Z_MD = NULL; cl.smo_Z_MAW = NULL; /* -----------------initialization ------------------------- */ fprintf (stdout, "Clustering seqs. \"%s\"\nwith ", seq_file); if (CLASSIFY == 0) fprintf (stdout, "MD-Classification\n"); else fprintf (stdout, "MAW-Classification\n"); fflush (stdout); /*-----------open output file----------------------------------*/ sprintf (filename, "%s%s", out_filename, ".cl"); if (!(outfile = ighmm_mes_fopen (filename, "wt"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } /*--------------Memory allocation and data reading----------------------------*/ /* 1 sequence array and initial models */ ghmm_scluster_print_header (outfile, argv); /*--- memory alloc and read data ----------------------------*/ sqd_vec = ghmm_cseq_read (seq_file, &sqd_number); if (!sqd_vec) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } sqd = sqd_vec[0]; cl.smo = ghmm_cmodel_read (smo_file, &cl.smo_number); if (!cl.smo) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } /* if labels == 0 || labels == 1: no startlabels needed. */ /* if `labels` = 3: random start labels */ if (labels == 3) { fprintf (outfile, "(random start partition)n"); fprintf (stdout, "(Random start partition...)\n"); ghmm_scluster_random_labels (sqd, cl.smo_number); } ARRAY_CALLOC (oldlabel, sqd->seq_number); for (i = 0; i < sqd->seq_number; i++) oldlabel[i] = (-1); ARRAY_CALLOC (cl.smo_seq, cl.smo_number); ARRAY_CALLOC (cl.seq_counter, cl.smo_number); ARRAY_CALLOC (cl.smo_Z_MD, cl.smo_number); ARRAY_CALLOC (cl.smo_Z_MAW, cl.smo_number); all_log_p = ighmm_cmatrix_alloc (cl.smo_number, (int) sqd->seq_number); if (!all_log_p) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } /*if (ghmm_cmodel_check_compatibility(cl.smo, cl.smo_number)) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } */ ARRAY_CALLOC (smo_changed, cl.smo_number); for (i = 0; i < cl.smo_number; i++) { cl.smo_seq[i] = NULL; smo_changed[i] = 1; } /* uninformative model prior */ if (CLASSIFY == 1) for (i = 0; i < cl.smo_number; i++) cl.smo[i]->prior = 1 / (double) cl.smo_number; /*--------for parallel mode --------------------*/ #if POUT == 0 /* id for threads */ ARRAY_CALLOC (tid, cl.smo_number); #endif /* POUT */ /* data structure for threads */ ARRAY_CALLOC (cs, cl.smo_number); for (i = 0; i < cl.smo_number; i++) cs[i].smo = cl.smo[i]; /* return values for each thread */ #if POUT == 0 ARRAY_CALLOC (return_value, cl.smo_number); #endif /* POUT */#ifdef HAVE_LIBPTHREAD pthread_attr_init (&Attribute); pthread_attr_setscope (&Attribute, PTHREAD_SCOPE_SYSTEM); #endif /* HAVE_LIBPTHREAD */ /* eps_bw: stopping criterion in baum-welch max_iter_bw: max. number of baum-welch iterations */ eps_bw = 0.0; /* not used */ q = 0.1; /* ??? */ max_iter_bw = 20; /*MAX_ITER_BW; */ /*------------------------main loop-------------------------------------------*/ /* do it until no sequence changes model; attention: error function values are not used directly as a stopping criterion */ while (changes > 0 || eps_bw > GHMM_EPS_ITER_BW || max_iter_bw < GHMM_MAX_ITER_BW) { iter++; /* reset error functions and counters */ for (i = 0; i < cl.smo_number; i++) { cl.seq_counter[i] = 0; cl.smo_Z_MD[i] = 0.0; cl.smo_Z_MAW[i] = 0.0; } /* set pointer of cs to sqd-field and matrix of all_logp values */ for (i = 0; i < cl.smo_number; i++) { cs[i].sqd = sqd; /* all seqs. ! */ cs[i].logp = all_log_p[i]; } /* ------------calculate logp for all seqs. and all models -------------- */ #if POUT /* sequential version */ for (i = 0; i < cl.smo_number; i++) { if (!smo_changed[i]) continue; ghmm_scluster_prob (&cs[i]); } #else /* */ /* parallel version */ i = 0; while (i < cl.smo_number) { for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) { if (!smo_changed[j]) continue; if ((perror = pthread_create (&tid[j], &Attribute, (void *(*)(void *)) scluster_prob, (void *) &cs[j]))) { str = ighmm_mprintf (NULL, 0, "pthread_create returns false (step %d, smo[%d])\n", iter, j); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } } for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) { if (smo_changed[j]) pthread_join (tid[j], NULL); } i = j; } #endif /* */ if (iter == 1 && labels > 1) { /* use sequence labels as start labels */ fprintf (outfile, "(sequences with start labels!)\n"); fprintf (stdout, "(sequences with start labels!)\n"); } /*----------- classification, sequence labeling and error function -----------*/ if (iter == 1 && labels == 1) { for (i = 0; i < cl.smo_number; i++) { cl.seq_counter[i] = sqd->seq_number; } } else { for (j = 0; j < sqd->seq_number; j++) { if (iter > 1 || labels == 0) /* classification: set seq_label to ID of best_model */ sqd->seq_label[j] = ghmm_scluster_best_model (&cl, j, all_log_p, &log_p); if (sqd->seq_label[j] == -1 || sqd->seq_label[j] >= cl.smo_number) { /* no model fits! What to do? hack: use arbitrary model ! */ str = ighmm_mprintf (NULL, 0, "Warning: seq. %ld, ID %.0f: ghmm_scluster_best_model returns %d\n", j, sqd->seq_id[j], sqd->seq_label[j]); GHMM_LOG(LCONVERTED, str); m_free (str); sqd->seq_label[j] = j % cl.smo_number; /* goto STOP; */ } cl.seq_counter[sqd->seq_label[j]]++; /* add to error function value with seq. weights */ /* 1. Z_MD */ cl.smo_Z_MD[sqd->seq_label[j]] += sqd->seq_w[j] * all_log_p[sqd->seq_label[j]][j]; /* 2. Z_MAW */ if (CLASSIFY == 1) { idummy = ghmm_scluster_log_aposteriori (&cl, sqd, j, &log_apo); if (idummy == -1) { str = ighmm_mprintf (NULL, 0, "Warn: no model fits to Seq %10.0f, use PENALTY_LOGP\n", sqd->seq_id[j]); GHMM_LOG(LCONVERTED, str); m_free (str); cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * GHMM_PENALTY_LOGP; continue; } if (idummy != sqd->seq_label[j]) { printf ("Warn: best_model = %ld; best_bayes(logapo) = %d\n", sqd->seq_label[j], idummy); } cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * log_apo; } } /* for (j = 0; j < sqd->seq_number; j++) */ } /* else */ if (!(iter == 1 && labels == 1)) { if (ghmm_scluster_avoid_empty_smodel (sqd, &cl) == -1) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } for (i = 0; i < cl.smo_number; i++) smo_changed[i] = 0; changes = ghmm_scluster_update_label (oldlabel, sqd->seq_label, sqd->seq_number, smo_changed); } fprintf (outfile, "%ld changes in iteration %d \n", changes, iter); fprintf (stdout, "\n*** %ld changes in iteration %d ***\n", changes, iter); /* NEW: If no changes anymore: increase max_iter ( Alternative: change eps ) */ if (changes == 0 && max_iter_bw < GHMM_MAX_ITER_BW) { max_iter_bw = GHMM_MAX_ITER_BW; changes = 1; /* so that reestimated and assigned again */ for (i = 0; i < cl.smo_number; i++) smo_changed[i] = 1; fprintf (outfile, "max_iter_bw := %d\n", max_iter_bw); fprintf (stdout, "*** max_iter_bw := %d ***\n", max_iter_bw); } /* -------Reestimate all models with assigned sequences---------------- */ if (changes > 0) { if (!(iter == 1 && labels == 1)) if (ghmm_scluster_update (&cl, sqd)) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } /* TEST: Write out k-Means-Start-Clusterung */ if (iter == 1 && labels == 2) { for (i = 0; i < cl.smo_number; i++) { if (cl.smo_seq[i] != NULL) ghmm_cseq_print (stdout, cl.smo_seq[i], 0); } } /*if (labels == -1) { fprintf(outfile, "(only determining the best model for each seq.!)\n"); fprintf(stdout, "(only determining the best model for each seq.!)\n"); break; } */ fprintf (outfile, "\ntotal Prob. before %d.reestimate:\n", iter); ghmm_scluster_print_likelihood (outfile, &cl); for (i = 0; i < cl.smo_number; i++) { cs[i].smo = cl.smo[i]; if (!(iter == 1 && labels == 1)) cs[i].sqd = cl.smo_seq[i]; cs[i].logp = &cl.smo_Z_MD[i]; cs[i].eps = eps_bw; cs[i].max_iter = max_iter_bw; } #if POUT /* sequential version */ for (i = 0; i < cl.smo_number; i++) { printf ("SMO %d\n", i); if (!smo_changed[i]) continue; if (cs[i].sqd == NULL) ighmm_mes (MES_WIN, "cluster %d empty, no reestimate!\n", i); else if (ghmm_cmodel_baum_welch (&cs[i]) == -1) { str = ighmm_mprintf (NULL, 0, "%d.reestimate false, smo[%d]\n", iter, i); GHMM_LOG(LCONVERTED, str); m_free (str); /* ghmm_dmodel_print(stdout, cl.mo[i]); */ goto STOP; } } #else /* */ /* parallel version */ i = 0; while (i < cl.smo_number) { for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) { if (!smo_changed[j]) continue; if (cs[j].sqd == NULL) ighmm_mes (MES_WIN, "cluster %d empty, no reestimate!\n", j); else if ((perror = pthread_create (&tid[j], &Attribute, (void *(*)(void *)) sreestimate_baum_welch, (void *) &cs[j]))) { str = ighmm_mprintf (NULL, 0, "pthread_create returns false (step %d, smo[%d])\n", iter, j); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } } for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) { if (!smo_changed[j]) continue; if (cs[j].sqd != NULL) { pthread_join (tid[j], (void **) &return_value[j]); if (return_value[j] == -1) { str = ighmm_mprintf (NULL, 0, "%d.reestimate false, smo[%d]\n", iter, j); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } } } i = j; } #endif /* */ /* update model priors */ if (CLASSIFY == 1) { if (sqd->total_w == 0) { GHMM_LOG(LCONVERTED, "weights = 0!\n"); goto STOP; } for (i = 0; i < cl.smo_number; i++) cl.smo[i]->prior = cl.smo_seq[i]->total_w / sqd->total_w; } fprintf (outfile, "\ntotal Prob. after %d.reestimate:\n", iter); ghmm_scluster_print_likelihood (outfile, &cl); } /* if changes ... end reestimate */ } /* while */ /*------------------- OUTPUT -----------------------------------------------*/ if (ghmm_scluster_out (&cl, sqd, outfile, argv) == -1) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?