scluster.c

来自「General Hidden Markov Model Library 一个通用」· C语言 代码 · 共 935 行 · 第 1/2 页

C
935
字号
/*********************************************************************************       This file is part of the General Hidden Markov Model Library,*       GHMM version 0.8_beta1, see http://ghmm.org**       Filename: ghmm/ghmm/scluster.c*       Authors:  Bernhard Knab, Benjamin Georgi**       Copyright (C) 1998-2004 Alexander Schliep*       Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln*       Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,*                               Berlin**       Contact: schliep@ghmm.org**       This library is free software; you can redistribute it and/or*       modify it under the terms of the GNU Library General Public*       License as published by the Free Software Foundation; either*       version 2 of the License, or (at your option) any later version.**       This library is distributed in the hope that it will be useful,*       but WITHOUT ANY WARRANTY; without even the implied warranty of*       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU*       Library General Public License for more details.**       You should have received a copy of the GNU Library General Public*       License along with this library; if not, write to the Free*       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA***       This file is version $Revision: 1713 $*                       from $Date: 2006-10-16 16:06:28 +0200 (Mon, 16 Oct 2006) $*             last change by $Author: grunau $.********************************************************************************/ #ifdef WIN32#  include "win_config.h"#endif  #ifdef HAVE_CONFIG_H#  include "../config.h"#endif #ifdef GHMM_OBSOLETE  #include <stdio.h>#undef HAVE_LIBPTHREAD#ifdef HAVE_LIBPTHREAD# include <pthread.h>#endif  /* HAVE_LIBPTHREAD */#include <float.h>#include <math.h>#include <time.h>#include "ghmm.h"#include "mprintf.h"#include "mes.h"#include "scluster.h"#include "smodel.h"#include "sreestimate.h"#include "sfoba.h"#include "rng.h"#include "sequence.h"#include "matrix.h"#include "vector.h"#include "ghmm_internals.h"#include "obsolete.h"#include "smap_classify.h"  #ifdef HAVE_LIBPTHREAD/* switch for parallel mode: (1) = sequential, (0) = parallel */ # define POUT 0/* number of parallel threads */ # define THREADS 4#else   /*  */# define POUT 1#endif  /* HAVE_LIBPTHREAD */#define POUT 1  /*============================================================================*/ int ghmm_scluster_hmm (char *argv[]){  # define CUR_PROC "ghmm_scluster_hmm"  char *seq_file = argv[1], *smo_file = argv[2], *out_filename = argv[3];  int labels = atoi (argv[4]);  int res = -1, i, iter = 0, sqd_number, idummy;  ghmm_cseq * sqd = NULL;  ghmm_cseq ** sqd_vec = NULL;      /* only temp. pointer */  long j, changes = 1;  long *oldlabel, *smo_changed;  double log_p, log_apo;  double **all_log_p = NULL;   /* for storing all log_p */  FILE * outfile = NULL;  char *str;  char filename[1024];  scluster_t cl;  double eps_bw, q;  int max_iter_bw;      /* ghmm_cmodel_baum_welch needs this structure (introduced for parallel mode) */     ghmm_cmodel_baum_welch_context * cs;  #if POUT == 0  int *return_value;  pthread_t * tid;  int perror;  pthread_attr_t Attribute;  #endif  /* POUT */    cl.smo = NULL;  cl.smo_seq = NULL;  cl.seq_counter = NULL;  cl.smo_Z_MD = NULL;  cl.smo_Z_MAW = NULL;      /* -----------------initialization ------------------------- */     fprintf (stdout, "Clustering seqs. \"%s\"\nwith ", seq_file);  if (CLASSIFY == 0)    fprintf (stdout, "MD-Classification\n");    else    fprintf (stdout, "MAW-Classification\n");  fflush (stdout);    /*-----------open output file----------------------------------*/     sprintf (filename, "%s%s", out_filename, ".cl");  if (!(outfile = ighmm_mes_fopen (filename, "wt"))) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  /*--------------Memory allocation and data reading----------------------------*/         /* 1 sequence array and initial models */     ghmm_scluster_print_header (outfile, argv);    /*--- memory alloc and read data ----------------------------*/     sqd_vec = ghmm_cseq_read (seq_file, &sqd_number);  if (!sqd_vec) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  sqd = sqd_vec[0];  cl.smo = ghmm_cmodel_read (smo_file, &cl.smo_number);  if (!cl.smo) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }      /* if labels == 0 || labels == 1: no startlabels needed. */     /* if `labels` =  3: random start labels */     if (labels == 3) {    fprintf (outfile, "(random start partition)n");    fprintf (stdout, "(Random start partition...)\n");    ghmm_scluster_random_labels (sqd, cl.smo_number);  }  ARRAY_CALLOC (oldlabel, sqd->seq_number);  for (i = 0; i < sqd->seq_number; i++)    oldlabel[i] = (-1);  ARRAY_CALLOC (cl.smo_seq, cl.smo_number);  ARRAY_CALLOC (cl.seq_counter, cl.smo_number);  ARRAY_CALLOC (cl.smo_Z_MD, cl.smo_number);  ARRAY_CALLOC (cl.smo_Z_MAW, cl.smo_number);  all_log_p = ighmm_cmatrix_alloc (cl.smo_number, (int) sqd->seq_number);  if (!all_log_p) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }      /*if (ghmm_cmodel_check_compatibility(cl.smo, cl.smo_number)) {        GHMM_LOG_QUEUED(LCONVERTED); goto STOP;        } */     ARRAY_CALLOC (smo_changed, cl.smo_number);  for (i = 0; i < cl.smo_number; i++) {    cl.smo_seq[i] = NULL;    smo_changed[i] = 1;  }      /* uninformative model prior */     if (CLASSIFY == 1)    for (i = 0; i < cl.smo_number; i++)      cl.smo[i]->prior = 1 / (double) cl.smo_number;    /*--------for parallel mode  --------------------*/ #if POUT == 0    /* id for threads */     ARRAY_CALLOC (tid, cl.smo_number);  #endif  /* POUT */    /* data structure for  threads */     ARRAY_CALLOC (cs, cl.smo_number);  for (i = 0; i < cl.smo_number; i++)    cs[i].smo = cl.smo[i];      /* return values for each thread */ #if POUT == 0    ARRAY_CALLOC (return_value, cl.smo_number);  #endif  /* POUT */#ifdef HAVE_LIBPTHREAD    pthread_attr_init (&Attribute);  pthread_attr_setscope (&Attribute, PTHREAD_SCOPE_SYSTEM);  #endif  /* HAVE_LIBPTHREAD */        /* eps_bw: stopping criterion in baum-welch       max_iter_bw: max. number of baum-welch iterations */     eps_bw = 0.0;               /* not used */  q = 0.1;                     /* ??? */  max_iter_bw = 20;            /*MAX_ITER_BW; */  /*------------------------main loop-------------------------------------------*/     /* do it until no sequence changes model;        attention: error function values are not used directly as a stopping criterion */     while (changes > 0 || eps_bw > GHMM_EPS_ITER_BW || max_iter_bw < GHMM_MAX_ITER_BW) {    iter++;          /* reset error functions and counters */       for (i = 0; i < cl.smo_number; i++) {      cl.seq_counter[i] = 0;      cl.smo_Z_MD[i] = 0.0;      cl.smo_Z_MAW[i] = 0.0;    }          /* set pointer of cs to sqd-field and matrix of all_logp values */       for (i = 0; i < cl.smo_number; i++) {      cs[i].sqd = sqd;         /* all seqs. ! */      cs[i].logp = all_log_p[i];    }          /* ------------calculate logp for all seqs. and all models -------------- */ #if POUT      /* sequential version */       for (i = 0; i < cl.smo_number; i++) {      if (!smo_changed[i])        continue;      ghmm_scluster_prob (&cs[i]);    }    #else   /*  */      /* parallel version */       i = 0;    while (i < cl.smo_number) {      for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) {        if (!smo_changed[j])          continue;        if ((perror = pthread_create (&tid[j], &Attribute,                                        (void *(*)(void *)) scluster_prob,                                        (void *) &cs[j]))) {          str =            ighmm_mprintf (NULL, 0,                     "pthread_create returns false (step %d, smo[%d])\n",                     iter, j);          GHMM_LOG(LCONVERTED, str);          m_free (str);          goto STOP;        }      }      for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) {        if (smo_changed[j])          pthread_join (tid[j], NULL);      }      i = j;    }    #endif  /*  */      if (iter == 1 && labels > 1) {              /* use sequence labels as start labels */         fprintf (outfile, "(sequences with start labels!)\n");      fprintf (stdout, "(sequences with start labels!)\n");    }        /*----------- classification,  sequence labeling and error function -----------*/       if (iter == 1 && labels == 1) {      for (i = 0; i < cl.smo_number; i++) {        cl.seq_counter[i] = sqd->seq_number;      }    }        else {      for (j = 0; j < sqd->seq_number; j++) {        if (iter > 1 || labels == 0)                      /* classification: set seq_label to ID of best_model  */             sqd->seq_label[j] =            ghmm_scluster_best_model (&cl, j, all_log_p, &log_p);        if (sqd->seq_label[j] == -1 || sqd->seq_label[j] >= cl.smo_number) {                      /* no model fits! What to do?  hack: use arbitrary model ! */             str = ighmm_mprintf (NULL, 0,                    "Warning: seq. %ld, ID %.0f: ghmm_scluster_best_model returns %d\n",                    j, sqd->seq_id[j], sqd->seq_label[j]);          GHMM_LOG(LCONVERTED, str);          m_free (str);          sqd->seq_label[j] = j % cl.smo_number;                      /* goto STOP; */         }        cl.seq_counter[sqd->seq_label[j]]++;                  /* add to error function value with seq. weights  */           /* 1. Z_MD */           cl.smo_Z_MD[sqd->seq_label[j]] +=          sqd->seq_w[j] * all_log_p[sqd->seq_label[j]][j];                  /* 2. Z_MAW */           if (CLASSIFY == 1) {          idummy = ghmm_scluster_log_aposteriori (&cl, sqd, j, &log_apo);          if (idummy == -1) {            str = ighmm_mprintf (NULL, 0,                    "Warn: no model fits to Seq %10.0f, use PENALTY_LOGP\n",                    sqd->seq_id[j]);            GHMM_LOG(LCONVERTED, str);            m_free (str);            cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * GHMM_PENALTY_LOGP;            continue;          }          if (idummy != sqd->seq_label[j]) {            printf ("Warn: best_model = %ld; best_bayes(logapo) = %d\n",                     sqd->seq_label[j], idummy);          }          cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * log_apo;        }      }                        /* for (j = 0; j < sqd->seq_number; j++) */    }                          /* else */    if (!(iter == 1 && labels == 1)) {      if (ghmm_scluster_avoid_empty_smodel (sqd, &cl) == -1) {        GHMM_LOG_QUEUED(LCONVERTED);        goto STOP;      }      for (i = 0; i < cl.smo_number; i++)        smo_changed[i] = 0;      changes =        ghmm_scluster_update_label (oldlabel, sqd->seq_label, sqd->seq_number,                               smo_changed);    }    fprintf (outfile, "%ld changes in iteration %d \n", changes, iter);    fprintf (stdout, "\n*** %ld changes in iteration %d ***\n", changes,              iter);          /* NEW: If no changes anymore: increase max_iter         ( Alternative: change eps ) */       if (changes == 0 && max_iter_bw < GHMM_MAX_ITER_BW) {      max_iter_bw = GHMM_MAX_ITER_BW;      changes = 1;             /* so that reestimated and assigned again  */      for (i = 0; i < cl.smo_number; i++)        smo_changed[i] = 1;      fprintf (outfile, "max_iter_bw := %d\n", max_iter_bw);      fprintf (stdout, "*** max_iter_bw := %d ***\n", max_iter_bw);    }          /* -------Reestimate all models with assigned sequences---------------- */       if (changes > 0) {      if (!(iter == 1 && labels == 1))        if (ghmm_scluster_update (&cl, sqd)) {          GHMM_LOG_QUEUED(LCONVERTED);          goto STOP;        }              /* TEST: Write out k-Means-Start-Clusterung */         if (iter == 1 && labels == 2) {        for (i = 0; i < cl.smo_number; i++) {          if (cl.smo_seq[i] != NULL)            ghmm_cseq_print (stdout, cl.smo_seq[i], 0);        }      }              /*if (labels == -1) {           fprintf(outfile, "(only determining the best model for each seq.!)\n");           fprintf(stdout, "(only determining the best model for each seq.!)\n");           break;           }         */         fprintf (outfile, "\ntotal Prob. before %d.reestimate:\n", iter);      ghmm_scluster_print_likelihood (outfile, &cl);      for (i = 0; i < cl.smo_number; i++) {        cs[i].smo = cl.smo[i];        if (!(iter == 1 && labels == 1))          cs[i].sqd = cl.smo_seq[i];        cs[i].logp = &cl.smo_Z_MD[i];        cs[i].eps = eps_bw;        cs[i].max_iter = max_iter_bw;      }      #if POUT        /* sequential version */         for (i = 0; i < cl.smo_number; i++) {        printf ("SMO %d\n", i);        if (!smo_changed[i])          continue;        if (cs[i].sqd == NULL)          ighmm_mes (MES_WIN, "cluster %d empty, no reestimate!\n", i);                else if (ghmm_cmodel_baum_welch (&cs[i]) == -1) {          str = ighmm_mprintf (NULL, 0, "%d.reestimate false, smo[%d]\n", iter, i);          GHMM_LOG(LCONVERTED, str);          m_free (str);                      /* ghmm_dmodel_print(stdout, cl.mo[i]); */             goto STOP;        }      }      #else   /*  */        /* parallel version */         i = 0;      while (i < cl.smo_number) {        for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) {          if (!smo_changed[j])            continue;          if (cs[j].sqd == NULL)            ighmm_mes (MES_WIN, "cluster %d empty, no reestimate!\n", j);                    else if ((perror = pthread_create (&tid[j], &Attribute,                                              (void *(*)(void *))                                             sreestimate_baum_welch,                                             (void *) &cs[j]))) {            str =              ighmm_mprintf (NULL, 0,                       "pthread_create returns false (step %d, smo[%d])\n",                       iter, j);            GHMM_LOG(LCONVERTED, str);            m_free (str);            goto STOP;          }        }        for (j = i; j < m_min (cl.smo_number, i + THREADS); j++) {          if (!smo_changed[j])            continue;          if (cs[j].sqd != NULL) {            pthread_join (tid[j], (void **) &return_value[j]);            if (return_value[j] == -1) {              str =                ighmm_mprintf (NULL, 0, "%d.reestimate false, smo[%d]\n", iter, j);              GHMM_LOG(LCONVERTED, str);              m_free (str);              goto STOP;            }          }        }        i = j;      }      #endif  /*  */                /* update model priors */         if (CLASSIFY == 1) {        if (sqd->total_w == 0) {          GHMM_LOG(LCONVERTED, "weights = 0!\n");          goto STOP;        }        for (i = 0; i < cl.smo_number; i++)          cl.smo[i]->prior = cl.smo_seq[i]->total_w / sqd->total_w;      }      fprintf (outfile, "\ntotal Prob. after %d.reestimate:\n", iter);      ghmm_scluster_print_likelihood (outfile, &cl);    }                          /* if changes ... end reestimate */  }                            /* while */  /*------------------- OUTPUT   -----------------------------------------------*/     if (ghmm_scluster_out (&cl, sqd, outfile, argv) == -1)     {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?