⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cluster.c

📁 General Hidden Markov Model Library 一个通用的隐马尔科夫模型的C代码库
💻 C
字号:
/*********************************************************************************       This file is part of the General Hidden Markov Model Library,*       GHMM version 0.8_beta1, see http://ghmm.org**       Filename: ghmm/ghmm/cluster.c*       Authors:  Bernd Wichern**       Copyright (C) 1998-2004 Alexander Schliep*       Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln*       Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,*                               Berlin**       Contact: schliep@ghmm.org**       This library is free software; you can redistribute it and/or*       modify it under the terms of the GNU Library General Public*       License as published by the Free Software Foundation; either*       version 2 of the License, or (at your option) any later version.**       This library is distributed in the hope that it will be useful,*       but WITHOUT ANY WARRANTY; without even the implied warranty of*       MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU*       Library General Public License for more details.**       You should have received a copy of the GNU Library General Public*       License along with this library; if not, write to the Free*       Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA***       This file is version $Revision: 1713 $*                       from $Date: 2006-10-16 16:06:28 +0200 (Mon, 16 Oct 2006) $*             last change by $Author: grunau $.********************************************************************************/#ifdef WIN32#  include "win_config.h"#endif#ifdef HAVE_CONFIG_H#  include "../config.h"#endif#ifdef GHMM_OBSOLETE    #include <stdio.h>#include "ghmm.h"#include "mprintf.h"#include "mes.h"#include "cluster.h"#include "rng.h"#include "model.h"#include "sequence.h"#include "reestimate.h"#include "foba.h"#include "matrix.h"#include "ghmm_internals.h"#include "obsolete.h"/*============================================================================*/int ghmm_cluster_hmm (char *seq_file, char *mo_file, char *out_filename){# define CUR_PROC "ghmm_cluster_hmm"  int res = -1, i, iter = 0, sq_number;  ghmm_dseq *sq = NULL, **sq_vec = NULL;  long j, changes = 1;  long *oldlabel;  double log_p;  char * str;  FILE *outfile = NULL;  cluster_t cl;  cl.mo = NULL;  cl.mo_seq = NULL;  if (!(outfile = ighmm_mes_fopen (out_filename, "wt"))) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }/*----------------------------------------------------------------------------*/  /* Allocate memory and read in data: Sequences and initial model */  sq_vec = ghmm_dseq_read (seq_file, &sq_number);  if (!sq_vec[0]) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  if (sq_number > 1)    GHMM_LOG(LCONVERTED, "Warning: seq. file contains multiple seq. arrays. \                      Only first array is used for clustering\n");  sq = sq_vec[0];  fprintf (outfile, "Cluster Sequences\n");  ghmm_dseq_print (outfile, sq);  cl.mo = ghmm_dmodel_read (mo_file, &cl.mo_number);  if (!cl.mo) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  ARRAY_CALLOC (oldlabel, sq->seq_number);  for (i = 0; i < sq->seq_number; i++)    oldlabel[i] = (-1);  ARRAY_CALLOC (cl.mo_seq, cl.mo_number);  for (i = 0; i < cl.mo_number; i++)    cl.mo_seq[i] = NULL;  if (ghmm_dmodel_check_compatibility (cl.mo, cl.mo_number)) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }/*----------------------------------------------------------------------------*/  fprintf (outfile, "\nInitial Models:\n");  for (i = 0; i < cl.mo_number; i++)    ghmm_dmodel_print (outfile, cl.mo[i]);/*----------------------------------------------------------------------------*/  while (changes > 0) {    iter++;    /* Associate the sequences */    fprintf (outfile, "\nSequence, Best Model, logP of generating Seq.:\n");    for (j = 0; j < sq->seq_number; j++) {      sq->seq_label[j] =        ghmm_dseq_best_model (cl.mo, cl.mo_number, sq->seq[j], sq->seq_len[j],                             &log_p);      fprintf (outfile, "seq %ld, mo %ld, log p %.4f\n", j,               sq->seq_label[j], log_p);      if (sq->seq_label[j] == -1 || sq->seq_label[j] >= cl.mo_number) {        /* No model fits! */        str = ighmm_mprintf (NULL, 0, "Seq. %ld: ghmm_dseq_best_model gives %d\n",                   j, sq->seq_label[j]);        GHMM_LOG(LCONVERTED, str);        m_free (str);        goto STOP;      }    }    if (ghmm_cluster_avoid_empty_model        (sq->seq_label, sq->seq_number, cl.mo_number)) {      GHMM_LOG_QUEUED(LCONVERTED);      goto STOP;    }    changes = ghmm_cluster_update_label (oldlabel, sq->seq_label, sq->seq_number);    fprintf (outfile, "%ld changes\n", changes);    fprintf (stdout, "\n*** %ld changes in iteration %d ***\n\n", changes,             iter);    /* Reestimate models with the associated sequences */    if (changes > 0) {      if (ghmm_cluster_update (&cl, sq)) {        GHMM_LOG_QUEUED(LCONVERTED);        goto STOP;      }      fprintf (outfile, "\nGes. WS VOR %d.Reestimate:\n", iter);      ghmm_cluster_print_likelihood (outfile, &cl);      for (i = 0; i < cl.mo_number; i++) {        if (ghmm_dmodel_baum_welch (cl.mo[i], cl.mo_seq[i])) {          str = ighmm_mprintf (NULL, 0, "%d.reestimate false, mo[%d]\n", iter, i);          GHMM_LOG(LCONVERTED, str);          m_free (str);          /* ghmm_dmodel_print(stdout, cl.mo[i]); */          goto STOP;        }      }      fprintf (outfile, "\nGes. WS NACH %d.Reestimate:\n", iter);      ghmm_cluster_print_likelihood (outfile, &cl);    }                           /* if changes */  }                             /* while *//*----------------------------------------------------------------------------*/  if (!ghmm_cluster_out (&cl, sq, outfile, out_filename)) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  res = 0;STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  /* ...noch div. free! */  if (outfile)    fclose (outfile);  return (res);# undef CUR_PROC}                               /* ghmm_cluster_hmm *//*============================================================================*/int ghmm_cluster_update (cluster_t * cl, ghmm_dseq * sq){#define CUR_PROC "ghmm_cluster_update"  int i, res = -1;  long *seq_counter;  ghmm_dseq *seq_t;  ARRAY_CALLOC (seq_counter, cl->mo_number);  /* Fix the number of associated sequences */  for (i = 0; i < sq->seq_number; i++)    seq_counter[sq->seq_label[i]]++;  /* allocate memory block for block */  for (i = 0; i < cl->mo_number; i++) {    if (cl->mo_seq[i]) {      /* Important: No ghmm_dseq_free here, otherwise are the original          sequences gone! */      ghmm_dseq_clean (cl->mo_seq[i]);      m_free (cl->mo_seq[i]);    }    cl->mo_seq[i] = ghmm_dseq_calloc (seq_counter[i]);    cl->mo_seq[i]->seq_number = 0;      /* Counted later */  }  /* Set entries */  for (i = 0; i < sq->seq_number; i++) {    seq_t = cl->mo_seq[sq->seq_label[i]];    seq_t->seq_len[seq_t->seq_number] = sq->seq_len[i];    seq_t->seq[seq_t->seq_number] = sq->seq[i]; /* Pointer!!! */    seq_t->seq_label[seq_t->seq_number] = sq->seq_label[i];    seq_t->seq_number++;  }  res = 0;STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  m_free (seq_counter);  return (res);# undef CUR_PROC}                               /* ghmm_cluster_update *//*============================================================================*/void ghmm_cluster_print_likelihood (FILE * outfile, cluster_t * cl){  double ges_prob = 0.0, mo_prob;  int i;  for (i = 0; i < cl->mo_number; i++) {    mo_prob = ghmm_dmodel_likelihood (cl->mo[i], cl->mo_seq[i]);    ges_prob += mo_prob;    fprintf (outfile, "mo %d (#Seq. %ld): %.4f\n", i,             cl->mo_seq[i]->seq_number, mo_prob);  }  fprintf (outfile, "Summe: %.4f\n\n", ges_prob);}                               /* ghmm_cluster_print_likelihood *//*============================================================================*//* Prevents that empty models are sent out (no associated seqences) by    associating a random sequence. Since it's possible to produce an empty model   in this way, we swap the sequences until a nonempty model is produced. (This    could be a never-ending process and therefore it's only done 100 times). */int ghmm_cluster_avoid_empty_model (long *seq_label, long seq_number,                               int mo_number){#define CUR_PROC "ghmm_cluster_avoid_empty_model"  int i;  long j = 0;  long *counter;  char error = 1, change = 0;  int iter = 0;  /* Initialization */  ARRAY_CALLOC (counter, mo_number);  for (i = 0; i < mo_number; i++)    counter[i] = 0;  for (i = 0; i < seq_number; i++)    counter[seq_label[i]]++;  while (error && iter < 100) {    iter++;    error = 0;    /* Do we have empty models ? */    for (i = 0; i < mo_number; i++) {      if (counter[i] == 0) {        change = 1;        /* Choose a random sequence for an empty model */        j = m_int (GHMM_RNG_UNIFORM (RNG) * (seq_number - 1));        /* The orginal model loses one sequence */        printf          ("!!\"avoid_empty_model\":Verschiebe Seq. %ld: %ld --> %d !!\n", j,           seq_label[j], i);        counter[seq_label[j]]--;        counter[i] = 1;        seq_label[j] = i;      }    }    /* Is now everything OK ? */    if (change) {      for (i = 0; i < mo_number; i++) {        if (counter[i] <= 0) {          error = 1;          break;        }      }    }  }                             /* while */STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  if (error)    return (-1);  else    return 0;#undef CUR_PROC}                               /* ghmm_cluster_avoid_empty_model *//*============================================================================*/int ghmm_cluster_out (cluster_t * cl, ghmm_dseq * sq, FILE * outfile,                 char *out_filename){#define CUR_PROC "ghmm_cluster_out"  int res = -1;  int i;  ghmm_cseq *sqd = NULL;  fprintf (outfile, "\nFinal Models:\n");  for (i = 0; i < cl->mo_number; i++)    ghmm_dmodel_print (outfile, cl->mo[i]);  res = 0;  ghmm_cseq_free (&sqd);  return (res);#undef CUR_PROC}                               /* ghmm_cluster_out *//*============================================================================*/long ghmm_cluster_update_label (long *oldlabel, long *seq_label, long seq_number){  long i, changes = 0;  for (i = 0; i < seq_number; i++)    if (oldlabel[i] != seq_label[i]) {      changes++;      oldlabel[i] = seq_label[i];    }  return changes;}                               /* ghmm_cluster_update_label */#endif /* GHMM_OBSOLETE */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -