⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cluster.c

📁 一个通用的隐性马尔可夫C代码库 开发环境:C语言 简要说明:这是一个通用的隐性马尔可夫C代码库
💻 C
字号:
/*******************************************************************************  author       : Bernd Wichern  filename     : ghmm/ghmm/cluster.c  created      : TIME: 10:55:33     DATE: Tue 02. June 1998  $Id: cluster.c,v 1.8 2002/02/24 10:52:01 achim Exp $Copyright (C) 1998-2001, ZAIK/ZPR, Universit鋞 zu K鰈nThis program is free software; you can redistribute it and/or modifyit under the terms of the GNU General Public License as published bythe Free Software Foundation; either version 2 of the License, or(at your option) any later version.This program is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See theGNU General Public License for more details.You should have received a copy of the GNU General Public Licensealong with this program; if not, write to the Free SoftwareFoundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*******************************************************************************/#include <stdio.h>#include "mprintf.h"#include "mes.h"#include "cluster.h"#include "rng.h"#include "model.h"#include "sequence.h"#include "reestimate.h"#include "foba.h"#include "const.h"#include "matrix.h"/*============================================================================*/int cluster_hmm(char *seq_file, char *mo_file, char *out_filename)  {# define CUR_PROC "cluster_hmm"  int res = -1, i, iter = 0, sq_number;  sequence_t *sq = NULL, **sq_vec = NULL;  long j, changes = 1;   long *oldlabel;  double log_p;  FILE *outfile = NULL;  cluster_t cl;  cl.mo = NULL;  cl.mo_seq = NULL;    if(!(outfile = mes_fopen(out_filename, "wt"))) {mes_proc(); goto STOP;}/*----------------------------------------------------------------------------*/  /* Allocate memory and read in data: Sequences and initial model */  sq_vec = sequence_read(seq_file, &sq_number);  if (!sq_vec[0]) {mes_proc(); goto STOP;}  if (sq_number > 1)     mes_prot("Warning: seq. file contains multiple seq. arrays. \                      Only first array is used for clustering\n");  sq = sq_vec[0];  fprintf(outfile, "Cluster Sequences\n");  sequence_print(outfile, sq);  cl.mo = model_read(mo_file, &cl.mo_number);  if (!cl.mo) {mes_proc(); goto STOP;}   if(!m_calloc(oldlabel, sq->seq_number)) {mes_proc(); goto STOP;}  for (i = 0; i < sq->seq_number; i++)    oldlabel[i] = (-1);  if(!m_calloc(cl.mo_seq, cl.mo_number)) {mes_proc(); goto STOP;}  for (i = 0; i < cl.mo_number; i++)    cl.mo_seq[i] = NULL;  if (model_check_compatibility(cl.mo, cl.mo_number)) {     mes_proc(); goto STOP;   } /*----------------------------------------------------------------------------*/  fprintf(outfile, "\nInitial Models:\n");  for (i = 0; i < cl.mo_number; i++)    model_print(outfile, cl.mo[i]);/*----------------------------------------------------------------------------*/  while(changes > 0) {     iter++;    /* Associate the sequences */    fprintf(outfile, "\nSequence, Best Model, logP of generating Seq.:\n");    for (j = 0; j < sq->seq_number; j++) {      sq->seq_label[j] = 	sequence_best_model(cl.mo, cl.mo_number, sq->seq[j], sq->seq_len[j],			    &log_p);          fprintf(outfile, "seq %ld, mo %ld, log p %.4f\n", j, 	      sq->seq_label[j], log_p);      if (sq->seq_label[j] == -1 || sq->seq_label[j] >= cl.mo_number) { 	/* No model fits! */ 	char *str =	  mprintf(NULL, 0, "Seq. %ld: sequence_best_model gives %d\n",		  j, sq->seq_label[j]); 	mes_prot(str); m_free(str); goto STOP;      }    }    if (cluster_avoid_empty_model(sq->seq_label, sq->seq_number, cl.mo_number))      { mes_proc(); goto STOP; }    changes = cluster_update_label(oldlabel, sq->seq_label, sq->seq_number);    fprintf(outfile, "%ld changes\n", changes);    fprintf(stdout, "\n*** %ld changes in iteration %d ***\n\n", changes, iter);    /* Reestimate models with the associated sequences */    if (changes > 0) {      if (cluster_update(&cl, sq)) {	mes_proc(); goto STOP;       }      fprintf(outfile, "\nGes. WS VOR %d.Reestimate:\n", iter);      cluster_print_likelihood(outfile, &cl);      for (i = 0; i < cl.mo_number; i++) {	if (reestimate_baum_welch(cl.mo[i], cl.mo_seq[i])) {	  char *str =  	    mprintf(NULL,0,"%d.reestimate false, mo[%d]\n", iter, i); 	  mes_prot(str);	m_free(str);	  /* model_print(stdout, cl.mo[i]); */	  goto STOP; 	}      }      fprintf(outfile, "\nGes. WS NACH %d.Reestimate:\n", iter);      cluster_print_likelihood(outfile, &cl);    } /* if changes */  } /* while *//*----------------------------------------------------------------------------*/  if (!cluster_out(&cl, sq, outfile, out_filename)) {    mes_proc(); goto STOP;  }        res = 0;STOP:  /* ...noch div. free! */  if (outfile) fclose(outfile);  return(res);# undef CUR_PROC}/* cluster_hmm *//*============================================================================*/int cluster_update(cluster_t *cl, sequence_t *sq) {#define CUR_PROC "cluster_update"  int i, res = -1;  long *seq_counter;  sequence_t *seq_t;  if(!m_calloc(seq_counter, cl->mo_number)) {mes_proc(); goto STOP;}  /* Fix the number of associated sequences */  for (i = 0; i < sq->seq_number; i++)     seq_counter[sq->seq_label[i]]++;  /* allocate memory block for block */  for (i = 0; i < cl->mo_number; i++) {    if (cl->mo_seq[i]) {      /* Important: No sequence_free here, otherwise are the original 	 sequences gone! */      sequence_clean(cl->mo_seq[i]);      m_free(cl->mo_seq[i]);    }    cl->mo_seq[i] = sequence_calloc(seq_counter[i]);    cl->mo_seq[i]->seq_number = 0; /* Counted later */  }  /* Set entries */  for (i = 0; i < sq->seq_number; i++) {    seq_t = cl->mo_seq[sq->seq_label[i]];    seq_t->seq_len[seq_t->seq_number] = sq->seq_len[i];    seq_t->seq[seq_t->seq_number] = sq->seq[i]; /* Pointer!!! */    seq_t->seq_label[seq_t->seq_number] = sq->seq_label[i];    seq_t->seq_number++;  }  res = 0; STOP:  m_free(seq_counter);  return(res);# undef CUR_PROC} /* cluster_update *//*============================================================================*/void cluster_print_likelihood(FILE *outfile, cluster_t *cl) {  double ges_prob = 0.0, mo_prob;  int i;  for (i = 0; i < cl->mo_number; i++) {    mo_prob = model_likelihood(cl->mo[i], cl->mo_seq[i]);     ges_prob += mo_prob;    fprintf(outfile, "mo %d (#Seq. %ld): %.4f\n", i, 	    cl->mo_seq[i]->seq_number, mo_prob);  }  fprintf(outfile, "Summe: %.4f\n\n", ges_prob);} /* cluster_print_likelihood *//*============================================================================*//* Prevents that empty models are sent out (no associated seqences) by    associating a random sequence. Since it's possible to produce an empty model   in this way, we swap the sequences until a nonempty model is produced. (This    could be a never-ending process and therefore it's only done 100 times). */int cluster_avoid_empty_model(long *seq_label, long seq_number, 			       int mo_number) {#define CUR_PROC "cluster_avoid_empty_model"  int i;  long j = 0;  long *counter;  char error = 1, change = 0;  int iter = 0;  /* Initialization */  if(!m_calloc(counter, mo_number)) {    mes_proc(); return (-1);  }  for (i = 0; i < mo_number; i++)    counter[i] = 0;  for (i = 0; i < seq_number; i++)     counter[seq_label[i]]++;  while (error && iter < 100) {    iter++;    error = 0;    /* Do we have empty models ? */    for (i = 0; i < mo_number; i++) {      if (counter[i] == 0) {	change = 1;	/* Choose a random sequence for an empty model */	j = m_int(gsl_rng_uniform(RNG) * (seq_number - 1));	/* The orginal model loses one sequence */	printf("!!\"avoid_empty_model\":Verschiebe Seq. %ld: %ld --> %d !!\n", 	       j, seq_label[j], i);	counter[seq_label[j]] --;	counter[i] = 1;	seq_label[j] = i;	      }      }    /* Is now everything OK ? */    if (change) {      for (i = 0; i < mo_number; i++) {	if (counter[i] <= 0) {	  error = 1;	  break;	}      }    }  } /* while */  if (error) return (-1);  else return 0;#undef CUR_PROC} /* cluster_avoid_empty_model *//*============================================================================*/int cluster_out(cluster_t *cl, sequence_t *sq, FILE *outfile,		     char *out_filename) {#define CUR_PROC "cluster_out"  int res = -1;  int i;  sequence_d_t *sqd = NULL;  fprintf(outfile, "\nFinal Models:\n");  for (i = 0; i < cl->mo_number; i++)    model_print(outfile, cl->mo[i]);  res = 0;  sequence_d_free(&sqd);  return(res);#undef CUR_PROC} /* cluster_out *//*============================================================================*/long cluster_update_label(long *oldlabel, long *seq_label, 			  long seq_number) {  long i, changes = 0;   for (i = 0; i < seq_number; i++)     if (oldlabel[i] != seq_label[i]) {      changes++;      oldlabel[i] = seq_label[i];    }  return changes;} /* cluster_update_label */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -