scluster.c

来自「一个通用的隐性马尔可夫C代码库开发环境:C语言简要说明:这是一个通用的」· C语言代码 · 共 768 行 · 第 1/2 页
768 行
/*******************************************************************************
  author       : Bernhard Knab
  filename     : ghmm/ghmm/scluster.c
  created      : TIME: 15:47:54     DATE: Tue 16. November 1999
  $Id: scluster.c,v 1.19 2003/12/19 15:06:17 cic99 Exp $

Copyright (C) 1998-2001, ZAIK/ZPR, Universit鋞 zu K鰈n

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Library General Public License for more details.

You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA


*******************************************************************************/

#ifdef WIN32
#  include "win_config.h"
#endif

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include <stdio.h>
#undef HAVE_LIBPTHREAD
#ifdef HAVE_LIBPTHREAD
# include <pthread.h>
#endif /* HAVE_LIBPTHREAD */
#include <float.h>
#include <math.h>
#include <time.h>
#include "mprintf.h"
#include "mes.h"
#include "scluster.h"
#include "smodel.h"
#include "sreestimate.h"
#include "sfoba.h"
#include "rng.h"
#include "sequence.h"
#include "const.h"
#include "matrix.h"
#include "vector.h"
#include "smap_classify.h"

#ifdef HAVE_LIBPTHREAD
/* switch for parallel mode: (1) = sequential, (0) = parallel */ 
# define POUT 0
/* number of parallel threads */
# define THREADS 4
#else
# define POUT 1
#endif /* HAVE_LIBPTHREAD */
#define POUT 1

/*============================================================================*/

int scluster_hmm(char* argv[]) {
# define CUR_PROC "scluster_hmm"
  char *seq_file = argv[1],  *smo_file = argv[2], *out_filename = argv[3];
  int labels = atoi(argv[4]);
  int res = -1, i, iter = 0, sqd_number, idummy;
  sequence_d_t *sqd = NULL;
  sequence_d_t **sqd_vec = NULL; /* only temp. pointer */
  long j, changes = 1; 
  long *oldlabel, *smo_changed;
  double log_p, log_apo;
  double **all_log_p = NULL; /* for storing all log_p */
  FILE *outfile = NULL;
  char *str;
  char filename[1024];
  scluster_t cl;
  double eps_bw, q;
  int max_iter_bw;
  /* sreestimate_baum_welch needs this structure (introduced for parallel mode) */
  smosqd_t *cs;
#if POUT == 0
  int *return_value;
  pthread_t *tid;
  int perror;
  pthread_attr_t Attribute;
#endif /* POUT */
  cl.smo = NULL;
  cl.smo_seq = NULL;
  cl.seq_counter = NULL;
  cl.smo_Z_MD = NULL;
  cl.smo_Z_MAW = NULL;

  /* -----------------initialization -------------------------*/
  fprintf(stdout, "Clustering seqs. \"%s\"\nwith ", seq_file);
  if (CLASSIFY == 0)    fprintf(stdout, "MD-Classification\n");
  else                              fprintf(stdout, "MAW-Classification\n");
  fflush(stdout);
  /*-----------open output file----------------------------------*/
  sprintf(filename, "%s%s", out_filename, ".cl");
  if(!(outfile = mes_fopen(filename, "wt"))) {mes_proc(); goto STOP;}

/*--------------Memory allocation and data reading----------------------------*/

  /* 1 sequence array and initial models */
  scluster_print_header(outfile, argv);
  /*--- memory alloc and read data ----------------------------*/
  sqd_vec = sequence_d_read(seq_file, &sqd_number);
  if (!sqd_vec) {mes_proc(); goto STOP;}
  sqd = sqd_vec[0];
  cl.smo = smodel_read(smo_file, &cl.smo_number);
  if (!cl.smo) {mes_proc(); goto STOP;}
  /* if labels == 0 || labels == 1: no startlabels needed. */
  /* if `labels` =  3: random start labels */
  if (labels == 3) {
    fprintf(outfile, "(random start partition)n");
    fprintf(stdout, "(Random start partition...)\n");
    scluster_random_labels(sqd, cl.smo_number);
  }  

  if(!m_calloc(oldlabel, sqd->seq_number)) {mes_proc(); goto STOP;}
  for (i = 0; i < sqd->seq_number; i++)
    oldlabel[i] = (-1);
  if(!m_calloc(cl.smo_seq, cl.smo_number)) {mes_proc(); goto STOP;}
  if(!m_calloc(cl.seq_counter, cl.smo_number)) {mes_proc(); goto STOP;}
  if(!m_calloc(cl.smo_Z_MD, cl.smo_number)) {mes_proc(); goto STOP;}
  if(!m_calloc(cl.smo_Z_MAW, cl.smo_number)) {mes_proc(); goto STOP;}
  all_log_p = matrix_d_alloc(cl.smo_number, (int) sqd->seq_number);
  if (!all_log_p) {mes_proc(); goto STOP;}
  /*if (smodel_check_compatibility(cl.smo, cl.smo_number)) { 
    mes_proc(); goto STOP; 
    }*/ 
  if(!m_calloc(smo_changed, cl.smo_number)) {mes_proc(); goto STOP;}
  for (i = 0; i < cl.smo_number; i++) {
    cl.smo_seq[i] = NULL;
    smo_changed[i] = 1;
  }
  /* uninformative model prior */
  if (CLASSIFY == 1) 
    for (i = 0; i < cl.smo_number; i++) 
      cl.smo[i]->prior = 1/(double) cl.smo_number;
  /*--------for parallel mode  --------------------*/
#if POUT == 0
  /* id for threads */
  if(!m_calloc(tid, cl.smo_number)) {mes_proc(); goto STOP;} 
#endif /* POUT */
  /* data structure for  threads */
  if(!m_calloc(cs, cl.smo_number)) {mes_proc(); goto STOP;} 
  for (i = 0; i < cl.smo_number; i++)
    cs[i].smo = cl.smo[i]; 
  /* return values for each thread */
#if POUT == 0
  if(!m_calloc(return_value, cl.smo_number)) {mes_proc(); goto STOP;} 
#endif /* POUT */
#ifdef HAVE_LIBPTHREAD
  pthread_attr_init(&Attribute);
  pthread_attr_setscope(&Attribute, PTHREAD_SCOPE_SYSTEM);	
#endif /* HAVE_LIBPTHREAD */

  /* eps_bw: stopping criterion in baum-welch
     max_iter_bw: max. number of baum-welch iterations */
  eps_bw = 0.0;    /* not used */
  q = 0.1;         /* ??? */
  max_iter_bw = 20; /*MAX_ITER_BW; */
  
/*------------------------main loop-------------------------------------------*/
  /* do it until no sequence changes model; 
     attention: error function values are not used directly as a stopping criterion */
  while(changes > 0 || eps_bw > EPS_ITER_BW || max_iter_bw < MAX_ITER_BW) { 
    iter++;
    /* reset error functions and counters */
    for (i = 0; i < cl.smo_number; i++) {
      cl.seq_counter[i] = 0;
      cl.smo_Z_MD[i] = 0.0;
      cl.smo_Z_MAW[i] = 0.0;
    }

    /* set pointer of cs to sqd-field and matrix of all_logp values */
    for (i = 0; i < cl.smo_number; i++) {      
      cs[i].sqd = sqd; /* all seqs. ! */
      cs[i].logp = all_log_p[i];
    }
    /* ------------calculate logp for all seqs. and all models --------------*/
#if POUT
    /* sequential version */
    for (i = 0; i < cl.smo_number; i++) {
      if (!smo_changed[i]) continue;
      scluster_prob(&cs[i]);
    }
#else
    /* parallel version */
    i = 0;
    while (i < cl.smo_number) {
      for (j = i; j < m_min(cl.smo_number,i+THREADS); j++) {
	if (!smo_changed[j]) continue;
	if ((perror = pthread_create(&tid[j],
				    &Attribute,
				    (void*(*)(void*))scluster_prob,
				    (void*)&cs[j]))){
	  str = mprintf(NULL,0,
		"pthread_create returns false (step %d, smo[%d])\n",iter,j); 

	  mes_prot(str); m_free(str); goto STOP; 
	}
      }
      for (j = i; j < m_min(cl.smo_number,i+THREADS); j++) {
	if (smo_changed[j]) pthread_join(tid[j], NULL);
      }
      i = j;
    }
#endif


    if (iter == 1 && labels > 1) {
      /* use sequence labels as start labels */
      fprintf(outfile, "(sequences with start labels!)\n");
      fprintf(stdout, "(sequences with start labels!)\n");
    } 
    
    /*----------- classification,  sequence labeling and error function -----------*/
    if (iter == 1 && labels == 1) {
      for (i = 0; i < cl.smo_number; i++) {
	cl.seq_counter[i] = sqd->seq_number;
      }
    }
    else {
    for (j = 0; j < sqd->seq_number; j++) {
      if (iter > 1 || labels == 0) 
	    /* classification: set seq_label to ID of best_model  */
	    sqd->seq_label[j] = scluster_best_model(&cl, j, all_log_p, &log_p);
      if (sqd->seq_label[j] == -1 || sqd->seq_label[j] >= cl.smo_number) { 
	    /* no model fits! What to do?  hack: use arbitrary model ! */
	    str =  mprintf(NULL, 0, "Warning: seq. %ld, ID %.0f: scluster_best_model returns %d\n",
		       j, sqd->seq_id[j], sqd->seq_label[j]); 
	     mes_prot(str); m_free(str);
	     sqd->seq_label[j] = j % cl.smo_number;
	     /* goto STOP; */
      }
      cl.seq_counter[sqd->seq_label[j]]++; 
      /* add to error function value with seq. weights  */
      /* 1. Z_MD */
      cl.smo_Z_MD[sqd->seq_label[j]] += sqd->seq_w[j] * all_log_p[sqd->seq_label[j]][j];  
      /* 2. Z_MAW */
      if (CLASSIFY == 1) { 
	     idummy = scluster_log_aposteriori(&cl, sqd, j, &log_apo);
	  if (idummy == -1) {
	     str = mprintf(NULL, 0 , "Warn: no model fits to Seq %10.0f, use PENALTY_LOGP\n", sqd->seq_id[j]);	     
	  mes_prot(str); m_free(str);
	  cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * PENALTY_LOGP;
	  continue;
	}
	if (idummy != sqd->seq_label[j]) {
	  printf("Warn: best_model = %ld; best_bayes(logapo) = %d\n", 
		  sqd->seq_label[j], idummy);	
	}
	cl.smo_Z_MAW[sqd->seq_label[j]] += sqd->seq_w[j] * log_apo;
      }
    }   /* for (j = 0; j < sqd->seq_number; j++) */
    }  /* else */

    if (!(iter == 1 && labels == 1)) {
      if (scluster_avoid_empty_smodel(sqd, &cl)== -1) {mes_proc(); goto STOP;}
      for (i = 0; i < cl.smo_number; i++) smo_changed[i] = 0;
      changes = scluster_update_label(oldlabel, sqd->seq_label, sqd->seq_number,
				      smo_changed);
    }
    fprintf(outfile, "%ld changes in iteration %d \n", changes, iter);
    fprintf(stdout, "\n*** %ld changes in iteration %d ***\n", changes, iter);

    /* NEW: If no changes anymore: increase max_iter
       ( Alternative: change eps ) */
    if (changes == 0 && max_iter_bw < MAX_ITER_BW) {
      max_iter_bw = MAX_ITER_BW;
      changes = 1; /* so that reestimated and assigned again  */
      for (i = 0; i < cl.smo_number; i++) smo_changed[i] = 1;
      fprintf(outfile, "max_iter_bw := %d\n", max_iter_bw);
      fprintf(stdout, "*** max_iter_bw := %d ***\n", max_iter_bw);
    }
    
    /* -------Reestimate all models with assigned sequences---------------- */
    if (changes > 0) {
      if (!(iter == 1 && labels == 1))
	if (scluster_update(&cl, sqd)) { mes_proc(); goto STOP; }
      
      /* TEST: Write out k-Means-Start-Clusterung */
      if (iter == 1 && labels == 2) {
	for (i = 0; i < cl.smo_number; i++) {
	  if (cl.smo_seq[i] != NULL)
	    sequence_d_print(stdout, cl.smo_seq[i], 0);  
	}
      }

      /*if (labels == -1) {
	fprintf(outfile, "(only determining the best model for each seq.!)\n");
	fprintf(stdout, "(only determining the best model for each seq.!)\n");
	break;
      }
	*/      
      
      fprintf(outfile, "\ntotal Prob. before %d.reestimate:\n", iter);
      scluster_print_likelihood(outfile, &cl);
      
      for (i = 0; i < cl.smo_number; i++) {
		cs[i].smo = cl.smo[i]; 
		if (!(iter == 1 && labels == 1)) 
			cs[i].sqd = cl.smo_seq[i];

		cs[i].logp = &cl.smo_Z_MD[i]; 
		cs[i].eps = eps_bw;
		cs[i].max_iter = max_iter_bw;
      }


#if POUT
      /* sequential version */
      for (i = 0; i < cl.smo_number; i++) {	
	printf("SMO %d\n", i);
	if (!smo_changed[i]) continue;
	if (cs[i].sqd == NULL)
	  mes(MES_WIN, "cluster %d empty, no reestimate!\n", i);
	else if (sreestimate_baum_welch(&cs[i]) == -1) {
	  str = mprintf(NULL,0,"%d.reestimate false, smo[%d]\n", iter, i); 
	  mes_prot(str); m_free(str);
	  /* model_print(stdout, cl.mo[i]); */
	  goto STOP; 
	}
      }
#else
      /* parallel version */     
      i = 0;
      while (i < cl.smo_number) {
	for (j = i; j < m_min(cl.smo_number, i+THREADS); j++) {
	  if (!smo_changed[j]) continue;
	  if (cs[j].sqd == NULL)
	    mes(MES_WIN, "cluster %d empty, no reestimate!\n", j);
	  else if ((perror = pthread_create(&tid[j],
					   &Attribute,
					   (void*(*)(void*))sreestimate_baum_welch,
					   (void*)&cs[j]))) {
	    str = mprintf(NULL,0, 
			  "pthread_create returns false (step %d, smo[%d])\n",
			  iter, j); 
	    mes_prot(str); m_free(str);
	    goto STOP; 
	  }
	}
	for (j = i; j < m_min(cl.smo_number, i+THREADS); j++) {
	  if (!smo_changed[j]) continue;
	  if (cs[j].sqd != NULL) {
	    pthread_join(tid[j], (void**) &return_value[j]);
	    if (return_value[j] == -1) {
	      str = mprintf(NULL,0,"%d.reestimate false, smo[%d]\n", iter, j); 
	      mes_prot(str); m_free(str);
	      goto STOP; 
	    }
	  }
	}
	i = j;
      }
#endif

      /* update model priors */
    if (CLASSIFY == 1) {
	  if (sqd->total_w == 0) {
	    mes_prot("weights = 0!\n"); goto STOP;
      }
	  for (i = 0; i < cl.smo_number; i++) 
	    cl.smo[i]->prior = cl.smo_seq[i]->total_w / sqd->total_w;      
    }

      fprintf(outfile, "\ntotal Prob. after %d.reestimate:\n", iter);
      scluster_print_likelihood(outfile, &cl);
    } /* if changes ... end reestimate */
  } /* while */
  
/*------------------- OUTPUT   -----------------------------------------------*/
  
  if (scluster_out(&cl, sqd, outfile, argv) == -1) 
    { mes_proc(); goto STOP; }
scluster.c - 源码说明

本页面展示了「一个通用的隐性马尔可夫C代码库开发环境:C语言简要说明:这是一个通用的隐性马尔可夫C代码库」中的 scluster.c 源码文件，采用 C语言编程语言编写，共 768 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与马尔可夫相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?