scluster.c

来自「General Hidden Markov Model Library 一个通用」· C语言 代码 · 共 935 行 · 第 1/2 页

C
935
字号
    }  /*--------------------------------------------------------------------------*/     res = 0;STOP:     /* Label STOP from ARRAY_[CM]ALLOC */#if POUT == 0    pthread_attr_destroy (&Attribute);  #endif  /* POUT */    /* XXX ...still div. free! XXX */     if (outfile)    fclose (outfile);  return (res);  # undef CUR_PROC}                               /* ghmm_scluster_hmm *//*============================================================================*/ int ghmm_scluster_t_free (scluster_t * scl){  int i;  #define CUR_PROC "ghmm_scluster_t_free"    mes_check_ptr (scl, return (-1));  if (!scl)    return (0);  for (i = 0; i <= scl->smo_number; i++) {    ghmm_cmodel_free (&(scl->smo[i]));    ghmm_cseq_free (&(scl->smo_seq[i]));  }  printf ("hier1\n");  m_free (scl->smo);  m_free (scl->smo_seq);  m_free (scl->seq_counter);  m_free (scl->smo_Z_MD);  m_free (scl->smo_Z_MAW);  return 0;  # undef CUR_PROC}/*============================================================================*/ int ghmm_scluster_out (scluster_t * cl, ghmm_cseq * sqd, FILE * outfile,                   char *argv[]){  #define CUR_PROC "ghmm_scluster_out"  int res = -1, i;  char filename[128];  char *out_filename = argv[3];  FILE * out_model = NULL;    /*fprintf(outfile, "\nFinal Models:\n");    for (i = 0; i < cl->smo_number; i++) {    fprintf(outfile, "smodel[%d]:\n", i);    ghmm_cmodel_print(outfile, cl->smo[i]);    fprintf(outfile, "Sequences of smodel[%d] (# %ld, total weight %.0f)\n",    i, cl->smo_seq[i]->seq_number,     cl->smo_seq[i]->total_w);*/              /*for (k = 0; k < sqd->seq_number; k++) {*/       /*if (sqd->seq_label[k] == i) {*/       /* fuer Wetterdaten: Numerierung von 1 - .. */       /*fprintf(outfile, "\t%4d\t|%.0f|\t", k+1, sqd->seq_w[k]);*/       /*ighmm_cvector_print(outfile, sqd->seq[k], sqd->seq_len[k]," "," ","");*/       /*}     */       /*}*/  /*    fprintf(outfile, "(%ld sequences)\n\n", cl->seq_counter[i]);    }*/sprintf (filename, "%s.smo", out_filename);  if (!(out_model = ighmm_mes_fopen (filename, "wt"))) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  ghmm_scluster_print_header (out_model, argv);  for (i = 0; i < cl->smo_number; i++) {    fprintf (out_model, "#trained smodel[%d]:\n", i);    ghmm_cmodel_print (out_model, cl->smo[i]);  }  fclose (out_model);      /* Output from all sequences in HMM-format; different clusters       make seperate lists */     fclose (out_model);  sprintf (filename, "%s.sqd", out_filename);  if (!(out_model = ighmm_mes_fopen (filename, "wt"))) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  ghmm_scluster_print_header (out_model, argv);  for (i = 0; i < cl->smo_number; i++) {    if (cl->smo_seq[i] != NULL)      ghmm_cseq_print (out_model, cl->smo_seq[i], 0);  }      /* Output from all sequences in one row with clusterlabel */     /* ghmm_cseq_print(out_model, sqd, 0); */         /* Output: The number of sequences and sequence weights pro cluster,       respektively (for the generator) */     fclose (out_model);  sprintf (filename, "%s.numbers", out_filename);  if (!(out_model = ighmm_mes_fopen (filename, "wt"))) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  ghmm_scluster_print_header (out_model, argv);  fprintf (out_model, "numbers = {\n");  fprintf (out_model,            "# Clusterung mit Gewichten --> in BS/10, sonst Anzahl Seqs.\n");      /* Weights */     if (cl->smo_seq[0]->total_w > cl->smo_seq[0]->seq_number) {    for (i = 0; i < cl->smo_number - 1; i++)      fprintf (out_model, "%.0f,\n", 0.1 * cl->smo_seq[i]->total_w);    fprintf (out_model, "%.0f;\n};",              0.1 * cl->smo_seq[cl->smo_number - 1]->total_w);  }      /* Number */     else {    for (i = 0; i < cl->smo_number - 1; i++)      fprintf (out_model, "%ld,\n", cl->seq_counter[i]);    fprintf (out_model, "%ld;\n};", cl->seq_counter[cl->smo_number - 1]);  }  res = 0;STOP:if (out_model)    fclose (out_model);  return (res);  #undef CUR_PROC}                               /* ghmm_scluster_out *//*============================================================================*/ /* Memory for sequences for each model is allocated only once and not done with   realloc for each sequence as before. */ int ghmm_scluster_update (scluster_t * cl, ghmm_cseq * sqd){  #define CUR_PROC "ghmm_scluster_update"  int i;  ghmm_cseq * seq_ptr;      /* Allocate memoery block by block */     for (i = 0; i < cl->smo_number; i++) {    if (cl->smo_seq[i]) {              /* Important: No ghmm_dseq_free here, because then the originals  will be lost. */         ghmm_cseq_clean (cl->smo_seq[i]);      m_free (cl->smo_seq[i]);    }    if (cl->seq_counter[i] != 0) {      cl->smo_seq[i] = ghmm_cseq_calloc (cl->seq_counter[i]);      cl->smo_seq[i]->seq_number = 0;  /* counted below */    }        else      cl->smo_seq[i] = NULL;  }      /* Set entries */     for (i = 0; i < sqd->seq_number; i++) {    seq_ptr = cl->smo_seq[sqd->seq_label[i]];    seq_ptr->seq_len[seq_ptr->seq_number] = sqd->seq_len[i];    seq_ptr->seq_id[seq_ptr->seq_number] = sqd->seq_id[i];    seq_ptr->seq[seq_ptr->seq_number] = sqd->seq[i];   /* Pointer!!! */    seq_ptr->seq_label[seq_ptr->seq_number] = sqd->seq_label[i];    seq_ptr->seq_w[seq_ptr->seq_number] = sqd->seq_w[i];    seq_ptr->seq_number++;    seq_ptr->total_w += sqd->seq_w[i];  }  return (0);  # undef CUR_PROC}                               /* ghmm_scluster_update *//*============================================================================*/ void ghmm_scluster_print_likelihood (FILE * outfile, scluster_t * cl){  double total_Z_MD = 0.0, total_Z_MAW = 0.0;  int i;  for (i = 0; i < cl->smo_number; i++) {    total_Z_MD += cl->smo_Z_MD[i];    total_Z_MAW += cl->smo_Z_MAW[i];    fprintf (outfile, "smo %d\t(#Seq. %4ld):\t", i, cl->seq_counter[i]);    if (CLASSIFY == 0)      fprintf (outfile, "ZMD:%8.2f", cl->smo_Z_MD[i]);        else      fprintf (outfile, "ZMAW:%8.2f", cl->smo_Z_MAW[i]);    fprintf (outfile, "\n");  }  fprintf (outfile, "sum\t");  if (CLASSIFY == 0)    fprintf (outfile, "ZMD: %12.5f", total_Z_MD);    else    fprintf (outfile, "ZMAW: %12.5f", total_Z_MAW);  fprintf (outfile, "\n\n");  if (CLASSIFY == 0)    printf ("total error function (ZMD): %15.4f\n", total_Z_MD);    else    printf ("total error function (ZMAW): %15.4f\n", total_Z_MAW);}                              /* ghmm_scluster_print_likelihood *//*============================================================================*/ /* Avoids empty models going out as outputs by assigning them a random    sequence. This may lead to a produce of a new empty model - therefore   change out sequences until a non empty model is found. (Quit after 100    iterations to avoid a infinite loop). */ int ghmm_scluster_avoid_empty_smodel (ghmm_cseq * sqd, scluster_t * cl){  #define CUR_PROC "ghmm_scluster_avoid_empty_smodel"  int i, best_smo;  long i_old, j = 0;  char error = 1, change = 0;  int iter = 0;  double log_p_minus, log_p_plus;  double *result = NULL;      /* If only one Model or one sequencet: Nothing to be done */     if (sqd->seq_number < 2 || cl->smo_number < 2)    return (0);  if (CLASSIFY == 1)    ARRAY_CALLOC (result, cl->smo_number);  while (error && iter < 100) {    iter++;    error = 0;    for (i = 0; i < cl->smo_number; i++) {      if (cl->seq_counter[i] <= 1) {        change = 1;                  /* Choose a random sequence for an empty model */           j = m_int (GHMM_RNG_UNIFORM (RNG) * (sqd->seq_number - 1));                  /* If it's not possible to produce a sequence for an empty model: Quit */           if (ghmm_cmodel_logp              (cl->smo[i], sqd->seq[j], sqd->seq_len[j], &log_p_plus) == -1)          continue;        if (CLASSIFY == 1) {          best_smo =            ghmm_smap_bayes (cl->smo, result, cl->smo_number, sqd->seq[j],                        sqd->seq_len[j]);          if (best_smo == -1)            continue;        }        i_old = sqd->seq_label[j];                  /* updates  */           printf ("!!\"avoid_empty_model\": move seq. %ld: %ld --> %d !!\n",                  j, i_old, i);        cl->seq_counter[i_old]--;        cl->seq_counter[i]++;        sqd->seq_label[j] = i;                  /* smo_Z_MD update */           if (ghmm_cmodel_logp              (cl->smo[i_old], sqd->seq[j], sqd->seq_len[j],               &log_p_minus) == -1)           {          GHMM_LOG(LCONVERTED, "ghmm_cmodel_logp returns -1!\n");          goto STOP;          };        cl->smo_Z_MD[i_old] -= sqd->seq_w[j] * log_p_minus;        cl->smo_Z_MD[i] += sqd->seq_w[j] * log_p_plus;                  /* smo_Z_MAW update */           if (CLASSIFY == 1) {          cl->smo_Z_MAW[i_old] -= sqd->seq_w[j] * result[i_old];          cl->smo_Z_MAW[i] += sqd->seq_w[j] * result[i];        }      }    }          /* Is now everything OK ? */       if (change) {      for (i = 0; i < cl->smo_number; i++) {        if (cl->seq_counter[i] <= 1) {          error = 1;          break;        }      }    }  }                            /* while */  if (!error)    return (0);STOP:  if (result){    m_free (result);  }   return (-1);  #undef CUR_PROC}                               /* ghmm_scluster_avoid_empty_smodel *//*============================================================================*/ long ghmm_scluster_update_label (long *oldlabel, long *seq_label, long seq_number,                            long *smo_changed){  long i, changes = 0;  for (i = 0; i < seq_number; i++)    if (oldlabel[i] != seq_label[i]) {      changes++;      smo_changed[oldlabel[i]] = smo_changed[seq_label[i]] = 1;      oldlabel[i] = seq_label[i];    }  return changes;}                              /* ghmm_scluster_update_label *//*============================================================================*/   /* Determines from an already calculated probability matrix, which model    fits best to the sequence with the ID seq_id. */ int ghmm_scluster_best_model (scluster_t * cl, long seq_id, double **all_log_p,                         double *log_p){  #define CUR_PROC "ghmm_scluster_best_model"  int i, smo_id;  double save = -DBL_MAX;  *log_p = -DBL_MAX;  smo_id = -1;      /* MD-Classify: argmax_i (log(p(O | \lambda_i ))) */     if (CLASSIFY == 0) {    for (i = 0; i < cl->smo_number; i++) {      if (all_log_p[i][seq_id] == GHMM_PENALTY_LOGP)        continue;               /* model and seq. don't fit */      if (*log_p < all_log_p[i][seq_id]) {        *log_p = all_log_p[i][seq_id];        smo_id = i;      }    }  }      /* MAW-Classify: argmax_i (log(p(O | \lambda_i )) + log(p( \lambda_i))) */     else {    for (i = 0; i < cl->smo_number; i++) {      if (cl->smo[i]->prior == -1) {        GHMM_LOG(LCONVERTED, "Error! Need model prior for MAW-classification\n");      }      if (cl->smo[i]->prior != 0) {        if (all_log_p[i][seq_id] == GHMM_PENALTY_LOGP)          continue;             /* model and seq. don't fit */        if (save < all_log_p[i][seq_id] + log (cl->smo[i]->prior)) {          save = all_log_p[i][seq_id] + log (cl->smo[i]->prior);          *log_p = all_log_p[i][seq_id];          smo_id = i;        }      }    }  }  return (smo_id);  #undef CUR_PROC}                               /* ghmm_scluster_best_model *//*============================================================================*/ void ghmm_scluster_prob (ghmm_cmodel_baum_welch_context * cs){  int i;      /*printf("seq_num = %d\n", cs->sqd->seq_number);*/    for (i = 0; i < cs->sqd->seq_number; i++)    if (ghmm_cmodel_logp         (cs->smo, cs->sqd->seq[i], cs->sqd->seq_len[i],          &(cs->logp[i])) == -1)      cs->logp[i] = (double) GHMM_PENALTY_LOGP;     /*  Penalty costs */} /* ghmm_scluster_prob */   /*============================================================================*/ int ghmm_scluster_random_labels (ghmm_cseq * sqd, int smo_number){  #define CUR_PROC "ghmm_scluster_random_labels"  int j, label;  for (j = 0; j < sqd->seq_number; j++) {    label = m_int (GHMM_RNG_UNIFORM (RNG) * (smo_number - 1));    sqd->seq_label[j] = label;  }  return (0);  # undef CUR_PROC}                               /* ghmm_scluster_random_labels *//*============================================================================*/ int ghmm_scluster_log_aposteriori (scluster_t * cl, ghmm_cseq * sqd,                               int seq_id, double *log_apo){  #define CUR_PROC "ghmm_scluster_log_aposteriori"  double *result = NULL;  int best_smo = -1;  ARRAY_CALLOC (result, cl->smo_number);  best_smo =    ghmm_smap_bayes (cl->smo, result, cl->smo_number, sqd->seq[seq_id],                sqd->seq_len[seq_id]);  if (best_smo == -1) {    GHMM_LOG(LCONVERTED, "best_smo == -1 !\n");    goto STOP;  }    else    *log_apo = result[best_smo];      /* TEST */     /*  printf("result: ");       for (i = 0; i< cl->smo_number; i++)       printf("%.2f ", result[i]); printf("\n");     */ STOP:m_free (result);  return best_smo;  # undef CUR_PROC}                               /* ghmm_scluster_log_aposteriori *//*============================================================================*/ void ghmm_scluster_print_header (FILE * file, char *argv[]){  time_t zeit;  time (&zeit);  fprintf (file, "# Date: %s# scluster:\n", ctime ((const time_t *) &zeit));  fprintf (file, "# Sequence File: %s\n", argv[1]);  fprintf (file, "# Model File: %s\n", argv[2]);  fprintf (file, "# Start Partion Label: ");  switch (atoi (argv[4])) {  case 0:    fprintf (file, "SP_BEST (best model)\n\n");    break;  case 1:    fprintf (file, "NO_SP (no start partition)\n\n");    break;  case 2:    fprintf (file, "SP_KM (partition from k-means)\n\n");    break;  case 3:    fprintf (file, "SP_ZUF (random start partition)\n\n");    break;  default:    fprintf (file, "???\n\n");  }}#if POUT == 0#undef THREADS#endif  /* HAVE_LIBPTHREAD */#undef POUT#endif /* GHMM_OBSOLETE */

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?