scluster.c
来自「General Hidden Markov Model Library 一个通用」· C语言 代码 · 共 935 行 · 第 1/2 页
C
935 行
} /*--------------------------------------------------------------------------*/ res = 0;STOP: /* Label STOP from ARRAY_[CM]ALLOC */#if POUT == 0 pthread_attr_destroy (&Attribute); #endif /* POUT */ /* XXX ...still div. free! XXX */ if (outfile) fclose (outfile); return (res); # undef CUR_PROC} /* ghmm_scluster_hmm *//*============================================================================*/ int ghmm_scluster_t_free (scluster_t * scl){ int i; #define CUR_PROC "ghmm_scluster_t_free" mes_check_ptr (scl, return (-1)); if (!scl) return (0); for (i = 0; i <= scl->smo_number; i++) { ghmm_cmodel_free (&(scl->smo[i])); ghmm_cseq_free (&(scl->smo_seq[i])); } printf ("hier1\n"); m_free (scl->smo); m_free (scl->smo_seq); m_free (scl->seq_counter); m_free (scl->smo_Z_MD); m_free (scl->smo_Z_MAW); return 0; # undef CUR_PROC}/*============================================================================*/ int ghmm_scluster_out (scluster_t * cl, ghmm_cseq * sqd, FILE * outfile, char *argv[]){ #define CUR_PROC "ghmm_scluster_out" int res = -1, i; char filename[128]; char *out_filename = argv[3]; FILE * out_model = NULL; /*fprintf(outfile, "\nFinal Models:\n"); for (i = 0; i < cl->smo_number; i++) { fprintf(outfile, "smodel[%d]:\n", i); ghmm_cmodel_print(outfile, cl->smo[i]); fprintf(outfile, "Sequences of smodel[%d] (# %ld, total weight %.0f)\n", i, cl->smo_seq[i]->seq_number, cl->smo_seq[i]->total_w);*/ /*for (k = 0; k < sqd->seq_number; k++) {*/ /*if (sqd->seq_label[k] == i) {*/ /* fuer Wetterdaten: Numerierung von 1 - .. */ /*fprintf(outfile, "\t%4d\t|%.0f|\t", k+1, sqd->seq_w[k]);*/ /*ighmm_cvector_print(outfile, sqd->seq[k], sqd->seq_len[k]," "," ","");*/ /*} */ /*}*/ /* fprintf(outfile, "(%ld sequences)\n\n", cl->seq_counter[i]); }*/sprintf (filename, "%s.smo", out_filename); if (!(out_model = ighmm_mes_fopen (filename, "wt"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } ghmm_scluster_print_header (out_model, argv); for (i = 0; i < cl->smo_number; i++) { fprintf (out_model, "#trained smodel[%d]:\n", i); ghmm_cmodel_print (out_model, cl->smo[i]); } fclose (out_model); /* Output from all sequences in HMM-format; different clusters make seperate lists */ fclose (out_model); sprintf (filename, "%s.sqd", out_filename); if (!(out_model = ighmm_mes_fopen (filename, "wt"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } ghmm_scluster_print_header (out_model, argv); for (i = 0; i < cl->smo_number; i++) { if (cl->smo_seq[i] != NULL) ghmm_cseq_print (out_model, cl->smo_seq[i], 0); } /* Output from all sequences in one row with clusterlabel */ /* ghmm_cseq_print(out_model, sqd, 0); */ /* Output: The number of sequences and sequence weights pro cluster, respektively (for the generator) */ fclose (out_model); sprintf (filename, "%s.numbers", out_filename); if (!(out_model = ighmm_mes_fopen (filename, "wt"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } ghmm_scluster_print_header (out_model, argv); fprintf (out_model, "numbers = {\n"); fprintf (out_model, "# Clusterung mit Gewichten --> in BS/10, sonst Anzahl Seqs.\n"); /* Weights */ if (cl->smo_seq[0]->total_w > cl->smo_seq[0]->seq_number) { for (i = 0; i < cl->smo_number - 1; i++) fprintf (out_model, "%.0f,\n", 0.1 * cl->smo_seq[i]->total_w); fprintf (out_model, "%.0f;\n};", 0.1 * cl->smo_seq[cl->smo_number - 1]->total_w); } /* Number */ else { for (i = 0; i < cl->smo_number - 1; i++) fprintf (out_model, "%ld,\n", cl->seq_counter[i]); fprintf (out_model, "%ld;\n};", cl->seq_counter[cl->smo_number - 1]); } res = 0;STOP:if (out_model) fclose (out_model); return (res); #undef CUR_PROC} /* ghmm_scluster_out *//*============================================================================*/ /* Memory for sequences for each model is allocated only once and not done with realloc for each sequence as before. */ int ghmm_scluster_update (scluster_t * cl, ghmm_cseq * sqd){ #define CUR_PROC "ghmm_scluster_update" int i; ghmm_cseq * seq_ptr; /* Allocate memoery block by block */ for (i = 0; i < cl->smo_number; i++) { if (cl->smo_seq[i]) { /* Important: No ghmm_dseq_free here, because then the originals will be lost. */ ghmm_cseq_clean (cl->smo_seq[i]); m_free (cl->smo_seq[i]); } if (cl->seq_counter[i] != 0) { cl->smo_seq[i] = ghmm_cseq_calloc (cl->seq_counter[i]); cl->smo_seq[i]->seq_number = 0; /* counted below */ } else cl->smo_seq[i] = NULL; } /* Set entries */ for (i = 0; i < sqd->seq_number; i++) { seq_ptr = cl->smo_seq[sqd->seq_label[i]]; seq_ptr->seq_len[seq_ptr->seq_number] = sqd->seq_len[i]; seq_ptr->seq_id[seq_ptr->seq_number] = sqd->seq_id[i]; seq_ptr->seq[seq_ptr->seq_number] = sqd->seq[i]; /* Pointer!!! */ seq_ptr->seq_label[seq_ptr->seq_number] = sqd->seq_label[i]; seq_ptr->seq_w[seq_ptr->seq_number] = sqd->seq_w[i]; seq_ptr->seq_number++; seq_ptr->total_w += sqd->seq_w[i]; } return (0); # undef CUR_PROC} /* ghmm_scluster_update *//*============================================================================*/ void ghmm_scluster_print_likelihood (FILE * outfile, scluster_t * cl){ double total_Z_MD = 0.0, total_Z_MAW = 0.0; int i; for (i = 0; i < cl->smo_number; i++) { total_Z_MD += cl->smo_Z_MD[i]; total_Z_MAW += cl->smo_Z_MAW[i]; fprintf (outfile, "smo %d\t(#Seq. %4ld):\t", i, cl->seq_counter[i]); if (CLASSIFY == 0) fprintf (outfile, "ZMD:%8.2f", cl->smo_Z_MD[i]); else fprintf (outfile, "ZMAW:%8.2f", cl->smo_Z_MAW[i]); fprintf (outfile, "\n"); } fprintf (outfile, "sum\t"); if (CLASSIFY == 0) fprintf (outfile, "ZMD: %12.5f", total_Z_MD); else fprintf (outfile, "ZMAW: %12.5f", total_Z_MAW); fprintf (outfile, "\n\n"); if (CLASSIFY == 0) printf ("total error function (ZMD): %15.4f\n", total_Z_MD); else printf ("total error function (ZMAW): %15.4f\n", total_Z_MAW);} /* ghmm_scluster_print_likelihood *//*============================================================================*/ /* Avoids empty models going out as outputs by assigning them a random sequence. This may lead to a produce of a new empty model - therefore change out sequences until a non empty model is found. (Quit after 100 iterations to avoid a infinite loop). */ int ghmm_scluster_avoid_empty_smodel (ghmm_cseq * sqd, scluster_t * cl){ #define CUR_PROC "ghmm_scluster_avoid_empty_smodel" int i, best_smo; long i_old, j = 0; char error = 1, change = 0; int iter = 0; double log_p_minus, log_p_plus; double *result = NULL; /* If only one Model or one sequencet: Nothing to be done */ if (sqd->seq_number < 2 || cl->smo_number < 2) return (0); if (CLASSIFY == 1) ARRAY_CALLOC (result, cl->smo_number); while (error && iter < 100) { iter++; error = 0; for (i = 0; i < cl->smo_number; i++) { if (cl->seq_counter[i] <= 1) { change = 1; /* Choose a random sequence for an empty model */ j = m_int (GHMM_RNG_UNIFORM (RNG) * (sqd->seq_number - 1)); /* If it's not possible to produce a sequence for an empty model: Quit */ if (ghmm_cmodel_logp (cl->smo[i], sqd->seq[j], sqd->seq_len[j], &log_p_plus) == -1) continue; if (CLASSIFY == 1) { best_smo = ghmm_smap_bayes (cl->smo, result, cl->smo_number, sqd->seq[j], sqd->seq_len[j]); if (best_smo == -1) continue; } i_old = sqd->seq_label[j]; /* updates */ printf ("!!\"avoid_empty_model\": move seq. %ld: %ld --> %d !!\n", j, i_old, i); cl->seq_counter[i_old]--; cl->seq_counter[i]++; sqd->seq_label[j] = i; /* smo_Z_MD update */ if (ghmm_cmodel_logp (cl->smo[i_old], sqd->seq[j], sqd->seq_len[j], &log_p_minus) == -1) { GHMM_LOG(LCONVERTED, "ghmm_cmodel_logp returns -1!\n"); goto STOP; }; cl->smo_Z_MD[i_old] -= sqd->seq_w[j] * log_p_minus; cl->smo_Z_MD[i] += sqd->seq_w[j] * log_p_plus; /* smo_Z_MAW update */ if (CLASSIFY == 1) { cl->smo_Z_MAW[i_old] -= sqd->seq_w[j] * result[i_old]; cl->smo_Z_MAW[i] += sqd->seq_w[j] * result[i]; } } } /* Is now everything OK ? */ if (change) { for (i = 0; i < cl->smo_number; i++) { if (cl->seq_counter[i] <= 1) { error = 1; break; } } } } /* while */ if (!error) return (0);STOP: if (result){ m_free (result); } return (-1); #undef CUR_PROC} /* ghmm_scluster_avoid_empty_smodel *//*============================================================================*/ long ghmm_scluster_update_label (long *oldlabel, long *seq_label, long seq_number, long *smo_changed){ long i, changes = 0; for (i = 0; i < seq_number; i++) if (oldlabel[i] != seq_label[i]) { changes++; smo_changed[oldlabel[i]] = smo_changed[seq_label[i]] = 1; oldlabel[i] = seq_label[i]; } return changes;} /* ghmm_scluster_update_label *//*============================================================================*/ /* Determines from an already calculated probability matrix, which model fits best to the sequence with the ID seq_id. */ int ghmm_scluster_best_model (scluster_t * cl, long seq_id, double **all_log_p, double *log_p){ #define CUR_PROC "ghmm_scluster_best_model" int i, smo_id; double save = -DBL_MAX; *log_p = -DBL_MAX; smo_id = -1; /* MD-Classify: argmax_i (log(p(O | \lambda_i ))) */ if (CLASSIFY == 0) { for (i = 0; i < cl->smo_number; i++) { if (all_log_p[i][seq_id] == GHMM_PENALTY_LOGP) continue; /* model and seq. don't fit */ if (*log_p < all_log_p[i][seq_id]) { *log_p = all_log_p[i][seq_id]; smo_id = i; } } } /* MAW-Classify: argmax_i (log(p(O | \lambda_i )) + log(p( \lambda_i))) */ else { for (i = 0; i < cl->smo_number; i++) { if (cl->smo[i]->prior == -1) { GHMM_LOG(LCONVERTED, "Error! Need model prior for MAW-classification\n"); } if (cl->smo[i]->prior != 0) { if (all_log_p[i][seq_id] == GHMM_PENALTY_LOGP) continue; /* model and seq. don't fit */ if (save < all_log_p[i][seq_id] + log (cl->smo[i]->prior)) { save = all_log_p[i][seq_id] + log (cl->smo[i]->prior); *log_p = all_log_p[i][seq_id]; smo_id = i; } } } } return (smo_id); #undef CUR_PROC} /* ghmm_scluster_best_model *//*============================================================================*/ void ghmm_scluster_prob (ghmm_cmodel_baum_welch_context * cs){ int i; /*printf("seq_num = %d\n", cs->sqd->seq_number);*/ for (i = 0; i < cs->sqd->seq_number; i++) if (ghmm_cmodel_logp (cs->smo, cs->sqd->seq[i], cs->sqd->seq_len[i], &(cs->logp[i])) == -1) cs->logp[i] = (double) GHMM_PENALTY_LOGP; /* Penalty costs */} /* ghmm_scluster_prob */ /*============================================================================*/ int ghmm_scluster_random_labels (ghmm_cseq * sqd, int smo_number){ #define CUR_PROC "ghmm_scluster_random_labels" int j, label; for (j = 0; j < sqd->seq_number; j++) { label = m_int (GHMM_RNG_UNIFORM (RNG) * (smo_number - 1)); sqd->seq_label[j] = label; } return (0); # undef CUR_PROC} /* ghmm_scluster_random_labels *//*============================================================================*/ int ghmm_scluster_log_aposteriori (scluster_t * cl, ghmm_cseq * sqd, int seq_id, double *log_apo){ #define CUR_PROC "ghmm_scluster_log_aposteriori" double *result = NULL; int best_smo = -1; ARRAY_CALLOC (result, cl->smo_number); best_smo = ghmm_smap_bayes (cl->smo, result, cl->smo_number, sqd->seq[seq_id], sqd->seq_len[seq_id]); if (best_smo == -1) { GHMM_LOG(LCONVERTED, "best_smo == -1 !\n"); goto STOP; } else *log_apo = result[best_smo]; /* TEST */ /* printf("result: "); for (i = 0; i< cl->smo_number; i++) printf("%.2f ", result[i]); printf("\n"); */ STOP:m_free (result); return best_smo; # undef CUR_PROC} /* ghmm_scluster_log_aposteriori *//*============================================================================*/ void ghmm_scluster_print_header (FILE * file, char *argv[]){ time_t zeit; time (&zeit); fprintf (file, "# Date: %s# scluster:\n", ctime ((const time_t *) &zeit)); fprintf (file, "# Sequence File: %s\n", argv[1]); fprintf (file, "# Model File: %s\n", argv[2]); fprintf (file, "# Start Partion Label: "); switch (atoi (argv[4])) { case 0: fprintf (file, "SP_BEST (best model)\n\n"); break; case 1: fprintf (file, "NO_SP (no start partition)\n\n"); break; case 2: fprintf (file, "SP_KM (partition from k-means)\n\n"); break; case 3: fprintf (file, "SP_ZUF (random start partition)\n\n"); break; default: fprintf (file, "???\n\n"); }}#if POUT == 0#undef THREADS#endif /* HAVE_LIBPTHREAD */#undef POUT#endif /* GHMM_OBSOLETE */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?