📄 smixturehmm.c
字号:
} /* for (smo_number) */ if (smofile) fclose (smofile); exitcode = 0; /*------------------------------------------------------------------------*/STOP: /* Label STOP from ARRAY_[CM]ALLOC */ ighmm_mes (MES_WIN, "\n(%2.2T): Program finished with exitcode %d.\n", exitcode); ighmm_mes_exit (); return (exitcode);# undef CUR_PROC} /* main */#endif /* nomain *//*============================================================================*/ /* Original version, without attempt to avoid lokal traps */int ghmm_smixturehmm_cluster (FILE * outfile, double **cp, ghmm_cseq * sqd, ghmm_cmodel ** smo, int smo_number){#define CUR_PROC "ghmm_smixturehmm_cluster" int i, k, iter = 0; double likelihood, old_likelihood, delta_likelihood = 1000000.0; double total_train_w = 0.0; char *str; double *save_w; double model_weight, log_p, sum = 0.0; ghmm_cmodel_baum_welch_context *smo_sqd; /* this structure is used by sreestimate() */ ARRAY_CALLOC (smo_sqd, 1); /* smo_sqd->max_iter = MAX_ITER_BW; */ smo_sqd->max_iter = 10; smo_sqd->eps = GHMM_EPS_ITER_BW; smo_sqd->logp = &log_p; smo_sqd->sqd = sqd; ARRAY_CALLOC (save_w, sqd->seq_number); for (i = 0; i < sqd->seq_number; i++) { save_w[i] = sqd->seq_w[i]; total_train_w += save_w[i]; } for (k = 0; k < smo_number; k++) { sum = 0.0; for (i = 0; i < sqd->seq_number; i++) sum += cp[i][k] * sqd->seq_w[i]; smo[k]->prior = sum / total_train_w; } ghmm_cseq_mix_like (smo, smo_number, sqd, &old_likelihood); printf ("Initial Likelihood %.4f\n", old_likelihood); fprintf (outfile, "Initial Likelihood %.4f\n", old_likelihood); while (((-1) * delta_likelihood / old_likelihood) > 0.001 && iter < 75) { iter++; for (k = 0; k < smo_number; k++) { printf ("Model %d\n", k); smo_sqd->smo = smo[k]; model_weight = 0.0; for (i = 0; i < sqd->seq_number; i++) { /* combine seq_w with appropriate comp. prob. */ sqd->seq_w[i] = save_w[i] * cp[i][k]; model_weight += sqd->seq_w[i]; } if (ghmm_cmodel_baum_welch (smo_sqd) == -1) { str = ighmm_mprintf (NULL, 0, "Error iteration %d, model %d\n", iter, k); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } smo[k]->prior = model_weight / total_train_w; } /* for (k ...) */ /* reset weigths for smixturehmm_like */ for (i = 0; i < sqd->seq_number; i++) sqd->seq_w[i] = save_w[i]; ghmm_cseq_mix_like (smo, smo_number, sqd, &likelihood); if (ghmm_smixturehmm_calc_cp (cp, sqd, smo, smo_number, &total_train_w) == -1) { str = ighmm_mprintf (NULL, 0, "Error iteration %d\n", iter); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } printf ("Iter %d, likelihood: %.4f\n", iter, likelihood); fprintf (outfile, "BIter %d, likelihood: %.4f\n", iter, likelihood); delta_likelihood = likelihood - old_likelihood; old_likelihood = likelihood; } /* while delta_likelihood */ m_free (smo_sqd); m_free (save_w); return 0;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ m_free (smo_sqd); m_free (save_w); return -1;#undef CUR_PROC} /* ghmm_smixturehmm_cluster *//*============================================================================*//* Initial component probs. (cp) for each sequence; dimension cp: cp[sqd->seq_number] [smo_number] and (depending on cp): model priors for all models */int ghmm_smixturehmm_init (double **cp, ghmm_cseq * sqd, ghmm_cmodel ** smo, int smo_number, int mode){#define CUR_PROC "ghmm_smixturehmm_init" int i, j; double p; double *result; char *str; int bm; for (i = 0; i < sqd->seq_number; i++) for (j = 0; j < smo_number; j++) cp[i][j] = 0.0; if (mode < 1 || mode > 5) { GHMM_LOG(LCONVERTED, "Error: initial mode out of range\n"); goto STOP; } /* 1. strict random partition; cp = 0/1 */ if (mode == 1) { for (i = 0; i < sqd->seq_number; i++) { p = GHMM_RNG_UNIFORM (RNG); j = (int) floor (smo_number * p); /* ??? */ if (j < 0 || j >= smo_number) { GHMM_LOG(LCONVERTED, "Error: initial model out of range\n"); goto STOP; } cp[i][j] = 1.0; } } /* 2. ghmm_smap_bayes from initial models */ else if (mode == 2) { for (i = 0; i < sqd->seq_number; i++) if (ghmm_smap_bayes (smo, cp[i], smo_number, sqd->seq[i], sqd->seq_len[i]) == -1) { str = ighmm_mprintf (NULL, 0, "Can't determine comp. prob for seq ID %.0f \n", sqd->seq_id[i]); GHMM_LOG(LCONVERTED, str); m_free (str); /* goto STOP; */ } } /* another possibility: make partition with best model from smap_bayes... */ /* 3. cp = 1 for best model, cp = 0 for other models */ else if (mode == 3) { ARRAY_CALLOC (result, smo_number); for (i = 0; i < sqd->seq_number; i++) { bm = ghmm_smap_bayes (smo, result, smo_number, sqd->seq[i], sqd->seq_len[i]); if (bm == -1) { str = ighmm_mprintf (NULL, 0, "Can't determine comp. prob for seq ID %.0f \n", sqd->seq_id[i]); GHMM_LOG(LCONVERTED, str); m_free (str); /* goto STOP; */ } cp[i][bm] = 1.0; } m_free (result); } /* mode == 4 used to be kmeans labels */ /* 5. no start partition == equal cp for each model */ else if (mode == 5) { for (i = 0; i < sqd->seq_number; i++) for (j = 0; j < smo_number; j++) cp[i][j] = 1 / (double) smo_number; } else { printf ("Unknown Init Mode %d \n", mode); return -1; } return 0;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ return -1;#undef CUR_PROC} /* smixturehmm_compprob_init *//*============================================================================*//* currently not activated */# if 0int smixturehmm_calc_priors (double **cp, ghmm_cseq * sqd, ghmm_cmodel ** smo, int smo_number){#define CUR_PROC "smixturehmm_calc_priors" int i, k; double sum; if (total_train_w == 0) { GHMM_LOG(LCONVERTED, "total_train_w == 0!\n"); goto STOP; } for (k = 0; k < smo_number; k++) { sum = 0.0; for (i = 0; i < sqd->seq_number; i++) sum += cp[i][k] * sqd->seq_w[i]; smo[k]->prior = sum / total_train_w; printf ("\nPriors[%d] is %.6f\n", k, smo[k]->prior); } return 0;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ return -1;#undef CUR_PROC} /* smixturehmm_calc_priors */#endif/*============================================================================*//* calculate component probs for all sequences and all components *//* also recalculate total_train_w; if a seq has cp = 0 don't add its weigth to total_train_w, otherwise errors in calculating model priors occur*/int ghmm_smixturehmm_calc_cp (double **cp, ghmm_cseq * sqd, ghmm_cmodel ** smo, int smo_number, double *total_train_w){#define CUR_PROC "ghmm_smixturehmm_calc_cp" int i; char *str; double errorseqs = 0.0; *total_train_w = 0.0; for (i = 0; i < sqd->seq_number; i++) if (ghmm_smap_bayes (smo, cp[i], smo_number, sqd->seq[i], sqd->seq_len[i]) == -1) { /* all cp[i] [ . ] are set to zero; seq. will be ignored for reestimation!!! */ str = ighmm_mprintf (NULL, 0, "Warning[%d]: Can't determine comp. prob for seq ID %.0f\n", i, sqd->seq_id[i]); GHMM_LOG(LCONVERTED, str); m_free (str); errorseqs++; if (errorseqs > 0.1 * (double) sqd->seq_number) { printf ("errorseqs %.1f, max false %.1f\n", errorseqs, 0.1 * (double) sqd->seq_number); GHMM_LOG(LCONVERTED, "max. no of errors from ghmm_smap_bayes exceeded\n"); goto STOP; } } else *total_train_w += sqd->seq_w[i]; return 0;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ return -1;#undef CUR_PROC} /* ghmm_smixturehmm_calc_cp *//*============================================================================*//* Is currently not used. Use this function in calc_cp and smixturehmm_like later on (saves half of the ghmm_cmodel_logp() calls). Danger: Is it neccessary to take seq_w into account? */void ghmm_smixture_calc_logp (double **logp, int **error, ghmm_cseq * sqd, ghmm_cmodel ** smo, int smo_number){ int i, k; for (i = 0; i < sqd->seq_number; i++) for (k = 0; k < smo_number; k++) { if (ghmm_cmodel_logp (smo[k], sqd->seq[i], sqd->seq_len[i], &(logp[i][k])) == -1) error[i][k] = 1; else error[i][k] = 0; }}/*============================================================================*//* flag == 1 --> Header for .like-file, else --> other file */void ghmm_smixturehmm_print_header (FILE * file, char *argv[], int flag){ time_t zeit; int mode = atoi (argv[9]); time (&zeit); fprintf (file, "\n************************************************************************\n"); fprintf (file, "Date: %ssmixturehmm:\n", ctime ((const time_t *) &zeit)); fprintf (file, "Seq. File\t%s\nInit-model File\t%s\nInit-Mode\t", argv[1], argv[2]); switch (mode) { case 1: fprintf (file, "%d SP_ZUF (random start partition)\n", mode); break; case 2: fprintf (file, "%d SP_VERT (distr. accord smap_bayes)\n", mode); break; case 3: fprintf (file, "%d SP_BEST (best model)\n", mode); break; case 4: fprintf (file, "%d SP_KM (partition from k-means)\n", mode); break; case 5: fprintf (file, "%d NO_SP (no start partition)\n", mode); break; default: fprintf (file, "???\n"); break; } fprintf (file, "Train Ratio\t %.4f\n\n", atof (argv[7])); if (flag == 1) /* only in .like-File */ fprintf (file, "smo no.\tCV Iter\t SeqTrain\tSeqTest\tLikeTrain\tLikeTest\tavrgTrain\tavrgTest\tErrorTrain\tErrorTest\tBIC\n");}/*============================================================================*//* calculates for each model the average likelihood per seq. weight: avg_like[j] = sum_i (cp[i][j] * seq_w[i] * log_p(i | j) ) / sum_i (cp[i][j] * seq_w[i]). Usefull for identifying models with poor likelihood*/double *ghmm_smixturehmm_avg_like (double **cp, ghmm_cseq * sqd, ghmm_cmodel ** smo, int smo_number){#define CUR_PROC "ghmm_smixturehmm_avg_like" double *avg_like = NULL; int i, k; double num = 0.0, denom = 0.0, log_p = 0.0; ARRAY_CALLOC (avg_like, smo_number); for (k = 0; k < smo_number; k++) { num = denom = 0.0; for (i = 0; i < sqd->seq_number; i++) { if (ghmm_cmodel_logp (smo[k], sqd->seq[i], sqd->seq_len[i], &log_p) != -1) { num += cp[i][k] * sqd->seq_w[i] * log_p; denom += cp[i][k] * sqd->seq_w[i]; } } if (denom > 0) avg_like[k] = num / denom; else avg_like[k] = -1; } /* for models k ... */ return avg_like;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ return NULL;#undef CUR_PROC} /* ghmm_smixturehmm_avg_like */#endif /* GHMM_OBSOLETE */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -