📄 smixturehmm.c
字号:
smodel **smo, int smo_number) {#define CUR_PROC "smixturehmm_cluster" int i, k, iter = 0; double likelihood, old_likelihood, delta_likelihood = 1000000.0; double total_train_w = 0.0; char *str; double *save_w; double model_weight, log_p, sum = 0.0; smosqd_t *smo_sqd; /* this structure is used by sreestimate() */ if(!m_calloc(smo_sqd, 1)) {mes_proc(); goto STOP;} /* smo_sqd->max_iter = MAX_ITER_BW; */ smo_sqd->max_iter = 10; smo_sqd->eps = EPS_ITER_BW; smo_sqd->logp = &log_p; smo_sqd->sqd = sqd; if(!m_calloc(save_w, sqd->seq_number)) {mes_proc(); goto STOP;} for (i = 0; i < sqd->seq_number; i++) { save_w[i] = sqd->seq_w[i]; total_train_w += save_w[i]; } for (k = 0; k < smo_number; k++) { sum = 0.0; for (i = 0; i < sqd->seq_number; i++) sum += cp[i] [k] * sqd->seq_w[i]; smo[k]->prior = sum / total_train_w; } sequence_d_mix_like(smo, smo_number, sqd, &old_likelihood); printf("Initial Likelihood %.4f\n", old_likelihood); fprintf(outfile, "Initial Likelihood %.4f\n", old_likelihood); while (((-1)*delta_likelihood/old_likelihood) > 0.001 && iter < 75) { iter++; for (k = 0; k < smo_number; k++) { printf("Model %d\n", k); smo_sqd->smo = smo[k]; model_weight = 0.0; for (i = 0; i < sqd->seq_number; i++) { /* combine seq_w with appropriate comp. prob. */ sqd->seq_w[i] = save_w[i] * cp[i][k]; model_weight += sqd->seq_w[i]; } if (sreestimate_baum_welch(smo_sqd) == -1) { str = mprintf(NULL, 0, "Error iteration %d, model %d\n", iter, k); mes_prot(str); m_free(str); goto STOP; } smo[k]->prior = model_weight / total_train_w; } /* for (k ...) */ /* reset weigths for smixturehmm_like */ for (i = 0; i < sqd->seq_number; i++) sqd->seq_w[i] = save_w[i]; sequence_d_mix_like(smo, smo_number, sqd, &likelihood); if (smixturehmm_calc_cp(cp, sqd, smo, smo_number, &total_train_w) == -1) { str = mprintf(NULL, 0, "Error iteration %d\n", iter); mes_prot(str); m_free(str); goto STOP; } printf("Iter %d, likelihood: %.4f\n", iter, likelihood); fprintf(outfile, "BIter %d, likelihood: %.4f\n", iter, likelihood); delta_likelihood = likelihood - old_likelihood; old_likelihood = likelihood; } /* while delta_likelihood */ m_free(smo_sqd); m_free(save_w); return 0; STOP: m_free(smo_sqd); m_free(save_w); return -1;#undef CUR_PROC} /* smixturehmm_cluster *//*============================================================================*//* Initial component probs. (cp) for each sequence; dimension cp: cp[sqd->seq_number] [smo_number] and (depending on cp): model priors for all models */int smixturehmm_init(double **cp, sequence_d_t *sqd, smodel **smo, int smo_number, int mode) {#define CUR_PROC "smixturehmm_init" int i, j; double p; double *result; char *str; int bm; for (i = 0; i < sqd->seq_number; i++) for (j = 0; j < smo_number; j++) cp[i][j] = 0.0; if (mode < 1 || mode > 5) { mes_prot("Error: initial mode out of range\n"); goto STOP; } /* 1. strict random partition; cp = 0/1 */ if (mode == 1) { for (i = 0; i < sqd->seq_number; i++) { p = gsl_rng_uniform(RNG); j = (int) floor(smo_number * p); /* ??? */ if (j < 0 || j >= smo_number) { mes_prot("Error: initial model out of range\n"); goto STOP; } cp[i] [j] = 1.0; } } /* 2. smap_bayes from initial models */ else if (mode == 2) { for (i = 0; i < sqd->seq_number; i++) if (smap_bayes(smo, cp[i], smo_number, sqd->seq[i], sqd->seq_len[i]) == -1) { str = mprintf(NULL, 0, "Can't determine comp. prob for seq ID %.0f \n", sqd->seq_id[i]); mes_prot(str); m_free(str); /* goto STOP; */ } } /* another possibility: make partition with best model from smap_bayes... */ /* 3. cp = 1 for best model, cp = 0 for other models */ else if (mode == 3) { if(!m_calloc(result, smo_number)) {mes_proc(); goto STOP;} for (i = 0; i < sqd->seq_number; i++) { bm = smap_bayes(smo, result, smo_number, sqd->seq[i], sqd->seq_len[i]); if (bm == -1) { str = mprintf(NULL, 0, "Can't determine comp. prob for seq ID %.0f \n", sqd->seq_id[i]); mes_prot(str); m_free(str); /* goto STOP; */ } cp[i][bm] = 1.0; } m_free(result); } /* mode == 4 used to be kmeans labels */ /* 5. no start partition == equal cp for each model */ else if (mode == 5) { for (i = 0; i < sqd->seq_number; i++) for (j = 0; j < smo_number; j++) cp[i][j] = 1/(double)smo_number; } else { printf("Unknown Init Mode %d \n", mode); return -1; } return 0; STOP: return -1; #undef CUR_PROC} /* smixturehmm_compprob_init *//*============================================================================*//* currently not activated */# if 0int smixturehmm_calc_priors(double **cp, sequence_d_t *sqd, smodel **smo, int smo_number) {#define CUR_PROC "smixturehmm_calc_priors" int i, k; double sum; if (total_train_w == 0) { mes_prot("total_train_w == 0!\n"); goto STOP; } for (k = 0; k < smo_number; k++) { sum = 0.0; for (i = 0; i < sqd->seq_number; i++) sum += cp[i] [k] * sqd->seq_w[i]; smo[k]->prior = sum / total_train_w; printf("\nPriors[%d] is %.6f\n",k,smo[k]->prior); } return 0; STOP: return -1;#undef CUR_PROC} /* smixturehmm_calc_priors */#endif/*============================================================================*//* calculate component probs for all sequences and all components *//* also recalculate total_train_w; if a seq has cp = 0 don't add its weigth to total_train_w, otherwise errors in calculating model priors occur*/int smixturehmm_calc_cp(double **cp, sequence_d_t *sqd, smodel **smo, int smo_number, double *total_train_w) {#define CUR_PROC "smixturehmm_calc_cp" int i; char *str; double errorseqs = 0.0; *total_train_w = 0.0; for (i = 0; i < sqd->seq_number; i++) if (smap_bayes(smo, cp[i], smo_number, sqd->seq[i], sqd->seq_len[i]) == -1) { /* all cp[i] [ . ] are set to zero; seq. will be ignored for reestimation!!! */ str = mprintf(NULL, 0, "Warning[%d]: Can't determine comp. prob for seq ID %.0f\n", i ,sqd->seq_id[i]); mes_prot(str); m_free(str); errorseqs++; if (errorseqs > 0.1 * (double)sqd->seq_number) { printf("errorseqs %.1f, max false %.1f\n", errorseqs, 0.1 * (double)sqd->seq_number); mes_prot("max. no of errors from smap_bayes exceeded\n"); goto STOP; } } else *total_train_w += sqd->seq_w[i]; return 0; STOP: return -1;#undef CUR_PROC} /* smixturehmm_calc_cp *//*============================================================================*//* Is currently not used. Use this function in calc_cp and smixturehmm_like later on (saves half of the sfoba_logp() calls). Danger: Is it neccessary to take seq_w into account? */void smixture_calc_logp(double **logp, int **error, sequence_d_t *sqd, smodel **smo, int smo_number) { int i, k; for (i = 0; i < sqd->seq_number; i++) for (k = 0; k < smo_number; k++) { if (sfoba_logp(smo[k], sqd->seq[i], sqd->seq_len[i], &(logp[i][k])) == -1) error[i][k] = 1; else error[i][k] = 0; } }/*============================================================================*//* flag == 1 --> Header for .like-file, else --> other file */void smixturehmm_print_header(FILE *file, char *argv[], int flag) { time_t zeit; int mode = atoi(argv[9]); time(&zeit); fprintf(file, "\n************************************************************************\n"); fprintf(file, "Date: %ssmixturehmm:\n", ctime((const time_t *)&zeit)); fprintf(file, "Seq. File\t%s\nInit-model File\t%s\nInit-Mode\t", argv[1], argv[2]); switch (mode) { case 1: fprintf(file, "%d SP_ZUF (random start partition)\n", mode); break; case 2: fprintf(file, "%d SP_VERT (distr. accord smap_bayes)\n", mode); break; case 3: fprintf(file, "%d SP_BEST (best model)\n", mode); break; case 4: fprintf(file, "%d SP_KM (partition from k-means)\n", mode); break; case 5: fprintf(file, "%d NO_SP (no start partition)\n", mode); break; default: fprintf(file, "???\n"); break; } fprintf(file, "Train Ratio\t %.4f\n\n", atof(argv[7])); if (flag ==1) /* only in .like-File */ fprintf(file, "smo no.\tCV Iter\t SeqTrain\tSeqTest\tLikeTrain\tLikeTest\tavrgTrain\tavrgTest\tErrorTrain\tErrorTest\tBIC\n");}/*============================================================================*//* calculates for each model the average likelihood per seq. weight: avg_like[j] = sum_i (cp[i][j] * seq_w[i] * log_p(i | j) ) / sum_i (cp[i][j] * seq_w[i]). Usefull for identifying models with poor likelihood*/double *smixturehmm_avg_like(double **cp, sequence_d_t *sqd, smodel **smo, int smo_number) {#define CUR_PROC "smixturehmm_avg_like" double *avg_like = NULL; int i, k; double num = 0.0, denom = 0.0, log_p = 0.0; if (!m_calloc(avg_like, smo_number)) {mes_proc(); goto STOP;} for (k = 0; k < smo_number; k++) { num = denom = 0.0; for (i = 0; i < sqd->seq_number; i++) { if (sfoba_logp(smo[k], sqd->seq[i], sqd->seq_len[i], &log_p) != -1) { num += cp[i][k] * sqd->seq_w[i] * log_p; denom += cp[i][k] * sqd->seq_w[i]; } } if (denom > 0) avg_like[k] = num / denom; else avg_like[k] = -1; } /* for models k ...*/ return avg_like; STOP: return NULL;#undef CUR_PROC} /* smixturehmm_avg_like */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -