📄 set_profile_hmm_parameters.c
字号:
} else if(last_sign == INSERT_ACID) { cur_sign = INSERT_ACID; } else if(last_sign == GAP) { cur_sign = GAP; } is_gap = YES; cur_msa_column++; } } else if(cur == ';') { } else /* ACID */{ last_sign = cur_sign; if(cur_msa_column == cur_mtx_col_nr) /* match */{ if(last_sign == ACID || last_sign == NEW) { cur_sign = ACID; *(transitions + get_mtx_index(1,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == GAP) { cur_sign = ACID; *(transitions + get_mtx_index(5,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == INSERT_ACID) { cur_sign = ACID; *(transitions + get_mtx_index(7,cur_mtx_column,nr_mtx_columns)) += 1; } if(is_gap == YES) { is_gap = NO; cur_insert_column++; } cur_msa_column++; cur_mtx_column++; } else if(cur_msa_column < cur_mtx_col_nr) /* insert */{ if(last_sign == ACID || last_sign == NEW) { cur_sign = INSERT_ACID; *(transitions + get_mtx_index(3,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == INSERT_ACID) { cur_sign = INSERT_ACID; *(transitions + get_mtx_index(6,cur_mtx_column,nr_mtx_columns)) += 1; } else if(last_sign == GAP) { cur_sign = INSERT_ACID; } is_gap = YES; cur_msa_column++; } } } } //printf(check 6\n"); /* check that msa length and nr columns match */ if((nr_mtx_columns - 1)*3 + 4 != hmm.nr_v) { printf("Warning: length of hmm is not equal to length of profile\n"); } else { /* start-transitions */ s_tot = *(transitions + get_mtx_index(1,0,nr_mtx_columns)) + *(transitions + get_mtx_index(2,0,nr_mtx_columns)); sd = *(transitions + get_mtx_index(2,0,nr_mtx_columns)); sm = *(transitions + get_mtx_index(1,0,nr_mtx_columns)); *(hmm.transitions + get_mtx_index(0, 3, hmm.nr_v)) = SI_PROB; *(hmm.transitions + get_mtx_index(0, 1, hmm.nr_v)) = (double)sd / (double)s_tot *(1.0 - SI_PROB); *(hmm.transitions + get_mtx_index(0, 2, hmm.nr_v)) = (double)sm / (double)s_tot *(1.0 - SI_PROB); /* for all dd-trans and dm-trans, calculate values */ for(i = 1; i < nr_mtx_columns - 1; i++) { d_tot = *(transitions + get_mtx_index(4,i,nr_mtx_columns)) + *(transitions + get_mtx_index(5,i,nr_mtx_columns)); dd = *(transitions + get_mtx_index(4,i,nr_mtx_columns)); dm = *(transitions + get_mtx_index(5,i,nr_mtx_columns)); *(hmm.transitions + get_mtx_index(3 * (i-1) + 1, 3 * i + 1, hmm.nr_v)) = (double)dd / (double)d_tot; *(hmm.transitions + get_mtx_index(3 * (i-1) + 1, 3 * i + 2, hmm.nr_v)) = (double)dm / (double)d_tot; } /* for all mm-trans, calculate values */ /* for all md-trans, calculate values */ /* for all mi-trans, calculate values */ for(i = 1; i < nr_mtx_columns - 1; i++) { m_tot = *(transitions + get_mtx_index(1,i,nr_mtx_columns)) + *(transitions + get_mtx_index(2,i,nr_mtx_columns)) + *(transitions + get_mtx_index(3,i,nr_mtx_columns)); mm = *(transitions + get_mtx_index(1,i,nr_mtx_columns)); md = *(transitions + get_mtx_index(2,i,nr_mtx_columns)); mi = *(transitions + get_mtx_index(3,i,nr_mtx_columns)); *(hmm.transitions + get_mtx_index(3 * (i-1) + 2, 3 * i + 2, hmm.nr_v)) = (double)mm / (double)m_tot; *(hmm.transitions + get_mtx_index(3 * (i-1) + 2, 3 * i + 1, hmm.nr_v)) = (double)md / (double)m_tot; *(hmm.transitions + get_mtx_index(3 * (i-1) + 2, 3 * i + 3, hmm.nr_v)) = (double)mi / (double)m_tot; } *(hmm.transitions + get_mtx_index((nr_mtx_columns-2)*3 + 2, (nr_mtx_columns-1)*3 + 2, hmm.nr_v)) = 1.0 - EI_PROB;; *(hmm.transitions + get_mtx_index((nr_mtx_columns-2)*3 + 2, (nr_mtx_columns-1)*3 + 1, hmm.nr_v)) = EI_PROB; /* for all ii-trans, calculate values */ /* for all im-trans, calculate values */ for(i = 1; i < nr_mtx_columns; i++) { i_tot = *(transitions + get_mtx_index(6,i,nr_mtx_columns)) + *(transitions + get_mtx_index(7,i,nr_mtx_columns)); ii = *(transitions + get_mtx_index(6,i,nr_mtx_columns)); im = *(transitions + get_mtx_index(7,i,nr_mtx_columns)); *(hmm.transitions + get_mtx_index(3 * (i-1) + 3, 3 * i - 1 , hmm.nr_v)) = (double)ii / (double)i_tot; *(hmm.transitions + get_mtx_index(3 * (i-1) + 3, 3 * (i-1) + 3, hmm.nr_v)) = (double)im / (double)i_tot; } transprob_ii = exp(log(0.5) / INSERT_SIZE); *(hmm.transitions + get_mtx_index(3, 2, hmm.nr_v)) = 1.0 - transprob_ii; *(hmm.transitions + get_mtx_index(3, 3, hmm.nr_v)) = transprob_ii; *(hmm.transitions + get_mtx_index(nr_mtx_columns*3 - 2, nr_mtx_columns*3 - 1, hmm.nr_v)) = 1.0 - transprob_ii; *(hmm.transitions + get_mtx_index(nr_mtx_columns*3 - 2, nr_mtx_columns*3 - 2, hmm.nr_v)) = transprob_ii; } /* recalculate insert-emissions for columns where alignment give a hint to the distribution */ if(use_insert_matrix == YES) { rewind(seq_infile); while((fgets(row, tot_nr_chars, seq_infile)) != NULL) { i = 0; j = 0; while(1) { if(row[2*i+1] == '>') { break; } if(i >= (*(transitions + j)) - 1) { j++; i++; continue; } else if(row[2*i+1] == ' ' || row[2*i+1] == '_' || row[2*i+1] == '.' || row[2*i+1] == '-') { i++; } else { if((a_index = get_alphabet_index_single(hmm.alphabet, row[2*i+1], hmm.a_size)) < 0) { if((a_index = get_replacement_letter_index_single(&(row[2*i+1]), &replacement_letters)) < 0) { i++; } else { printf("letter '%c' is not in alphabet\n",row[2*i+1]); exit(0); } } else { *(insert_emissions + get_mtx_index(j, a_index, hmm.a_size+1)) += 1.0; *(insert_emissions + get_mtx_index(j, hmm.a_size, hmm.a_size+1)) += 1.0; i++; } } } } for(i = 1; i < nr_mtx_columns; i++) { priorize_shares(i, insert_emissions, prior_scaler); for(j = 0; j < hmm.a_size; j++) { *(hmm.emissions + get_mtx_index(i*3,j,hmm.a_size)) = *(insert_emissions + get_mtx_index(i-1,j,hmm.a_size+1)); } } } /* cleanup and return */ free(transitions); free(insert_emissions); free(row); return;}void add_replacement_letter(char cur){}int read_priorfile(FILE *priorfile){ int j,k; double q_value, alpha_value, alpha_sum, logbeta; char s[2048]; char ps[2048]; char *file_name; char *pri; rewind(priorfile); /* put default name */ strcpy(em_di.name, "default"); /* put nr of components in struct */ if(fgets(ps, 2048, priorfile) != NULL) { } else { return -1; } while(*ps == '#') { if(fgets(ps, 2048, priorfile) != NULL) { } else { return -1; } } em_di.nr_components = atoi(&ps[0]); /* read 2 empty lines */ if(fgets(ps, 2048, priorfile) != NULL) { } if(fgets(ps, 2048, priorfile) != NULL) { } /* allocate memory for arrays and matrix to this prior struct */ em_di.q_values = malloc_or_die(em_di.nr_components * sizeof(double)); em_di.alpha_sums = malloc_or_die(em_di.nr_components * sizeof(double)); em_di.logbeta_values = malloc_or_die(em_di.nr_components * sizeof(double)); em_di.prior_values = malloc_or_die(em_di.nr_components * hmm.a_size * sizeof(double)); for(j = 0; j < em_di.nr_components; j++) { /* put q-value in array */ if(fgets(ps, 2048, priorfile) != NULL) { q_value = atof(&ps[0]); *(em_di.q_values + j) = q_value; #ifdef DEBUG_PRI printf("q_value = %f\n", *(em_di.q_values + j));#endif } /* put alpha-values of this component in matrix */ alpha_sum = 0.0; k = 0; if(fgets(ps, 2048, priorfile) != NULL) { pri = &ps[0]; for(k = 0; k < hmm.a_size; k++) { alpha_value = strtod(pri, &pri); alpha_sum += alpha_value; *((em_di.prior_values) + get_mtx_index(j, k, hmm.a_size)) = alpha_value; } } /* put sum of alphavalues in array */ *((em_di.alpha_sums) + j) = alpha_sum; /* calculate logB(alpha) for this compoment, store in array*/ logbeta = 0; for(k = 0; k < hmm.a_size; k++) { logbeta += lgamma(*(em_di.prior_values + get_mtx_index(j, k, hmm.a_size))); #ifdef DEBUG_PRI printf("prior_value = %f\n", *((em_di.prior_values) + get_mtx_index(j, k, hmm.a_size))); printf("lgamma_value = %f\n", lgamma(*((em_di.prior_values) + get_mtx_index(j, k, hmm.a_size))));#endif } logbeta = logbeta - lgamma(*(em_di.alpha_sums + j)); *(em_di.logbeta_values + j) = logbeta; /* read 3 empty lines */ if(fgets(ps, 2048, priorfile) != NULL) { } if(fgets(ps, 2048, priorfile) != NULL) { } if(fgets(ps, 2048, priorfile) != NULL) { } } #ifdef DEBUG_PRI dump_prior_struct(em_di); #endif}int priorize_shares(int pos, double *insert_emissions, int prior_scaler){ int nr_components, comps, a_index; double occ_sums; double q_value, scaling_factor, X_sum, *X_values, ed_res1, *logbeta_an_values; double exponent, prior_prob, tot_prior_prob; /************the update part ********************/ nr_components = em_di.nr_components; logbeta_an_values = malloc_or_die(nr_components * sizeof(double)); scaling_factor = -FLT_MAX; X_sum = 0.0; X_values = malloc_or_die(hmm.a_size * sizeof(double)); /* calculate logB(alpha + n) for all components + * calculate scaling factor for logB(alpha + n) - logB(alpha) */ for(comps = 0; comps < nr_components; comps++) { ed_res1 = 0; occ_sums = 0; for(a_index = 0; a_index < hmm.a_size; a_index++) { ed_res1 += lgamma(*(em_di.prior_values + get_mtx_index(comps, a_index, hmm.a_size)) + *(insert_emissions + get_mtx_index(pos-1,a_index,hmm.a_size+1))); occ_sums += *(insert_emissions + get_mtx_index(pos-1,a_index,hmm.a_size+1)); } ed_res1 = ed_res1 - lgamma(*(em_di.alpha_sums + comps) + (double)(occ_sums)); *(logbeta_an_values + comps) = ed_res1; if((ed_res1 = ed_res1 - *(em_di.logbeta_values + comps)) > scaling_factor) { scaling_factor = ed_res1; } } /* calculate all the Xi's */ for(a_index = 0; a_index < hmm.a_size; a_index++) { *(X_values + a_index) = 0; for(comps = 0; comps < nr_components; comps++) { q_value = *(em_di.q_values + comps); exponent = (*(logbeta_an_values + comps) - *(em_di.logbeta_values + comps) - scaling_factor); prior_prob = (*(em_di.prior_values + get_mtx_index(comps,a_index, hmm.a_size)) * (double)(prior_scaler) + *(insert_emissions + get_mtx_index(pos-1,a_index,hmm.a_size+1))); tot_prior_prob = (*(em_di.alpha_sums + comps) + (double)(occ_sums)); *(X_values + a_index) += q_value * exp(exponent) * prior_prob / tot_prior_prob;#ifdef DEBUG_PRI printf("\nscaling factor = %f\n", scaling_factor); printf("a_index = %d\n", a_index); printf("q_value = %f\n", q_value); printf("exponent = %f\n", exponent); printf("prior_prob = %f\n", prior_prob); printf("tot_prior_prob = %f\n\n", tot_prior_prob);#endif } X_sum += *(X_values + a_index); } /* update share values */ for(a_index = 0; a_index < hmm.a_size; a_index++) { ed_res1 = *(X_values + a_index) / X_sum; if(ed_res1 != 0.0) { *(insert_emissions + get_mtx_index(pos-1,a_index,hmm.a_size+1)) = ed_res1; } else { *(insert_emissions + get_mtx_index(pos-1,a_index,hmm.a_size+1)) = ed_res1; } } /* cleanup */ free(logbeta_an_values); free(X_values);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -