sequence.c
来自「General Hidden Markov Model Library 一个通用」· C语言 代码 · 共 1,678 行 · 第 1/4 页
C
1,678 行
sqd->seq_label[j] = sq->seq_label[j];#endif /* GHMM_OBSOLETE */ sqd->seq_id[j] = sq->seq_id[j]; sqd->seq_w[j] = sq->seq_w[j]; } sqd->seq_number = sq->seq_number; sqd->total_w = sq->total_w; return (sqd);STOP: /* Label STOP from ARRAY_[CM]ALLOC */ ghmm_cseq_free (&sqd); return NULL;#undef CUR_PROC} /* ghmm_cseq_create_from_dseq *//*============================================================================*/ghmm_dseq *ghmm_dseq_create_from_cseq (const ghmm_cseq * sqd){# define CUR_PROC "ghmm_dseq_create_from_cseq" int j, i; ghmm_dseq *sq = NULL; /* target seq. array */ if (!(sq = ghmm_dseq_calloc (sqd->seq_number))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } for (j = 0; j < sqd->seq_number; j++) { ARRAY_CALLOC (sq->seq[j], sqd->seq_len[j]); for (i = 0; i < sqd->seq_len[j]; i++) { sq->seq[j][i] = m_int (fabs (sqd->seq[j][i])); } sq->seq_len[j] = sqd->seq_len[j];#ifdef GHMM_OBSOLETE sq->seq_label[j] = sqd->seq_label[j];#endif /* GHMM_OBSOLETE */ sq->seq_id[j] = sqd->seq_id[j]; sq->seq_w[j] = sqd->seq_w[j]; } sq->seq_number = sqd->seq_number; sq->total_w = sqd->total_w; return (sq);STOP: /* Label STOP from ARRAY_[CM]ALLOC */ ghmm_dseq_free (&sq); return NULL;#undef CUR_PROC} /* ghmm_dseq_create_from_cseq *//*============================================================================*/int ghmm_dseq_max_len (const ghmm_dseq * sqd){ int i, max_len = 0; for (i = 0; i < sqd->seq_number; i++) if (max_len < sqd->seq_len[i]) max_len = sqd->seq_len[i]; return max_len;} /* ghmm_dseq_max_len *//*============================================================================*/int ghmm_cseq_max_len (const ghmm_cseq * sqd){ int i, max_len = 0; for (i = 0; i < sqd->seq_number; i++) if (max_len < sqd->seq_len[i]) max_len = sqd->seq_len[i]; return max_len;} /* ghmm_cseq_max_len *//*============================================================================*/ghmm_cseq *ghmm_cseq_mean (const ghmm_cseq * sqd){# define CUR_PROC "ghmm_cseq_mean" int i, j, max_len; ghmm_cseq *out_sqd = NULL; max_len = ghmm_cseq_max_len (sqd); if (!(out_sqd = ghmm_cseq_calloc (1))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } ARRAY_CALLOC (out_sqd->seq[0], max_len); out_sqd->seq_len[0] = max_len; for (i = 0; i < sqd->seq_number; i++) for (j = 0; j < sqd->seq_len[i]; j++) out_sqd->seq[0][j] += sqd->seq[i][j]; for (j = 0; j < max_len; j++) out_sqd->seq[0][j] /= sqd->seq_number; return out_sqd;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ ghmm_cseq_free (&out_sqd); return NULL;# undef CUR_PROC} /* ghmm_cseq_mean *//*============================================================================*/double **ghmm_cseq_scatter_matrix (const ghmm_cseq * sqd, int *dim){# define CUR_PROC "ghmm_cseq_scatter_matrix" int *count, k, l, i, j; double **W, *mean; *dim = ghmm_cseq_max_len (sqd); if (!(W = ighmm_cmatrix_alloc (*dim, *dim))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } /* Mean over all sequences. Individual counts for each dimension */ ARRAY_CALLOC (mean, *dim); ARRAY_CALLOC (count, *dim); for (i = 0; i < sqd->seq_number; i++) { for (l = 0; l < sqd->seq_len[i]; l++) { mean[l] += sqd->seq[i][l]; count[l]++; } } for (l = 0; l < *dim; l++) mean[l] /= count[l]; /* scatter matrix (upper triangle) */ for (j = 0; j < sqd->seq_number; j++) { for (k = 0; k < *dim; k++) { for (l = k; l < *dim; l++) { if (sqd->seq_len[j] > l) W[k][l] += (sqd->seq[j][k] - mean[k]) * (sqd->seq[j][l] - mean[l]); } } } /* norm with counts, set lower triangle */ for (k = 0; k < *dim; k++) { for (l = *dim - 1; l >= 0; l--) { if (l >= k) W[k][l] /= (double) count[l]; else W[k][l] = W[l][k]; } } return W;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ ighmm_cmatrix_free (&W, *dim); return NULL;# undef CUR_PROC} /* ghmm_cseq_scatter_matrix *//*============================================================================*//* dummy function at the moment */int ghmm_cseq_class (const double *O, int index, double *osum){#define CUR_PROC "ghmm_cseq_class" return 0;# undef CUR_PROC} /* ghmm_cseq_class *//*============================================================================*//* divide given field of seqs. randomly into two different fields. Also do allocating. train_ratio determines approx. the fraction of seqs. that go into the train_set and test_set resp.*/int ghmm_cseq_partition (ghmm_cseq * sqd, ghmm_cseq * sqd_train, ghmm_cseq * sqd_test, double train_ratio){#define CUR_PROC "ghmm_cseq_partition" double p; ghmm_cseq *sqd_dummy = NULL; int i; long total_seqs, cur_number; total_seqs = sqd->seq_number; if (total_seqs <= 0) { GHMM_LOG(LCONVERTED, "Error: number of seqs. less or equal zero\n"); goto STOP; } /* waste of memory but avoids to many reallocations */ sqd_dummy = sqd_train; for (i = 0; i < 2; i++) { ARRAY_CALLOC (sqd_dummy->seq, total_seqs); ARRAY_CALLOC (sqd_dummy->seq_len, total_seqs);#ifdef GHMM_OBSOLETE ARRAY_CALLOC (sqd_dummy->seq_label, total_seqs);#endif /* GHMM_OBSOLETE */ ARRAY_CALLOC (sqd_dummy->seq_id, total_seqs); ARRAY_CALLOC (sqd_dummy->seq_w, total_seqs); sqd_dummy->seq_number = 0; sqd_dummy = sqd_test; } for (i = 0; i < total_seqs; i++) { p = GHMM_RNG_UNIFORM (RNG); if (p <= train_ratio) sqd_dummy = sqd_train; else sqd_dummy = sqd_test; cur_number = sqd_dummy->seq_number; ARRAY_CALLOC (sqd_dummy->seq[cur_number], sqd->seq_len[i]); /* copy all entries */ ghmm_cseq_copy_all (sqd_dummy, cur_number, sqd, i); sqd_dummy->seq_number++; } /* reallocs */ sqd_dummy = sqd_train; for (i = 0; i < 2; i++) { ARRAY_REALLOC (sqd_dummy->seq, sqd_dummy->seq_number); ARRAY_REALLOC (sqd_dummy->seq_len, sqd_dummy->seq_number);#ifdef GHMM_OBSOLETE ARRAY_REALLOC (sqd_dummy->seq_label, sqd_dummy->seq_number);#endif /* GHMM_OBSOLETE */ ARRAY_REALLOC (sqd_dummy->seq_id, sqd_dummy->seq_number); ARRAY_REALLOC (sqd_dummy->seq_w, sqd_dummy->seq_number); sqd_dummy = sqd_test; } return 0;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ return -1;#undef CUR_PROC}/*============================================================================*/void ghmm_cseq_copy_all (ghmm_cseq * target, long t_num, ghmm_cseq * source, long s_num){ ghmm_cseq_copy (target->seq[t_num], source->seq[s_num], source->seq_len[s_num]); target->seq_len[t_num] = source->seq_len[s_num];#ifdef GHMM_OBSOLETE target->seq_label[t_num] = source->seq_label[s_num];#endif /* GHMM_OBSOLETE */ target->seq_id[t_num] = source->seq_id[s_num]; target->seq_w[t_num] = source->seq_w[s_num];}/*============================================================================*//* Likelihood function in a mixture model: sum_k w^k log( sum_c (alpha_c p(O^k | lambda_c)))*/int ghmm_cseq_mix_like (ghmm_cmodel ** smo, int smo_number, ghmm_cseq * sqd, double *like){#define CUR_PROC "ghmm_cseq_mix_like" int i, k, error_seqs = 0; double seq_like = 0.0, log_p; *like = 0.0; for (i = 0; i < sqd->seq_number; i++) { seq_like = 0.0; for (k = 0; k < smo_number; k++) { if (ghmm_cmodel_logp (smo[k], sqd->seq[i], sqd->seq_len[i], &log_p) != -1) { if (log_p > -100) seq_like += exp (log_p) * smo[k]->prior; } } /* no model fits */ if (seq_like == 0.0) { error_seqs++; *like += (GHMM_PENALTY_LOGP * sqd->seq_w[i]); } else *like += (log (seq_like) * sqd->seq_w[i]); } return error_seqs;#undef CUR_PROC} /* ghmm_cseq_mix_like */static int *preproccess_alphabet(ghmm_alphabet *a) {#define CUR_PROC "preprocess_alphabet" unsigned int i; int *l = malloc(128*sizeof(*l)); memset(l, -1, 128*sizeof(*l)); for (i=0; i<a->size; ++i) { if (strlen(a->symbols[i]) == 1) { l[a->symbols[i][0]] = i; } else { GHMM_LOG(LERROR, "invalid alphabet for ghmm_dseq_open_fasta"); free(l); return NULL; } } return l;#undef CUR_PROC}static int get_internal(int *lookup, char x) { if (x>=0 && x < 128) return lookup[x]; else { return -1; }}/*============================================================================*/ghmm_dseq *ghmm_dseq_open_fasta(const char *filename, ghmm_alphabet *alphabet) {#define CUR_PROC "ghmm_dseq_open_fasta" char *line, *input, *last_header; int seqlen, linelen, symbol, i; int pos=0, seq_nr=0, header_cont=0, skip_seq=1; int *lookup, *sequences, *curseq_ptr; FILE *fa = NULL; struct stat sb; ghmm_dseq *seqs = ghmm_dseq_calloc(50); if (!seqs) goto STOP; ARRAY_MALLOC(line, 121); ARRAY_MALLOC(last_header, 121); if (!(fa = fopen(filename, "r"))) { GHMM_LOG_PRINTF(LERROR, LOC, "can't open FastA file %s", filename); goto STOP; } /* determine filesize to allocate the buffer for the sequnce data */ if (stat(filename, &sb)) { goto STOP; } ARRAY_MALLOC(sequences, sb.st_size); /* set flag indicating that the sequences are all in one array */ seqs->flags |= kBlockAllocation; curseq_ptr = sequences; if (!(lookup = preproccess_alphabet(alphabet))) goto STOP; while ((input = fgets(line, 120, fa))) { linelen = strlen(input); /* found a sequnce identifier, start next */ if (input[0] == '>' && !header_cont) { if (curseq_ptr && !skip_seq) { if (seqs->capacity <= seq_nr && ghmm_dseq_realloc(seqs, (seqs->capacity)*2)) { GHMM_LOG(LERROR, "reallocation failed"); goto STOP; } seqs->seq[seq_nr] = curseq_ptr; seqs->seq_len[seq_nr] = seqlen; curseq_ptr = sequences + pos; seq_nr++; } skip_seq = 0; seqlen = 0; strncpy(last_header, input+1, m_min(65, linelen-2)); /* check whether the header was completed */ if (input[linelen-1] != '\n') header_cont = 1; } /* header continuation */ else if (header_cont) { header_cont = 0; if (input[linelen-1] != '\n') header_cont = 1; } /* reading sequence data */ else if (!skip_seq) { if (input[linelen-1] == '\n') linelen--; for (i=0; i<linelen; ++i) { if (0 > (symbol = get_internal(lookup, input[i]))) { GHMM_LOG_PRINTF(LWARN, LOC, "Invalid char %c in sequence \"%s ...\" ignoring it", input[i], last_header); skip_seq = 1; pos -= (seqlen + i); goto NEXT_LINE; } sequences[pos++] = symbol; } seqlen += linelen; } NEXT_LINE: continue; } if (curseq_ptr) { if (ghmm_dseq_realloc(seqs, seq_nr+1)) { GHMM_LOG(LERROR, "reallocation failed"); goto STOP; } seqs->seq[seq_nr] = curseq_ptr; seqs->seq_len[seq_nr] = seqlen; seqs->seq_number = seq_nr+1; } fclose(fa); free(lookup); return seqs;STOP: fclose(fa); ghmm_dseq_free(&seqs); free(sequences); free(line); free(lookup); return NULL;#undef CUR_PROC}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?