sequence.c

来自「General Hidden Markov Model Library 一个通用」· C语言 代码 · 共 1,678 行 · 第 1/4 页

C
1,678
字号
    sqd->seq_label[j] = sq->seq_label[j];#endif /* GHMM_OBSOLETE */    sqd->seq_id[j] = sq->seq_id[j];    sqd->seq_w[j] = sq->seq_w[j];  }  sqd->seq_number = sq->seq_number;  sqd->total_w = sq->total_w;  return (sqd);STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  ghmm_cseq_free (&sqd);  return NULL;#undef CUR_PROC}                               /* ghmm_cseq_create_from_dseq *//*============================================================================*/ghmm_dseq *ghmm_dseq_create_from_cseq (const ghmm_cseq * sqd){# define CUR_PROC "ghmm_dseq_create_from_cseq"  int j, i;  ghmm_dseq *sq = NULL;        /* target seq. array */  if (!(sq = ghmm_dseq_calloc (sqd->seq_number))) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  for (j = 0; j < sqd->seq_number; j++) {    ARRAY_CALLOC (sq->seq[j], sqd->seq_len[j]);    for (i = 0; i < sqd->seq_len[j]; i++) {      sq->seq[j][i] = m_int (fabs (sqd->seq[j][i]));    }    sq->seq_len[j] = sqd->seq_len[j];#ifdef GHMM_OBSOLETE    sq->seq_label[j] = sqd->seq_label[j];#endif /* GHMM_OBSOLETE */    sq->seq_id[j] = sqd->seq_id[j];    sq->seq_w[j] = sqd->seq_w[j];  }  sq->seq_number = sqd->seq_number;  sq->total_w = sqd->total_w;  return (sq);STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  ghmm_dseq_free (&sq);  return NULL;#undef CUR_PROC}                               /* ghmm_dseq_create_from_cseq *//*============================================================================*/int ghmm_dseq_max_len (const ghmm_dseq * sqd){  int i, max_len = 0;  for (i = 0; i < sqd->seq_number; i++)    if (max_len < sqd->seq_len[i])      max_len = sqd->seq_len[i];  return max_len;}                               /* ghmm_dseq_max_len *//*============================================================================*/int ghmm_cseq_max_len (const ghmm_cseq * sqd){  int i, max_len = 0;  for (i = 0; i < sqd->seq_number; i++)    if (max_len < sqd->seq_len[i])      max_len = sqd->seq_len[i];  return max_len;}                               /* ghmm_cseq_max_len *//*============================================================================*/ghmm_cseq *ghmm_cseq_mean (const ghmm_cseq * sqd){# define CUR_PROC "ghmm_cseq_mean"  int i, j, max_len;  ghmm_cseq *out_sqd = NULL;  max_len = ghmm_cseq_max_len (sqd);  if (!(out_sqd = ghmm_cseq_calloc (1))) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  ARRAY_CALLOC (out_sqd->seq[0], max_len);  out_sqd->seq_len[0] = max_len;  for (i = 0; i < sqd->seq_number; i++)    for (j = 0; j < sqd->seq_len[i]; j++)      out_sqd->seq[0][j] += sqd->seq[i][j];  for (j = 0; j < max_len; j++)    out_sqd->seq[0][j] /= sqd->seq_number;  return out_sqd;STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  ghmm_cseq_free (&out_sqd);  return NULL;# undef CUR_PROC}                               /* ghmm_cseq_mean *//*============================================================================*/double **ghmm_cseq_scatter_matrix (const ghmm_cseq * sqd, int *dim){# define CUR_PROC "ghmm_cseq_scatter_matrix"  int *count, k, l, i, j;  double **W, *mean;  *dim = ghmm_cseq_max_len (sqd);  if (!(W = ighmm_cmatrix_alloc (*dim, *dim))) {    GHMM_LOG_QUEUED(LCONVERTED);    goto STOP;  }  /* Mean over all sequences. Individual counts for each dimension */  ARRAY_CALLOC (mean, *dim);  ARRAY_CALLOC (count, *dim);  for (i = 0; i < sqd->seq_number; i++) {    for (l = 0; l < sqd->seq_len[i]; l++) {      mean[l] += sqd->seq[i][l];      count[l]++;    }  }  for (l = 0; l < *dim; l++)    mean[l] /= count[l];  /* scatter matrix (upper triangle) */  for (j = 0; j < sqd->seq_number; j++) {    for (k = 0; k < *dim; k++) {      for (l = k; l < *dim; l++) {        if (sqd->seq_len[j] > l)          W[k][l] += (sqd->seq[j][k] - mean[k]) * (sqd->seq[j][l] - mean[l]);      }    }  }  /* norm with counts, set lower triangle */  for (k = 0; k < *dim; k++) {    for (l = *dim - 1; l >= 0; l--) {      if (l >= k)        W[k][l] /= (double) count[l];      else        W[k][l] = W[l][k];    }  }  return W;STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  ighmm_cmatrix_free (&W, *dim);  return NULL;# undef CUR_PROC}                               /* ghmm_cseq_scatter_matrix *//*============================================================================*//* dummy function at the moment */int ghmm_cseq_class (const double *O, int index, double *osum){#define CUR_PROC "ghmm_cseq_class"  return 0;# undef CUR_PROC}                               /* ghmm_cseq_class *//*============================================================================*//* divide given field of seqs. randomly into two different fields. Also do    allocating. train_ratio determines approx. the fraction of seqs. that go   into the train_set and test_set resp.*/int ghmm_cseq_partition (ghmm_cseq * sqd, ghmm_cseq * sqd_train,                          ghmm_cseq * sqd_test, double train_ratio){#define CUR_PROC "ghmm_cseq_partition"  double p;  ghmm_cseq *sqd_dummy = NULL;  int i;  long total_seqs, cur_number;  total_seqs = sqd->seq_number;  if (total_seqs <= 0) {    GHMM_LOG(LCONVERTED, "Error: number of seqs. less or equal zero\n");    goto STOP;  }  /* waste of memory but avoids to many reallocations */  sqd_dummy = sqd_train;  for (i = 0; i < 2; i++) {    ARRAY_CALLOC (sqd_dummy->seq, total_seqs);    ARRAY_CALLOC (sqd_dummy->seq_len, total_seqs);#ifdef GHMM_OBSOLETE    ARRAY_CALLOC (sqd_dummy->seq_label, total_seqs);#endif /* GHMM_OBSOLETE */    ARRAY_CALLOC (sqd_dummy->seq_id, total_seqs);    ARRAY_CALLOC (sqd_dummy->seq_w, total_seqs);    sqd_dummy->seq_number = 0;    sqd_dummy = sqd_test;  }  for (i = 0; i < total_seqs; i++) {    p = GHMM_RNG_UNIFORM (RNG);    if (p <= train_ratio)      sqd_dummy = sqd_train;    else      sqd_dummy = sqd_test;    cur_number = sqd_dummy->seq_number;    ARRAY_CALLOC (sqd_dummy->seq[cur_number], sqd->seq_len[i]);    /* copy all entries */    ghmm_cseq_copy_all (sqd_dummy, cur_number, sqd, i);    sqd_dummy->seq_number++;  }  /* reallocs */  sqd_dummy = sqd_train;  for (i = 0; i < 2; i++) {    ARRAY_REALLOC (sqd_dummy->seq, sqd_dummy->seq_number);    ARRAY_REALLOC (sqd_dummy->seq_len, sqd_dummy->seq_number);#ifdef GHMM_OBSOLETE    ARRAY_REALLOC (sqd_dummy->seq_label, sqd_dummy->seq_number);#endif /* GHMM_OBSOLETE */    ARRAY_REALLOC (sqd_dummy->seq_id, sqd_dummy->seq_number);    ARRAY_REALLOC (sqd_dummy->seq_w, sqd_dummy->seq_number);    sqd_dummy = sqd_test;  }  return 0;STOP:     /* Label STOP from ARRAY_[CM]ALLOC */  return -1;#undef CUR_PROC}/*============================================================================*/void ghmm_cseq_copy_all (ghmm_cseq * target, long t_num,                          ghmm_cseq * source, long s_num){  ghmm_cseq_copy (target->seq[t_num], source->seq[s_num],                   source->seq_len[s_num]);  target->seq_len[t_num] = source->seq_len[s_num];#ifdef GHMM_OBSOLETE  target->seq_label[t_num] = source->seq_label[s_num];#endif /* GHMM_OBSOLETE */  target->seq_id[t_num] = source->seq_id[s_num];  target->seq_w[t_num] = source->seq_w[s_num];}/*============================================================================*//* Likelihood function in a mixture model:   sum_k w^k log( sum_c (alpha_c p(O^k | lambda_c)))*/int ghmm_cseq_mix_like (ghmm_cmodel ** smo, int smo_number, ghmm_cseq * sqd,                         double *like){#define CUR_PROC "ghmm_cseq_mix_like"  int i, k, error_seqs = 0;  double seq_like = 0.0, log_p;  *like = 0.0;  for (i = 0; i < sqd->seq_number; i++) {    seq_like = 0.0;    for (k = 0; k < smo_number; k++) {      if (ghmm_cmodel_logp (smo[k], sqd->seq[i], sqd->seq_len[i], &log_p) != -1) {        if (log_p > -100)          seq_like += exp (log_p) * smo[k]->prior;      }    }    /* no model fits */    if (seq_like == 0.0) {      error_seqs++;      *like += (GHMM_PENALTY_LOGP * sqd->seq_w[i]);    }    else      *like += (log (seq_like) * sqd->seq_w[i]);  }  return error_seqs;#undef CUR_PROC}                               /* ghmm_cseq_mix_like */static int *preproccess_alphabet(ghmm_alphabet *a) {#define CUR_PROC "preprocess_alphabet"    unsigned int i;    int *l = malloc(128*sizeof(*l));    memset(l, -1, 128*sizeof(*l));    for (i=0; i<a->size; ++i) {        if (strlen(a->symbols[i]) == 1) {            l[a->symbols[i][0]] = i;        }        else {            GHMM_LOG(LERROR, "invalid alphabet for ghmm_dseq_open_fasta");            free(l);            return NULL;        }    }    return l;#undef CUR_PROC}static int get_internal(int *lookup, char x) {    if (x>=0 && x < 128)        return lookup[x];    else {        return -1;    }}/*============================================================================*/ghmm_dseq *ghmm_dseq_open_fasta(const char *filename, ghmm_alphabet *alphabet) {#define CUR_PROC "ghmm_dseq_open_fasta"    char *line, *input, *last_header;    int seqlen, linelen, symbol, i;    int pos=0, seq_nr=0, header_cont=0, skip_seq=1;    int  *lookup, *sequences, *curseq_ptr;    FILE *fa = NULL;    struct stat sb;    ghmm_dseq *seqs = ghmm_dseq_calloc(50);    if (!seqs)        goto STOP;    ARRAY_MALLOC(line, 121);    ARRAY_MALLOC(last_header, 121);    if (!(fa = fopen(filename, "r"))) {        GHMM_LOG_PRINTF(LERROR, LOC, "can't open FastA file %s", filename);        goto STOP;    }    /* determine filesize to allocate the buffer for the sequnce data */    if (stat(filename, &sb)) {        goto STOP;    }    ARRAY_MALLOC(sequences, sb.st_size);    /* set flag indicating that the sequences are all in one array */    seqs->flags |= kBlockAllocation;    curseq_ptr = sequences;    if (!(lookup = preproccess_alphabet(alphabet)))        goto STOP;    while ((input = fgets(line, 120, fa))) {        linelen = strlen(input);        /* found a sequnce identifier, start next */        if (input[0] == '>' && !header_cont) {                    if (curseq_ptr && !skip_seq) {                if (seqs->capacity <= seq_nr &&                    ghmm_dseq_realloc(seqs, (seqs->capacity)*2))                {                    GHMM_LOG(LERROR, "reallocation failed");                    goto STOP;                }                seqs->seq[seq_nr] = curseq_ptr;                seqs->seq_len[seq_nr] = seqlen;                curseq_ptr = sequences + pos;                seq_nr++;            }            skip_seq = 0;            seqlen = 0;            strncpy(last_header, input+1, m_min(65, linelen-2));            /* check whether the header was completed */            if (input[linelen-1] != '\n')                header_cont = 1;        }        /* header continuation */        else if (header_cont) {            header_cont = 0;            if (input[linelen-1] != '\n')                header_cont = 1;        }        /* reading sequence data */        else if (!skip_seq) {            if (input[linelen-1] == '\n')                linelen--;            for (i=0; i<linelen; ++i) {                if (0 > (symbol = get_internal(lookup, input[i]))) {                    GHMM_LOG_PRINTF(LWARN, LOC,                                    "Invalid char %c in sequence \"%s ...\" ignoring it",                                    input[i], last_header);                    skip_seq = 1;                    pos -= (seqlen + i);                    goto NEXT_LINE;                }                sequences[pos++] = symbol;            }            seqlen += linelen;        }      NEXT_LINE:        continue;    }    if (curseq_ptr) {        if (ghmm_dseq_realloc(seqs, seq_nr+1))        {            GHMM_LOG(LERROR, "reallocation failed");            goto STOP;        }        seqs->seq[seq_nr] = curseq_ptr;        seqs->seq_len[seq_nr] = seqlen;        seqs->seq_number = seq_nr+1;    }        fclose(fa);    free(lookup);    return seqs;STOP:    fclose(fa);    ghmm_dseq_free(&seqs);    free(sequences);         free(line);    free(lookup);    return NULL;#undef CUR_PROC}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?