📄 initfa.c
字号:
}}doubleget_lambda(int **pam2p, int n0, int nsq, unsigned char *query) { double lambda, H; double *pr, tot, sum; int i, ioff, j, min, max; /* get min and max scores */ min = BIGNUM; max = -BIGNUM; if(pam2p[0][1] == -BIGNUM) { ioff = 1; n0++; } else { ioff = 0; } for (i = ioff ; i < n0 ; i++) { for (j = 1; j <= nsq ; j++) { if (min > pam2p[i][j]) min = pam2p[i][j]; if (max < pam2p[i][j]) max = pam2p[i][j]; } } /* fprintf(stderr, "min: %d\tmax:%d\n", min, max); */ if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) { fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1); exit(1); } tot = (double) rrtotal * (double) rrtotal * (double) n0; for (i = ioff ; i < n0 ; i++) { for (j = 1; j <= nsq ; j++) { pr[pam2p[i][j] - min] += (double) ((double) rrcounts[aascii[query[i]]] * (double) rrcounts[j]) / tot; } } sum = 0.0; for(i = 0 ; i <= max-min ; i++) { sum += pr[i]; /* fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */ } /* fprintf(stderr, "sum: %g\n", sum); */ for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; } if (!karlin(min, max, pr, &lambda, &H)) { fprintf(stderr, "Karlin lambda estimation failed\n"); } /* fprintf(stderr, "lambda: %g\n", lambda); */ free(pr); return lambda;}/* *aa0 - query sequence n0 - length pamscale - scaling for pam matrix - provided by apam.c, either 0.346574 = ln(2)/2 (P120, BL62) or 0.231049 = ln(2)/3 (P250, BL50) */voidscale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale);static unsigned char ustandard_aa[] ="\0ARNDCQEGHILKMFPSTWYV";voidread_pssm(unsigned char *aa0, int n0, int nsq, double pamscale, FILE *fp, int pgpf_type, struct pstruct *ppst) { int i, j, len, k; int qi, rj; /* qi - index query; rj - index residues (1-20) */ int **pam2p; int first, too_high; unsigned char *query, ctmp; char dline[512]; double freq, **freq2d, lambda, new_lambda; double scale, scale_high, scale_low; pam2p = ppst->pam2p[0]; if (pgpf_type == 0) { if(1 != fread(&len, sizeof(int), 1, fp)) { fprintf(stderr, "error reading from checkpoint file: %d\n", len); exit(1); } if(len != n0) { fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n", len,n0); exit(1); } /* read over query sequence stored in BLAST profile */ if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) { fprintf(stderr, "Couldn't allocate memory for query!\n"); exit(1); } if(len != fread(query, sizeof(char), len, fp)) { fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query); exit(1); } } else if (pgpf_type == 1) { if ((fgets(dline,sizeof(dline),fp) == NULL) || (1 != sscanf(dline, "%d",&len))) { fprintf(stderr, "error reading from checkpoint file: %d\n", len); exit(1); } if(len != n0) { fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n", len,n0); exit(1); } /* read over query sequence stored in BLAST profile */ if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) { fprintf(stderr, "Couldn't allocate memory for query!\n"); exit(1); } if (fgets((char *)query,len+2,fp)==NULL) { fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query); exit(1); } } else { fprintf(stderr," Unrecognized PSSM file type: %d\n",pgpf_type); exit(1); } /* currently we don't do anything with query; ideally, we should check to see that it actually matches aa0 ... */ /* quick 2d array alloc: */ if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) { fprintf(stderr, "Couldn't allocate memory for frequencies!\n"); exit(1); } if((freq2d[0] = (double *) calloc(n0 * 20, sizeof(double))) == NULL) { fprintf(stderr, "Couldn't allocate memory for frequencies!\n"); exit(1); } /* a little pointer arithmetic to fill out 2d array: */ for (i = 1 ; i < n0 ; i++) { freq2d[i] = freq2d[i-1] + 20; } if (pgpf_type == 0) { for (qi = 0 ; qi < n0 ; qi++) { for (rj = 0 ; rj < 20 ; rj++) { if(1 != fread(&freq, sizeof(double), 1, fp)) { fprintf(stderr, "Error while reading frequencies!\n"); exit(1); } freq2d[qi][rj] = freq; } } } else { for (qi = 0 ; qi < n0 ; qi++) { if ((fgets(dline,sizeof(dline),fp) ==NULL) || (k = sscanf(dline,"%c %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg\n", &ctmp, &freq2d[qi][0], &freq2d[qi][1], &freq2d[qi][2], &freq2d[qi][3], &freq2d[qi][4], &freq2d[qi][5], &freq2d[qi][6], &freq2d[qi][7], &freq2d[qi][8], &freq2d[qi][9], &freq2d[qi][10], &freq2d[qi][11], &freq2d[qi][12], &freq2d[qi][13], &freq2d[qi][14], &freq2d[qi][15], &freq2d[qi][16], &freq2d[qi][17], &freq2d[qi][18], &freq2d[qi][19]))<1) { fprintf(stderr, "Error while reading frequencies: %d read!\n",k); exit(1); } for (rj=0; rj<20; rj++) { freq2d[qi][rj] /= 10.0; } /* reverse scaling */ } } scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale); free(freq2d[0]); free(freq2d); free(query);}voidscale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale) { int i, qi, rj; double freq, new_lambda, lambda; int first, too_high; double scale, scale_high, scale_low; for (qi = 0 ; qi < n0 ; qi++) { for (rj = 0 ; rj < 20 ; rj++) { if (freq2d[qi][rj] > 1e-20) { freq = log(freq2d[qi][rj] /((double) (rrcounts[rj+1])/(double) rrtotal)); freq /= pamscale; /* this gets us close to originial pam scores */ freq2d[qi][rj] = freq; } else { /* when blastpgp decides to leave something out, it puts 0's in all the frequencies in the binary checkpoint file. In the ascii version, however, it uses BLOSUM62 values. I will put in scoring matrix values as well */ freq2d[qi][rj] = pam2[aascii[query[qi]]][rj+1]; } } } /* now figure out the right scale */ scale = 1.0; lambda = get_lambda(pam2, 20, 20, ustandard_aa); /* should be near 1.0 because of our initial scaling by ppst->pamscale */ /* fprintf(stderr, "real_lambda: %g\n", lambda); */ /* get initial high/low scale values: */ first = 1; while (1) { fill_pam(pssm2p, n0, 20, freq2d, scale); new_lambda = get_lambda(pssm2p, n0, 20, query); if (new_lambda > lambda) { if (first) { first = 0; scale = scale_high = 1.0 + 0.05; scale_low = 1.0; too_high = 1; } else { if (!too_high) break; scale = (scale_high += scale_high - 1.0); } } else if (new_lambda > 0) { if (first) { first = 0; scale_high = 1.0; scale = scale_low = 1.0 - 0.05; too_high = 0; } else { if (too_high) break; scale = (scale_low += scale_low - 1.0); } } else { fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda); exit(1); } } /* now do binary search between low and high */ for (i = 0 ; i < 10 ; i++) { scale = 0.5 * (scale_high + scale_low); fill_pam(pssm2p, n0, 20, freq2d, scale); new_lambda = get_lambda(pssm2p, n0, 20, query); if (new_lambda > lambda) scale_low = scale; else scale_high = scale; } scale = 0.5 * (scale_high + scale_low); fill_pam(pssm2p, n0, 20, freq2d, scale); /* fprintf(stderr, "final scale: %g\n", scale); for (qi = 0 ; qi < n0 ; qi++) { fprintf(stderr, "%4d %c: ", qi+1, query[qi]); for (rj = 1 ; rj <= 20 ; rj++) { fprintf(stderr, "%4d", pssm2p[qi][rj]); } fprintf(stderr, "\n"); } */}#if defined(CAN_PSSM)intparse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols, unsigned char **query, double ***freqs, char *matrix, int *gap_open, int *gap_extend, double *lambda);/* the ASN.1 pssm includes information about the scoring matrix used (though not the gap penalty in the current version PSSM:2) The PSSM scoring matrix and gap penalties should become the default if they have not been set explicitly.*//* read the PSSM from an open FILE *fp - but nothing has been read from *fp */intread_asn_pssm(unsigned char *aa0, int n0, int nsq, double pamscale, FILE *fp, struct pstruct *ppst) { int i, j, len, k; int qi, rj; /* qi - index query; rj - index residues (1-20) */ int **pam2p; int first, too_high; unsigned char *query, ctmp; char dline[512]; char matrix[MAX_SSTR]; double psi2_lambda; double freq, **freq2d, lambda, new_lambda; double scale, scale_high, scale_low; int gap_open, gap_extend; int n_rows, n_cols; pam2p = ppst->pam2p[0]; if (parse_pssm_asn_fa(fp, &n_rows, &n_cols, &query, &freq2d, matrix, &gap_open, &gap_extend, &psi2_lambda)<=0) { return -1; } if (!gap_set) { if (gap_open) { if (gap_open > 0) {gap_open = -gap_open;} ppst->gdelval = gap_open; } else if (strncmp(matrix,"BLOSUM62",8)==0) { ppst->gdelval = -11; } gap_set = 1; } if (!del_set) { if (gap_extend) { if (gap_extend > 0) {gap_extend = -gap_extend;} ppst->ggapval = gap_extend; } else if (strncmp(matrix,"BLOSUM62",8)==0) { ppst->ggapval = -1; } del_set = 1; } if (strncmp(matrix, "BLOSUM62", 8)== 0 && !ppst->pam_set) { strncpy(ppst->pamfile, "BL62", 120); standard_pam(ppst->pamfile,ppst,del_set, gap_set); if (!ppst->have_pam2) { alloc_pam (MAXSQ, MAXSQ, ppst); } init_pam2(ppst); ppst->pam_set = 1; } if (n_cols < n0) { fprintf(stderr, " query length: %d != n_cols: %d\n",n0, n_cols); exit(1); } scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale); free(freq2d[0]); free(freq2d); free(query); return 1;}#endifvoidlast_params(unsigned char *aa0, int n0, struct mngmsg *m_msg, struct pstruct *ppst#ifdef PCOMPLIB , struct qmng_str *qm_msg#endif ) { int i, nsq; FILE *fp; if (n0 < 0) { return;} ppst->n0 = m_msg->n0; if (ppst->ext_sq_set) { nsq = ppst->nsqx; } else {nsq = ppst->nsq;}#if defined(CAN_PSSM) ppst->pam2p[0] = alloc_pam2p(ppst->pam2p[0],n0,nsq); ppst->pam2p[1] = alloc_pam2p(ppst->pam2p[1],n0,nsq); if (ppst->pam_pssm) { if ((ppst->pgpfile_type == 0) && (fp=fopen(ppst->pgpfile,"rb"))) { read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 0, ppst); extend_pssm(aa0, n0, ppst); } else if ((ppst->pgpfile_type == 1) && (fp=fopen(ppst->pgpfile,"r"))) { read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 1, ppst); extend_pssm(aa0, n0, ppst); } else if ((ppst->pgpfile_type == 2) && (fp=fopen(ppst->pgpfile,"rb"))) { if (read_asn_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, ppst)>0) { extend_pssm(aa0, n0, ppst); } else { fprintf(stderr," Could not parse PSSM file: %s\n",ppst->pgpfile); ppst->pam_pssm = 0; return; } } else { fprintf(stderr," Could not open PSSM file: %s\n",ppst->pgpfile); ppst->pam_pssm = 0; return; } }#endif#if defined(FASTF) || defined(FASTS) || defined(FASTM) m_msg->nm0 = 1; for (i=0; i<n0; i++) if (aa0[i]==EOSEQ || aa0[i]==ESS) m_msg->nm0++;/* for FASTS, we can do statistics in one of two different ways if there are <= 10 query fragments, then we calculate probabilistic scores for every library sequence. If there are > 10 fragments, this takes much too long and too much memory, so we use the old fashioned raw score only z-score normalized method initially, and then calculate the probabilistic scores for the best hits. To scale those scores, we also need a set of random probabilistic scores. So we do the qshuffle to get them. For FASTF, precalculating probabilities is prohibitively expensive, so we never do it; FASTF always acts like FASTS with nfrags>10.*/#if defined(FASTS) || defined(FASTM) if (m_msg->nm0 > 10) m_msg->escore_flg = 0; else m_msg->escore_flg = 1;#endif if (m_msg->escore_flg && (ppst->zsflag&1)) { m_msg->last_calc_flg = 0; m_msg->qshuffle = 0; } else { /* need random query, second set of 2000 scores */ m_msg->last_calc_flg = 1; m_msg->qshuffle = 1; }#else#ifndef LALIGN m_msg->last_calc_flg = 0;#else m_msg->last_calc_flg = 1;#endif m_msg->qshuffle = 0; m_msg->escore_flg = 0; m_msg->nm0 = 1;#endif/* adjust the ktup if appropriate */ if (!ktup_set && pgm_def_arr[ppst->pgm_id].ktup > 0) { if (m_msg->qdnaseq == SEQT_PROT) { ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;#if defined(FASTS) || defined(FASTM) if (n0 > 100 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;#endif if (n0 < 40 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1; } else if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) { if (n0 < 20 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;#if defined(FASTS) || defined(FASTM) /* with the current (April 12 2005) dropfs2.c - ktup cannot be > 2 */ else ppst->param_u.fa.ktup = 2;#else else if (n0 < 50 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2; else if (n0 < 100 && ppst->param_u.fa.ktup > 3) ppst->param_u.fa.ktup = 3;#endif } }#ifdef PCOMPLIB qm_msg->nm0 = m_msg->nm0; qm_msg->escore_flg = m_msg->escore_flg; qm_msg->qshuffle = m_msg->qshuffle; qm_msg->pam_pssm = 0;#endif}/* given a good profile in ppst->pam2p[0], make an extended profile in ppst->pam2p[1]*/voidextend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst) { int i, j, nsq; int sa_x, sa_t, sa_b, sa_z, sa_j; int **pam2p0, **pam2p1; nsq = ppst->nsq; pam2p0 = ppst->pam2p[0]; pam2p1 = ppst->pam2p[1]; sa_x = pascii['X']; sa_t = pascii['*']; if (sa_t >= ppst->nsq) {sa_t = sa_x;} sa_b = pascii['B']; sa_z = pascii['Z']; sa_j = pascii['J']; /* fill in boundaries, B, Z, *, X */ for (i=0; i<n0; i++) { pam2p0[i][0] = -BIGNUM; pam2p0[i][sa_b] = (int) (((float)pam2p0[i][pascii['N']]+(float)pam2p0[i][pascii['D']]+0.5)/2.0); pam2p0[i][sa_z] = (int) (((float)pam2p0[i][pascii['Q']]+(float)pam2p0[i][pascii['E']]+0.5)/2.0); pam2p0[i][sa_j] = (int) (((float)pam2p0[i][pascii['I']]+(float)pam2p0[i][pascii['L']]+0.5)/2.0); pam2p0[i][sa_x] = ppst->pam_xm; pam2p0[i][sa_t] = ppst->pam_xm; } /* copy pam2p0 into pam2p1 */ for (i=0; i<n0; i++) { pam2p1[i][0] = -BIGNUM; for (j=1; j<=ppst->nsq; j++) { pam2p1[i][j] = pam2p0[i][j]; } } /* then fill in extended characters, if necessary */ if (ppst->ext_sq_set) { for (i=0; i<n0; i++) { for (j=1; j<=ppst->nsq; j++) { pam2p0[i][nsq+j] = pam2p0[i][j]; pam2p1[i][nsq+j] = ppst->pam_xm; } } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -