⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 initfa.c

📁 序列对齐 Compare a protein sequence to a protein sequence database or a DNA sequence to a DNA sequenc
💻 C
📖 第 1 页 / 共 4 页
字号:
  }}doubleget_lambda(int **pam2p, int n0, int nsq, unsigned char *query) {  double lambda, H;  double *pr, tot, sum;  int i, ioff, j, min, max;  /* get min and max scores */  min = BIGNUM;  max = -BIGNUM;  if(pam2p[0][1] == -BIGNUM) {    ioff = 1;    n0++;  } else {    ioff = 0;  }  for (i = ioff ; i < n0 ; i++) {    for (j = 1; j <= nsq ; j++) {      if (min > pam2p[i][j])	min = pam2p[i][j];      if (max < pam2p[i][j])	max = pam2p[i][j];    }  }  /*  fprintf(stderr, "min: %d\tmax:%d\n", min, max); */    if ((pr = (double *) calloc(max - min + 1, sizeof(double))) == NULL) {    fprintf(stderr, "Couldn't allocate memory for score probabilities: %d\n", max - min + 1);    exit(1);  }  tot = (double) rrtotal * (double) rrtotal * (double) n0;  for (i = ioff ; i < n0 ; i++) {    for (j = 1; j <= nsq ; j++) {      pr[pam2p[i][j] - min] +=	(double) ((double) rrcounts[aascii[query[i]]] * (double) rrcounts[j]) / tot;    }  }  sum = 0.0;  for(i = 0 ; i <= max-min ; i++) {     sum += pr[i];    /*     fprintf(stderr, "%3d: %g %g\n", i+min, pr[i], sum); */  }  /*   fprintf(stderr, "sum: %g\n", sum); */  for(i = 0 ; i <= max-min ; i++) { pr[i] /= sum; }  if (!karlin(min, max, pr, &lambda, &H)) {    fprintf(stderr, "Karlin lambda estimation failed\n");  }  /*   fprintf(stderr, "lambda: %g\n", lambda); */  free(pr);  return lambda;}/*   *aa0 - query sequence   n0   - length   pamscale - scaling for pam matrix - provided by apam.c, either              0.346574 = ln(2)/2 (P120, BL62) or	      0.231049 = ln(2)/3 (P250, BL50) */voidscale_pssm(int **pssm2p, double **freq2d,	   unsigned char *query, int n0,	   int **pam2, double pamscale);static unsigned char ustandard_aa[] ="\0ARNDCQEGHILKMFPSTWYV";voidread_pssm(unsigned char *aa0, int n0, int nsq,	  double pamscale, 	  FILE *fp, int pgpf_type, struct pstruct *ppst) {  int i, j, len, k;  int qi, rj;	/* qi - index query; rj - index residues (1-20) */  int **pam2p;  int first, too_high;  unsigned char *query, ctmp;  char dline[512];  double freq, **freq2d, lambda, new_lambda;  double scale, scale_high, scale_low;  pam2p = ppst->pam2p[0];  if (pgpf_type == 0) {    if(1 != fread(&len, sizeof(int), 1, fp)) {      fprintf(stderr, "error reading from checkpoint file: %d\n", len);      exit(1);    }    if(len != n0) {      fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",	      len,n0);      exit(1);    }    /* read over query sequence stored in BLAST profile */    if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {      fprintf(stderr, "Couldn't allocate memory for query!\n");      exit(1);    }    if(len != fread(query, sizeof(char), len, fp)) {      fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);      exit(1);    }  }  else if (pgpf_type == 1) {    if ((fgets(dline,sizeof(dline),fp) == NULL)  ||	(1 != sscanf(dline, "%d",&len))) {      fprintf(stderr, "error reading from checkpoint file: %d\n", len);      exit(1);    }    if(len != n0) {      fprintf(stderr, "profile length (%d) and query length (%d) don't match!\n",	      len,n0);      exit(1);    }    /* read over query sequence stored in BLAST profile */    if(NULL == (query = (unsigned char *) calloc(len+2, sizeof(char)))) {      fprintf(stderr, "Couldn't allocate memory for query!\n");      exit(1);    }    if (fgets((char *)query,len+2,fp)==NULL) {      fprintf(stderr, "Couldn't read query sequence from profile: %s\n", query);      exit(1);    }  }    else {    fprintf(stderr," Unrecognized PSSM file type: %d\n",pgpf_type);    exit(1);  }  /* currently we don't do anything with query; ideally, we should     check to see that it actually matches aa0 ... */  /* quick 2d array alloc: */  if((freq2d = (double **) calloc(n0, sizeof(double *))) == NULL) {    fprintf(stderr, "Couldn't allocate memory for frequencies!\n");    exit(1);  }  if((freq2d[0] = (double *) calloc(n0 * 20, sizeof(double))) == NULL) {    fprintf(stderr, "Couldn't allocate memory for frequencies!\n");    exit(1);  }  /* a little pointer arithmetic to fill out 2d array: */  for (i = 1 ; i < n0 ; i++) {    freq2d[i] = freq2d[i-1] + 20;  }  if (pgpf_type == 0) {    for (qi = 0 ; qi < n0 ; qi++) {      for (rj = 0 ; rj < 20 ; rj++) {	if(1 != fread(&freq, sizeof(double), 1, fp)) {	  fprintf(stderr, "Error while reading frequencies!\n");	  exit(1);	}	freq2d[qi][rj] = freq;      }    }  }  else {    for (qi = 0 ; qi < n0 ; qi++) {      if ((fgets(dline,sizeof(dline),fp) ==NULL) ||      (k = sscanf(dline,"%c %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg %lg\n",		 &ctmp, &freq2d[qi][0], &freq2d[qi][1], &freq2d[qi][2], &freq2d[qi][3], &freq2d[qi][4], 		 &freq2d[qi][5], &freq2d[qi][6], &freq2d[qi][7], &freq2d[qi][8], &freq2d[qi][9],		 &freq2d[qi][10], &freq2d[qi][11], &freq2d[qi][12], &freq2d[qi][13], &freq2d[qi][14],		      &freq2d[qi][15], &freq2d[qi][16], &freq2d[qi][17], &freq2d[qi][18], &freq2d[qi][19]))<1) {	fprintf(stderr, "Error while reading frequencies: %d read!\n",k);	exit(1);      }      for (rj=0; rj<20; rj++) { freq2d[qi][rj] /= 10.0; }	/* reverse scaling */    }  }  scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);  free(freq2d[0]);  free(freq2d);  free(query);}voidscale_pssm(int **pssm2p, double **freq2d, unsigned char *query, int n0, int **pam2, double pamscale) {  int i, qi, rj;  double freq, new_lambda, lambda;  int first, too_high;  double scale, scale_high, scale_low;  for (qi = 0 ; qi < n0 ; qi++) {    for (rj = 0 ; rj < 20 ; rj++) {      if (freq2d[qi][rj] > 1e-20) {	freq = log(freq2d[qi][rj] /((double) (rrcounts[rj+1])/(double) rrtotal));	freq /= pamscale; /* this gets us close to originial pam scores */	freq2d[qi][rj] = freq;      }      else {			/* when blastpgp decides to leave something out, it puts 0's in all the frequencies	   in the binary checkpoint file.  In the ascii version, however, it uses BLOSUM62	   values.  I will put in scoring matrix values as well */	freq2d[qi][rj] = pam2[aascii[query[qi]]][rj+1];      }    }  }  /* now figure out the right scale */  scale = 1.0;  lambda = get_lambda(pam2, 20, 20, ustandard_aa);  /* should be near 1.0 because of our initial scaling by ppst->pamscale */  /* fprintf(stderr, "real_lambda: %g\n", lambda); */  /* get initial high/low scale values: */  first = 1;  while (1) {    fill_pam(pssm2p, n0, 20, freq2d, scale);    new_lambda = get_lambda(pssm2p, n0, 20, query);     if (new_lambda > lambda) {      if (first) {	first = 0;	scale = scale_high = 1.0 + 0.05;	scale_low = 1.0;	too_high = 1;      } else {	if (!too_high) break;	scale = (scale_high += scale_high - 1.0);      }    } else if (new_lambda > 0) {      if (first) {	first = 0;	scale_high = 1.0;	scale = scale_low = 1.0 - 0.05;	too_high = 0;      } else {	if (too_high) break;	scale = (scale_low += scale_low - 1.0);      }    } else {      fprintf(stderr, "new_lambda (%g) <= 0; matrix has positive average score", new_lambda);      exit(1);    }  }  /* now do binary search between low and high */  for (i = 0 ; i < 10 ; i++) {    scale = 0.5 * (scale_high + scale_low);    fill_pam(pssm2p, n0, 20, freq2d, scale);    new_lambda = get_lambda(pssm2p, n0, 20, query);        if (new_lambda > lambda) scale_low = scale;    else scale_high = scale;  }  scale = 0.5 * (scale_high + scale_low);  fill_pam(pssm2p, n0, 20, freq2d, scale);  /*  fprintf(stderr, "final scale: %g\n", scale);  for (qi = 0 ; qi < n0 ; qi++) {    fprintf(stderr, "%4d %c:  ", qi+1, query[qi]);    for (rj = 1 ; rj <= 20 ; rj++) {      fprintf(stderr, "%4d", pssm2p[qi][rj]);    }    fprintf(stderr, "\n");  }  */}#if defined(CAN_PSSM)intparse_pssm_asn_fa(FILE *afd, int *n_rows, int *n_cols,		  unsigned char **query, double ***freqs,		  char *matrix, int *gap_open, int *gap_extend,		  double *lambda);/* the ASN.1 pssm includes information about the scoring matrix used   (though not the gap penalty in the current version PSSM:2) The PSSM   scoring matrix and gap penalties should become the default if they   have not been set explicitly.*//* read the PSSM from an open FILE *fp - but nothing has been read   from *fp */intread_asn_pssm(unsigned char *aa0, int n0, int nsq,	      double pamscale, FILE *fp, struct pstruct *ppst) {  int i, j, len, k;  int qi, rj;	/* qi - index query; rj - index residues (1-20) */  int **pam2p;  int first, too_high;  unsigned char *query, ctmp;  char dline[512];  char matrix[MAX_SSTR];  double psi2_lambda;  double freq, **freq2d, lambda, new_lambda;  double scale, scale_high, scale_low;  int gap_open, gap_extend;  int n_rows, n_cols;  pam2p = ppst->pam2p[0];  if (parse_pssm_asn_fa(fp, &n_rows, &n_cols, &query, &freq2d,			matrix, &gap_open, &gap_extend, &psi2_lambda)<=0) {    return -1;  }  if (!gap_set) {    if (gap_open) {      if (gap_open > 0) {gap_open = -gap_open;}      ppst->gdelval = gap_open;    }    else if (strncmp(matrix,"BLOSUM62",8)==0) {      ppst->gdelval = -11;    }    gap_set = 1;  }  if (!del_set) {    if (gap_extend) {      if (gap_extend > 0) {gap_extend = -gap_extend;}      ppst->ggapval = gap_extend;    }    else if (strncmp(matrix,"BLOSUM62",8)==0) {      ppst->ggapval = -1;    }    del_set = 1;  }  if (strncmp(matrix, "BLOSUM62", 8)== 0 && !ppst->pam_set) {    strncpy(ppst->pamfile, "BL62", 120);    standard_pam(ppst->pamfile,ppst,del_set, gap_set);    if (!ppst->have_pam2) {     alloc_pam (MAXSQ, MAXSQ, ppst);    }    init_pam2(ppst);    ppst->pam_set = 1;  }  if (n_cols < n0) {     fprintf(stderr, " query length: %d != n_cols: %d\n",n0, n_cols);    exit(1);  }  scale_pssm(ppst->pam2p[0], freq2d, query, n0, ppst->pam2[0],pamscale);  free(freq2d[0]);  free(freq2d);  free(query);  return 1;}#endifvoidlast_params(unsigned char *aa0, int n0, 	    struct mngmsg *m_msg,	    struct pstruct *ppst#ifdef PCOMPLIB	    , struct qmng_str *qm_msg#endif	    ) {  int i, nsq;  FILE *fp;  if (n0 < 0) { return;}  ppst->n0 = m_msg->n0;  if (ppst->ext_sq_set) { nsq = ppst->nsqx; }  else {nsq = ppst->nsq;}#if defined(CAN_PSSM)  ppst->pam2p[0] = alloc_pam2p(ppst->pam2p[0],n0,nsq);  ppst->pam2p[1] = alloc_pam2p(ppst->pam2p[1],n0,nsq);  if (ppst->pam_pssm) {    if ((ppst->pgpfile_type == 0) && (fp=fopen(ppst->pgpfile,"rb"))) {      read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 0, ppst);      extend_pssm(aa0, n0, ppst);    }    else if ((ppst->pgpfile_type == 1) && (fp=fopen(ppst->pgpfile,"r"))) {      read_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, 1, ppst);      extend_pssm(aa0, n0, ppst);    }    else if ((ppst->pgpfile_type == 2) && (fp=fopen(ppst->pgpfile,"rb"))) {      if (read_asn_pssm(aa0, n0, ppst->nsq, ppst->pamscale, fp, ppst)>0) {	extend_pssm(aa0, n0, ppst);      }      else {	fprintf(stderr," Could not parse PSSM file: %s\n",ppst->pgpfile);	ppst->pam_pssm = 0;	return;      }    }    else {      fprintf(stderr," Could not open PSSM file: %s\n",ppst->pgpfile);      ppst->pam_pssm = 0;      return;    }  }#endif#if defined(FASTF) || defined(FASTS) || defined(FASTM)  m_msg->nm0 = 1;  for (i=0; i<n0; i++)    if (aa0[i]==EOSEQ || aa0[i]==ESS) m_msg->nm0++;/*  for FASTS, we can do statistics in one of two different ways  if there are <= 10 query fragments, then we calculate probabilistic  scores for every library sequence.  If there are > 10 fragments, this  takes much too long and too much memory, so we use the old fashioned  raw score only z-score normalized method initially, and then calculate  the probabilistic scores for the best hits.  To scale those scores, we  also need a set of random probabilistic scores.  So we do the qshuffle  to get them.  For FASTF, precalculating probabilities is prohibitively expensive,  so we never do it; FASTF always acts like FASTS with nfrags>10.*/#if defined(FASTS) || defined(FASTM)  if (m_msg->nm0 > 10) m_msg->escore_flg = 0;  else m_msg->escore_flg = 1;#endif  if (m_msg->escore_flg && (ppst->zsflag&1)) {    m_msg->last_calc_flg = 0;    m_msg->qshuffle = 0;  }  else {	/* need random query, second set of 2000 scores */    m_msg->last_calc_flg = 1;    m_msg->qshuffle = 1;  }#else#ifndef LALIGN  m_msg->last_calc_flg = 0;#else  m_msg->last_calc_flg = 1;#endif  m_msg->qshuffle = 0;  m_msg->escore_flg = 0;  m_msg->nm0 = 1;#endif/* adjust the ktup if appropriate */    if (!ktup_set && pgm_def_arr[ppst->pgm_id].ktup > 0) {    if (m_msg->qdnaseq == SEQT_PROT) {      ppst->param_u.fa.ktup = pgm_def_arr[ppst->pgm_id].ktup;#if defined(FASTS) || defined(FASTM)      if (n0 > 100 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;#endif      if (n0 < 40 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;    }    else if (m_msg->qdnaseq == SEQT_DNA || m_msg->qdnaseq == SEQT_RNA) {      if (n0 < 20 && ppst->param_u.fa.ktup > 1) ppst->param_u.fa.ktup = 1;#if defined(FASTS) || defined(FASTM)      /* with the current (April 12 2005) dropfs2.c - ktup cannot be > 2 */      else ppst->param_u.fa.ktup = 2;#else      else if (n0 < 50 && ppst->param_u.fa.ktup > 2) ppst->param_u.fa.ktup = 2;      else if (n0 < 100 && ppst->param_u.fa.ktup > 3)  ppst->param_u.fa.ktup = 3;#endif    }  }#ifdef PCOMPLIB  qm_msg->nm0 = m_msg->nm0;  qm_msg->escore_flg = m_msg->escore_flg;  qm_msg->qshuffle = m_msg->qshuffle;  qm_msg->pam_pssm = 0;#endif}/* given a good profile in ppst->pam2p[0], make an extended profile   in ppst->pam2p[1]*/voidextend_pssm(unsigned char *aa0, int n0, struct pstruct *ppst) {  int i, j, nsq;  int sa_x, sa_t, sa_b, sa_z, sa_j;  int **pam2p0, **pam2p1;  nsq = ppst->nsq;  pam2p0 = ppst->pam2p[0];  pam2p1 = ppst->pam2p[1];  sa_x = pascii['X'];  sa_t = pascii['*'];  if (sa_t >= ppst->nsq) {sa_t = sa_x;}  sa_b = pascii['B'];  sa_z = pascii['Z'];  sa_j = pascii['J'];  /* fill in boundaries, B, Z, *, X */  for (i=0; i<n0; i++) {    pam2p0[i][0] = -BIGNUM;    pam2p0[i][sa_b] = (int)      (((float)pam2p0[i][pascii['N']]+(float)pam2p0[i][pascii['D']]+0.5)/2.0);    pam2p0[i][sa_z] = (int)      (((float)pam2p0[i][pascii['Q']]+(float)pam2p0[i][pascii['E']]+0.5)/2.0);    pam2p0[i][sa_j] = (int)      (((float)pam2p0[i][pascii['I']]+(float)pam2p0[i][pascii['L']]+0.5)/2.0);    pam2p0[i][sa_x] = ppst->pam_xm;    pam2p0[i][sa_t] = ppst->pam_xm;  }  /* copy pam2p0 into pam2p1 */  for (i=0; i<n0; i++) {    pam2p1[i][0] = -BIGNUM;    for (j=1; j<=ppst->nsq; j++) {      pam2p1[i][j] = pam2p0[i][j];    }  }  /* then fill in extended characters, if necessary */  if (ppst->ext_sq_set) {    for (i=0; i<n0; i++) {      for (j=1; j<=ppst->nsq; j++) {	pam2p0[i][nsq+j] = pam2p0[i][j];	pam2p1[i][nsq+j] = ppst->pam_xm;      }    }  }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -