📄 dropfs2.c
字号:
/* copyright (c) 1998, 1999 William R. Pearson and the U. of Virginia *//* $Name: fa35_03_06 $ - $Id: dropfs2.c,v 1.45 2008/02/19 08:50:13 wrp Exp $ *//* changed to return 2.0, rather than -1.0, for failure *//* Feb 4, 2005 - modifications to allow searches with ktup=2 for very long queries. This is a temporary solution to savemax(), spam() which do not preserve exact matches do_fasts() has been modified to allow higher maxsav for do_walign than for do_work (2*nsegs, 6*nsegs) *//* this code implements the "fasts" algorithm, which compares a set of protein fragments to a protein sequence. Comma's are used to separate the sequence fragments, which need not be the same length. The expected input is: >mgstm1 MGDAPDFD, MILGYW, MLLEYTDS The fragments do not need to be in the correct order (which is presumably unknown from the peptide sequencing.*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include <math.h>#include "defs.h"#include "param.h"#include "tatstats.h"#define EOSEQ 0#define ESS 49#define NMAP_X 23 /* for 'X' */#define NMAP_Z 24 /* for '*' */#define MAXHASH 32#define NMAP MAXHASH+1static char *verstr="4.32 Feb 2007";#define DROP_INTERN#include "drop_func.h"int shscore(const unsigned char *aa0, const int n0, int **pam2, int nsq);static void update_code(char *al_str, int al_str_max, int op, int op_cnt, int fnum);extern void aancpy(char *to, char *from, int count, struct pstruct *ppst);#ifdef TFASTextern int aatran(const unsigned char *ntseq, unsigned char *aaseq, const int maxs, const int frame);#endifvoid savemax(struct dstruct *, struct f_struct *, int maxsav, int exact,int t_end);int spam(const unsigned char *, const unsigned char *, int, struct savestr *, int **, struct f_struct *);int sconn(struct savestr **v, int nsave, struct f_struct *, struct rstruct *, struct pstruct *, const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int opt_prob);void kpsort(struct savestr **, int);void kssort(struct savestr **, int); /* sort by score */int sconn_a(unsigned char *, int, const unsigned char *, int, struct f_struct *, struct a_res_str *, struct pstruct *);void kpsort(struct savestr **, int);/* initialize for fasta */voidinit_work (unsigned char *aa0, const int n0, struct pstruct *ppst, struct f_struct **f_arg ){ int mhv, phv; int hmax, nsegs; int i0, ib, hv, old_hv; int pamfact; struct f_struct *f_str; /* these used to be globals, but do not need to be */ int ktup, fact, kt1; int maxn0; int stmp; /* temporary score */ int i, j, q; int tat_size; int *res; unsigned char *query; int k, l, m, n, N, length, index; double *tatprobptr; f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct)); ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval; ktup = ppst->param_u.fa.ktup; if ( ktup > 2 ) { ktup = ppst->param_u.fa.ktup = 2; } fact = ppst->param_u.fa.scfact; /* fasts3 cannot work with lowercase symbols as low complexity; thus, NMAP must be disabled; this depends on aascii['X'] */ if (ppst->hsq[NMAP_X] == NMAP ) {ppst->hsq[NMAP_X]=1;} if (ppst->hsq[NMAP_Z] == NMAP ) {ppst->hsq[NMAP_Z]=1;} /* this does not work in a threaded environment */ /* else {fprintf(stderr," cannot find 'X'==NMAP\n");} */ for (i0 = 1, mhv = -1; i0 <= ppst->nsq; i0++) if (ppst->hsq[i0] < NMAP && ppst->hsq[i0] > mhv) mhv = ppst->hsq[i0]; if (mhv <= 0) { fprintf (stderr, " maximum hsq <=0 %d\n", mhv); exit (1); } for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;/* kshft = 2; */ kt1 = ktup-1; hv = 1; for (i0 = 0; i0 < ktup; i0++) hv = hv << f_str->kshft; hmax = hv; f_str->hmask = (hmax >> f_str->kshft) - 1; if ((f_str->aa0t = (unsigned char *) calloc(n0+1, sizeof(char))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0t array; %d\n",n0+1); exit (1); } if ((f_str->aa0ti = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0ti array; %d\n",n0+1); exit (1); } if ((f_str->aa0b = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0b array; %d\n",n0+1); exit (1); } if ((f_str->aa0e = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0e array; %d\n",n0+1); exit (1); } if ((f_str->aa0i = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0i array; %d\n",n0+1); exit (1); } if ((f_str->aa0s = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0s array; %d\n",n0+1); exit (1); } if ((f_str->aa0l = (int *) calloc(n0+1, sizeof(int))) == NULL) { fprintf (stderr, " cannot allocate f_str0->aa0l array; %d\n",n0+1); exit (1); } if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash array: hmax: %d hmask: %d\n", hmax, f_str->hmask); exit (1); } if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh1 array\n"); exit (1); } if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh2 array\n"); exit (1); } if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash link array"); exit (1); } /* for FASTS/FASTM, we want to know when we get to the end of a peptide, so we can ensure that we set the end and restart */ if ((f_str->l_end = (int *) calloc (n0, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate link end array"); exit (1); } for (i0 = 0; i0 < hmax; i0++) f_str->harr[i0] = -1; for (i0 = 0; i0 < n0; i0++) f_str->link[i0] = -1; for (i0 = 0; i0 < n0; i0++) f_str->l_end[i0] = 0; /* count the number of peptides */ nsegs = 1; for (i0 = 0; i0 < n0; i0++) { if (aa0[i0] == ESS || aa0[i0] == 0) nsegs++; } /* allocate space for peptides offsets, nm_u */ if ((f_str->nmoff = (int *)calloc(nsegs+1, sizeof(int)))==NULL) { fprintf(stderr, " cannot allocat nmoff array: %d\n", nsegs); exit(1); } if ((f_str->nm_u = (int *)calloc(nsegs+1, sizeof(int)))==NULL) { fprintf(stderr, " cannot allocat nm_u array: %d\n", nsegs); exit(1); } phv = hv = 0; f_str->nmoff[0] = 0; f_str->nm0 = 1; /* encode the aa0 array */ if (kt1 > 0) { hv = ppst->hsq[aa0[0]]; phv = ppst->pam2[0][aa0[0]][aa0[0]]; } for (i0=kt1 ; i0 < n0; i0++) { if (aa0[i0] == ESS || aa0[i0] == 0) { /* fprintf(stderr," converted %d to 0\n",aa0[i0]); */ aa0[i0] = EOSEQ; /* set ESS to 0 */ f_str->nmoff[f_str->nm0++] = i0+1; f_str->l_end[i0-1] = 1; phv = hv = 0; if (kt1 > 0) { i0++; hv = ppst->hsq[aa0[i0]]; phv = ppst->pam2[0][aa0[i0]][aa0[i0]]; } continue; } hv = ((hv & f_str->hmask) << f_str->kshft) + ppst->hsq[aa0[i0]]; f_str->link[i0] = f_str->harr[hv]; f_str->harr[hv] = i0; f_str->pamh2[hv] = (phv += ppst->pam2[0][aa0[i0]][aa0[i0]]); phv -= ppst->pam2[0][aa0[i0 - kt1]][aa0[i0 - kt1]]; } f_str->l_end[n0-1] = 1; f_str->nmoff[f_str->nm0] = n0+1; /*#ifdef DEBUG fprintf(stderr, ">>%s\n",qtitle); for (j=0; j<f_str->nm0; j++) { for (i=f_str->nmoff[j]; i < f_str->nmoff[j+1]-1; i++) { fprintf(stderr,"%c",ppst->sq[aa0[i]]); } fprintf(stderr," %d\n",aa0[i]); } for (j=1; j<=ppst->nsq; j++) { fprintf(stderr, "%c %d\n", ppst->sq[j], f_str->harr[j]); } for (j=0; j<=n0; j++) { fprintf(stderr, "%c %d\n", ppst->sq[aa0[j]], f_str->link[j]); }#endif */ /* build an integer array of the max score that can be achieved from that position - use in savemax to mark some segments as fixed */ /* setup aa0b[], aa0e[], which specify the begining and end of each segment */ stmp = 0; q = -1; for (ib = i0 = 0; i0 < n0; i0++) { f_str->aa0l[i0] = i0 - q; if (aa0[i0]==EOSEQ) { f_str->aa0b[i0] = -1; f_str->aa0e[i0] = -1; f_str->aa0i[i0] = -1; f_str->aa0l[i0] = -1; q = i0; if (i0 > 0)f_str->aa0s[i0-1] = stmp; stmp = 0; ib++; } else { stmp += ppst->pam2[0][aa0[i0]][aa0[i0]]; } f_str->aa0b[i0] = f_str->nmoff[ib]; f_str->aa0e[i0] = f_str->nmoff[ib+1]-2; f_str->aa0i[i0] = ib; /* fprintf(stderr,"%2d %c: %2d %2d %2d\n",i0,ppst->sq[aa0[i0]], f_str->aa0b[i0],f_str->aa0e[i0],f_str->aa0i[i0]); */ } f_str->aa0s[n0-1]=stmp; /* save last best possible score */ /* maxsav - maximum number of peptide alignments saved in search */ /* maxsav_w - maximum number of peptide alignments saved in alignment */ f_str->maxsav = max(MAXSAV,2*f_str->nm0); f_str->maxsav_w = max(MAXSAV,6*f_str->nm0); if ((f_str->vmax = (struct savestr *) calloc(f_str->maxsav_w,sizeof(struct savestr)))==NULL) { fprintf(stderr, "Couldn't allocate vmax[%d].\n",f_str->maxsav_w); exit(1); } if ((f_str->vptr = (struct savestr **) calloc(f_str->maxsav_w,sizeof(struct savestr *)))==NULL) { fprintf(stderr, "Couldn't allocate vptr[%d].\n",f_str->maxsav_w); exit(1); } if ((f_str->sarr = (struct slink *) calloc(f_str->maxsav_w,sizeof(struct slink)))==NULL) { fprintf(stderr, "Couldn't allocate sarr[%d].\n",f_str->maxsav_w); exit(1); } /* Tatusov Statistics Setup */ /* initialize priors array. */ if((f_str->priors = (double *)calloc(ppst->nsq+1, sizeof(double))) == NULL) { fprintf(stderr, "Couldn't allocate priors array.\n"); exit(1); } calc_priors(f_str->priors, ppst, f_str, NULL, 0, ppst->pseudocts); /* pre-calculate the Tatusov probability array for each full segment */ if(ppst->zsflag >= 1 && ppst->zsflag <= 3 && f_str->nm0 <= 10) { tat_size = (1<<f_str->nm0) -1; f_str->dotat = 1; f_str->tatprobs = (struct tat_str **) malloc((size_t)tat_size*sizeof(struct tat_str *)); if (f_str->tatprobs == NULL) { fprintf (stderr, " cannot allocate tatprobs array: %ld\n", tat_size * sizeof(struct tat_str *)); exit (1); } f_str->intprobs = (double **) malloc((size_t)tat_size * sizeof(double *)); if(f_str->intprobs == NULL) { fprintf(stderr, "Couldn't allocate intprobs array.\n"); exit(1); } for(k = 0, l = f_str->nm0 ; k < l ; k++) { query = &(aa0[f_str->nmoff[k]]); length = f_str->nmoff[k+1] - f_str->nmoff[k] - 1; /* this segment alone */ index = (1 << k) - 1; generate_tatprobs(query, 0, length - 1, f_str->priors, ppst->pam2[0], ppst->nsq, &(f_str->tatprobs[index]), NULL); /* integrate the probabilities */ N = f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore; tatprobptr = (double *) calloc(N+1, sizeof(double)); if(tatprobptr == NULL) { fprintf(stderr, "Couldn't calloc tatprobptr.\n"); exit(1); } f_str->intprobs[index] = tatprobptr; for (i = 0; i <= N ; i++ ) { tatprobptr[i] = f_str->tatprobs[index]->probs[i]; for (j = i + 1 ; j <= N ; j++ ) { tatprobptr[i] += f_str->tatprobs[index]->probs[j]; } } /* this segment built on top of all other subcombinations */ for(i = 0, j = (1 << k) - 1 ; i < j ; i++) { index = (1 << k) + i; generate_tatprobs(query, 0, length - 1, f_str->priors, ppst->pam2[0], ppst->nsq, &(f_str->tatprobs[index]), f_str->tatprobs[i]); /* integrate the probabilities */ N = f_str->tatprobs[index]->highscore - f_str->tatprobs[index]->lowscore; tatprobptr = (double *) calloc(N+1, sizeof(double)); if(tatprobptr == NULL) { fprintf(stderr, "Couldn't calloc tatprobptr.\n"); exit(1); } f_str->intprobs[index] = tatprobptr; for (m = 0; m <= N ; m++ ) { tatprobptr[m] = f_str->tatprobs[index]->probs[m]; for (n = m + 1 ; n <= N ; n++ ) { tatprobptr[m] += f_str->tatprobs[index]->probs[n]; } } } } } else { f_str->dotat = 0; f_str->shuff_cnt = ppst->shuff_node; } /* End of Tatusov Statistics Setup */ /* for (i0=1; i0<=ppst->nsq; i0++) { fprintf(stderr," %c: %2d ",ppst->sq[i0],f_str->harr[i0]); hv = f_str->harr[i0]; while (hv >= 0) { fprintf(stderr," %2d",f_str->link[hv]); hv = f_str->link[hv]; } fprintf(stderr,"\n"); } *//* this has been modified from 0..<ppst->nsq to 1..<=ppst->nsq because the pam2[0][0] is now undefined for consistency with blast*/ for (i0 = 1; i0 <= ppst->nsq; i0++) f_str->pamh1[i0] = ppst->pam2[0][i0][i0]; f_str->ndo = 0; f_str->noff = n0-1; if (f_str->diag==NULL) f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG, sizeof (struct dstruct)); if (f_str->diag == NULL) { fprintf (stderr, " cannot allocate diagonal arrays: %ld\n", (long) MAXDIAG * (long) (sizeof (struct dstruct))); exit (1); }#ifdef TFAST if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2, sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+2); exit (1); } f_str->aa1x++;#endif maxn0 = max(3*n0/2,MIN_RES); if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) { fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0); exit(1); } f_str->res = res; f_str->max_res = maxn0; *f_arg = f_str;}/* pstring1 is a message to the manager, currently 512 *//* pstring2 is the same information, but in a markx==10 format */voidget_param (struct pstruct *ppstr, char **pstring1, char *pstring2){#ifdef FASTS#ifndef TFAST char *pg_str="FASTS";#else char *pg_str="TFASTS";#endif#endif#ifdef FASTM#ifndef TFAST char *pg_str="FASTM";#else char *pg_str="TFASTM";#endif#endif sprintf (pstring1[0], "%s (%s)",pg_str,verstr); sprintf (pstring1[1], "%s matrix (%d:%d) ktup=%d", ppstr->pamfile, ppstr->pam_h,ppstr->pam_l, ppstr->param_u.fa.ktup); if (ppstr->param_u.fa.iniflag) strcat(pstring1[0]," init1"); if (pstring2 != NULL) { sprintf (pstring2, "; pg_name: %s\n; pg_ver: %s\n; pg_matrix: %s (%d:%d)\n\; pg_gap-pen: %d %d\n; pg_ktup: %d\n", pg_str,verstr,ppstr->pamfile, ppstr->pam_h,ppstr->pam_l, ppstr->gdelval, ppstr->ggapval,ppstr->param_u.fa.ktup); }}voidclose_work (const unsigned char *aa0, const int n0, struct pstruct *ppst, struct f_struct **f_arg){ struct f_struct *f_str; int i, j; f_str = *f_arg; if (f_str != NULL) { free(f_str->res);#ifdef TFAST free(f_str->aa1x - 1); /* because f_str->aa1x got ++'ed when allocated! */#endif free(f_str->diag); free(f_str->l_end); free(f_str->link); free(f_str->pamh2); free(f_str->pamh1); free(f_str->harr); free(f_str->vmax); free(f_str->vptr); free(f_str->sarr); free(f_str->aa0i); free(f_str->aa0e); free(f_str->aa0b); free(f_str->aa0ti); free(f_str->aa0t); free(f_str->nmoff); free(f_str->nm_u); if(f_str->dotat) { for(i = 0, j = (1 << f_str->nm0) - 1 ; i < j ; i++) { free(f_str->tatprobs[i]->probs); free(f_str->tatprobs[i]); free(f_str->intprobs[i]); } free(f_str->tatprobs); free(f_str->intprobs); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -