📄 dropfx.c
字号:
/* copyright (c) 1998, 1999 William R. Pearson and the U. of Virginia *//* $Name: fa35_03_06 $ - $Id: dropfx.c,v 1.74 2008/02/19 08:50:13 wrp Exp $ *//* implements the fastx algorithm, see: W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of DNA sequences with protein sequences" Genomics 46:24-36 see dropnfa.c for better variable descriptions and comments*/ /* 18-Sept-2006 - remove global variables used for alignment *//* 22-June-2006 - correct incorrect alignment coordinates generated after pro_dna() on projected DNA region. *//* 9-May-2003 -> 3.46 changed lx_band to use projected protein boundary end. this fixes some addressing issues on MacOSX, and speeds up alignment on very long proteins*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include <math.h>#include "defs.h"#include "param.h"#define XTERNAL#include "upam.h"/* this must be consistent with upam.h */#define MAXHASH 32#define NMAP MAXHASH+1/* globals for fasta */#define MAXWINDOW 64#ifndef MAXSAV#define MAXSAV 10#endif#ifndef ALLOCN0static char *verstr="3.5 Sept 2006";#elsestatic char *verstr="3.5an0 May 2006";#endifstruct dstruct /* diagonal structure for saving current run */{ int score; /* hash score of current match */ int start; /* start of current match */ int stop; /* end of current match */ struct savestr *dmax; /* location in vmax[] where best score data saved */};struct savestr{ int score; /* pam score with segment optimization */ int score0; /* pam score of best single segment */ int gscore; /* score from global match */ int dp; /* diagonal of match */ int start; /* start of match in lib seq */ int stop; /* end of match in lib seq */};struct swstr { int H, E;};struct bdstr { int CC, DD, CP, DP;};void savemax();void kpsort();struct sx_s {int C1, C2, C3, I1, I2, I3, flag; };struct f_struct { struct dstruct *diag; struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */ struct savestr *vptr[MAXSAV]; struct savestr *lowmax; int ndo; int noff; int hmask; /* hash constants */ int *pamh1; /* pam based array */ int *pamh2; /* pam based kfact array */ int *link, *harr; /* hash arrays */ int kshft; /* shift width */ int nsav, lowscor; /* number of saved runs, worst saved run */#ifndef TFAST unsigned char *aa0x; /* contains translated codons 111222333*/ unsigned char *aa0y; /* contains translated codons 123123123*/#else unsigned char *aa1x; /* contains translated codons 111222333 */ unsigned char *aa1y; /* contains translated codons 123123123 */#endif struct sx_s *cur; int *waa0; int *waa1; int *res; int max_res;};#define DROP_INTERN#include "drop_func.h"static int dmatchx(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int hoff, int window, int **pam2, int gdelval, int ggapval, int gshift, struct f_struct *f_str);int shscore(unsigned char *aa0, int n0, int **pam2);int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);int spam (const unsigned char *aa0, const unsigned char *aa1, struct savestr *dmax, int **pam2, struct f_struct *f_str);int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str);int lx_band(const unsigned char *prot_seq, int len_prot, const unsigned char *dna_prot_seq, int len_dna_prot, int **pam_matrix, int gopen, int gext, int gshift, int start_diag, int width, struct f_struct *f_str);static voidupdate_code(char *al_str, int al_str_max, int op, int op_cnt, char *op_char);extern void w_abort (char *p, char *p1);/* initialize for fasta */voidinit_work (unsigned char *aa0, int n0, struct pstruct *ppst, struct f_struct **f_arg){ int mhv, phv; int hmax; int i0, hv; int pamfact; int btemp; struct f_struct *f_str; int ktup; /* word size examined */ int fact; /* factor used to scale ktup match value */ int kt1; /* ktup-1 */ int lkt; /* last ktup - initiall kt1, but can be increased for hsq >= NMAP */ int maxn0; int *pwaa; int i, j, q; struct swstr *ss, *r_ss; int *waa; int *res; int nsq, ip, *hsq;#ifndef TFAST int last_n0, itemp; unsigned char *fd, *fs, *aa0x, *aa0y, *aa0s; int n0x, n0x3;#endif if (ppst->ext_sq_set) { nsq = ppst->nsqx; ip = 1; hsq = ppst->hsqx; } else { nsq = ppst->nsq; ip = 0; hsq = ppst->hsq; } f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct)); btemp = 2 * ppst->param_u.fa.bestoff / 3 + n0 / ppst->param_u.fa.bestscale + ppst->param_u.fa.bkfact * (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup); btemp = min (btemp, ppst->param_u.fa.bestmax); if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5; ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3; if (ppst->param_u.fa.optcut_set != 1)#ifndef TFAST ppst->param_u.fa.optcut = (btemp*5)/4;#else ppst->param_u.fa.optcut = (btemp*4)/3;#endif#ifdef OLD_FASTA_GAP ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;#else ppst->param_u.fa.pgap = ppst->gdelval;#endif pamfact = ppst->param_u.fa.pamfact; ktup = ppst->param_u.fa.ktup; fact = ppst->param_u.fa.scfact * ktup; if (pamfact == -1) pamfact = 0; else if (pamfact == -2) pamfact = 1; for (i0 = 1, mhv = -1; i0 <=nsq; i0++) if (hsq[i0] < NMAP && hsq[i0] > mhv) mhv = hsq[i0]; if (mhv <= 0) { fprintf (stderr, " maximum hsq <=0 %d\n", mhv); exit (1); } for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;/* kshft = 2; */ kt1 = ktup - 1; hv = 1; for (i0 = 0; i0 < ktup; i0++) { hv = hv << f_str->kshft; } hmax = hv; f_str->hmask = (hmax >> f_str->kshft) - 1; if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash array\n"); exit (1); } if ((f_str->pamh1 = (int *) calloc (nsq+1, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh1 array\n"); exit (1); } if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh2 array\n"); exit (1); } if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash link array"); exit (1); }#ifdef TFAST if ((f_str->aa1x =(unsigned char *)calloc((size_t)ppst->maxlen+2, sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa1x array %d\n", ppst->maxlen+2); exit (1); } f_str->aa1x++; if ((f_str->aa1y =(unsigned char *)calloc((size_t)ppst->maxlen+2, sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa1y array %d\n", ppst->maxlen+2); exit (1); } f_str->aa1y++;#else /* FASTX */ maxn0 = n0 + 2; if ((aa0x =(unsigned char *)calloc((size_t)maxn0,sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0); exit (1); } aa0x++; f_str->aa0x = aa0x; if ((aa0y =(unsigned char *)calloc((size_t)maxn0,sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa0y array %d\n", maxn0); exit (1); } aa0y++; f_str->aa0y = aa0y; last_n0 = 0; for (itemp=0; itemp<3; itemp++) { n0x = saatran(aa0,&aa0x[last_n0],n0,itemp); /* for (i=0; i<n0x; i++) { fprintf(stderr,"%c",aa[aa0x[last_n0+i]]); if ((i%60)==59) fprintf(stderr,"\n"); } fprintf(stderr,"\n"); */ last_n0 += n0x+1; } /* fprintf(stderr,"\n"); */ for (itemp=0, fs=aa0x; itemp <3; itemp++,fs++) { for (fd = &aa0y[itemp]; *fs!=EOSEQ; fd += 3, fs++) *fd = *fs; *fd=EOSEQ; } /* now switch aa0 and aa0x for hashing functions */ /* this seems dangerous in threaded code, but only the pointer is changed, not the data itself */ fs = aa0; aa0 = aa0x; aa0x = fs; #endif for (i0 = 0; i0 < hmax; i0++) f_str->harr[i0] = -1; for (i0 = 0; i0 < n0; i0++) f_str->link[i0] = -1; /* encode the aa0 array */ phv = hv = 0; lkt = kt1; for (i0 = 0; i0 < min(lkt,n0); i0++) { if (hsq[aa0[i0]] >= NMAP) {hv=phv=0; lkt=i0+ktup; continue;} hv = (hv << f_str->kshft) + hsq[aa0[i0]]; phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup; } for (; i0 < n0; i0++) { if (hsq[aa0[i0]] >= NMAP) { hv=phv=0; lkt = i0+ktup; /* restart hv, phv calculation */ for (; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0<n0; i0++) { if (hsq[aa0[i0]] >= NMAP) {hv=phv=0; lkt = i0+ktup; continue;} hv = (hv << f_str->kshft) + hsq[aa0[i0]]; phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup; } } if (i0 >= n0) break; hv = ((hv & f_str->hmask) << f_str->kshft) + hsq[aa0[i0]]; f_str->link[i0] = f_str->harr[hv]; f_str->harr[hv] = i0; if (pamfact) { f_str->pamh2[hv] = (phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup); /* this check should always be true, but just in case */ if (hsq[aa0[i0-kt1]]<NMAP) phv -= ppst->pam2[ip][aa0[i0 - kt1]][aa0[i0 - kt1]] * ktup; } else f_str->pamh2[hv] = fact * ktup; }#ifndef TFAST /* done hashing, now switch aa0, aa0x back */ fs = aa0; aa0 = aa0x; aa0x = fs;#endif/* this has been modified from 0..<nsq to 1..<=nsq because the pam2[0][0] is now undefined for consistency with blast*/ if (pamfact) for (i0 = 1; i0 <= nsq; i0++) f_str->pamh1[i0] = ppst->pam2[ip][i0][i0] * ktup; else for (i0 = 1; i0 <= nsq; i0++) f_str->pamh1[i0] = fact; f_str->ndo = 0; /* used to save time on diagonals with long queries */#ifndef ALLOCN0 if ((f_str->diag = (struct dstruct *) calloc ((size_t)MAXDIAG, sizeof (struct dstruct)))==NULL) { fprintf (stderr," cannot allocate diagonal arrays: %ld\n", (long) MAXDIAG *sizeof (struct dstruct)); exit (1); };#else if ((f_str->diag = (struct dstruct *) calloc ((size_t)n0, sizeof (struct dstruct)))==NULL) { fprintf (stderr," cannot allocate diagonal arrays: %ld\n", (long)n0*sizeof (struct dstruct)); exit (1); };#endif if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) { fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0); exit(1); } pwaa = waa; for (i=0; i<=nsq; i++) { for (j=0;j<n0; j++) { *pwaa = ppst->pam2[ip][i][aa0[j]]; pwaa++; } } f_str->waa0 = waa; if ((waa= (int *)malloc (sizeof(int)*(nsq+1)*n0)) == NULL) { fprintf(stderr,"cannot allocate waa struct %3d\n",nsq*n0); exit(1); } pwaa = waa; for (i=0; i<=nsq; i++) { for (j=0;j<n0; j++) { *pwaa = ppst->pam2[0][i][aa0[j]]; pwaa++; } } f_str->waa1 = waa;#ifndef TFAST maxn0 = max(2*n0,MIN_RES);#else /* maxn0 needs to be large enough to accomodate introns for TFASTX. For all other functions, it will be more reasonable. */ maxn0 = max(4*n0,MIN_RES);#endif if ((res = (int *)calloc((size_t)maxn0,sizeof(int)))==NULL) { fprintf(stderr,"cannot allocate alignment results array %d\n",maxn0); exit(1); } f_str->res = res; f_str->max_res = maxn0; *f_arg = f_str;}/* pstring1 is a message to the manager, currently 512 *//* pstring2 is the same information, but in a markx==10 format */voidget_param (struct pstruct *ppst, char **pstring1, char *pstring2){#ifndef TFAST char *pg_str="FASTX";#else char *pg_str="TFASTX";#endif if (!ppst->param_u.fa.optflag) { sprintf (pstring1[0], "%s (%s)",pg_str,verstr); } else { sprintf (pstring1[0], "%s (%s) [optimized]",pg_str,verstr); }#ifdef OLD_FASTA_GAP sprintf (pstring1[1], "%s matrix (%d:%d:%d)%s ktup: %d\n join: %d, gap-pen: %d/%d, shift: %d width: %3d",#else sprintf (pstring1[1], "%s matrix (o=%d:%d:%d:%d)%s ktup: %d\n join: %d, open/ext: %d/%d, shift: %d width: %3d",#endif ppst->pamfile, ppst->pam_h,ppst->pam_l,ppst->pam_xx,ppst->pam_xm, (ppst->ext_sq_set) ? "xS":"\0", ppst->param_u.fa.ktup, ppst->param_u.fa.cgap, ppst->gdelval, ppst->ggapval, ppst->gshift, ppst->param_u.fa.optwid); if (ppst->param_u.fa.iniflag) strcat(pstring1[0]," init1"); if (pstring2 != NULL) {#ifdef OLD_FASTA_GAP sprintf (pstring2, "; pg_name: %s\n; pg_ver: %s\n; pg_matrix: %s (%d:%d)%s\n\; pg_gap-pen: %d %d\n; pg_ktup: %d\n; pg_optcut: %d\n; pg_cgap: %d\n",#else sprintf (pstring2, "; pg_name: %s\n; pg_ver: %s\n; pg_matrix: %s (%d:%d)%s\n\; pg_open_ext: %d %d\n; pg_ktup: %d\n; pg_optcut: %d\n; pg_cgap: %d\n",#endif pg_str,verstr,ppst->pamfile, ppst->pam_h,ppst->pam_l, (ppst->ext_sq_set) ? "xS":"\0", ppst->gdelval, ppst->ggapval,ppst->param_u.fa.ktup,ppst->param_u.fa.optcut, ppst->param_u.fa.cgap); }}voidclose_work (const unsigned char *aa0, int n0, struct pstruct *ppst, struct f_struct **f_arg){ struct f_struct *f_str; f_str = *f_arg; if (f_str != NULL) { free(f_str->cur);#ifndef TFAST f_str->aa0y--; free(f_str->aa0y); f_str->aa0x--; free(f_str->aa0x);#else f_str->aa1y--; free(f_str->aa1y); f_str->aa1x--; free(f_str->aa1x);#endif free(f_str->res); free(f_str->waa1); free(f_str->waa0); free(f_str->diag); free(f_str->link); free(f_str->pamh2); free(f_str->pamh1);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -