📄 dropfz2.c
字号:
/* copyright (c) 1998, 1999 William R. Pearson and the U. of Virginia *//* $Name: fa35_03_06 $ - $Id: dropfz2.c,v 1.65 2008/02/19 08:50:13 wrp Exp $ *//* 18-Sept-2006 - removed static global variables for alignment *//* 2002/06/23 finally correctly implement fix to translate 'N' to 'X' *//* 1999/11/29 modification by Z. Zhang to translate DNA 'N' as 'X' *//* implements an improved version of the fasty algorithm, see: W. R. Pearson, T. Wood, Z. Zhang, A W. Miller (1997) "Comparison of DNA sequences with protein sequences" Genomics 46:24-36*/#include <stdio.h>#include <stdlib.h>#include <string.h>#include <math.h>#include <ctype.h>#include "defs.h"#include "param.h"#define XTERNAL#include "upam.h"#include "uascii.h"#define NT_N 16/* globals for fasta */#define MAXWINDOW 64#ifndef MAXSAV#define MAXSAV 10#endif#ifndef ALLOCN0static char *verstr="3.5 Sept 2006";#elsestatic char *verstr="3.5an0 Sept 2006";#endifstruct dstruct /* diagonal structure for saving current run */{ int score; /* hash score of current match */ int start; /* start of current match */ int stop; /* end of current match */ struct savestr *dmax; /* location in vmax[] where best score data saved */};struct savestr{ int score; /* pam score with segment optimization */ int score0; /* pam score of best single segment */ int gscore; /* score from global match */ int dp; /* diagonal of match */ int start; /* start of match in lib seq */ int stop; /* end of match in lib seq */};void savemax();void kpsort();struct sx_s {int C1, C2, C3, I1, I2, I3, flag; };struct wgt { int iii, ii, iv;};struct wgtc {char c2, c3, c4, c5;};typedef struct st_s { int C, I, D;} *st_ptr;struct f_struct { struct dstruct *diag; struct savestr vmax[MAXSAV]; /* best matches saved for one sequence */ struct savestr *vptr[MAXSAV]; struct savestr *lowmax; int ndo; int noff; int hmask; /* hash constants */ int *pamh1; /* pam based array */ int *pamh2; /* pam based kfact array */ int *link, *harr; /* hash arrays */ int kshft; /* shift width */ int nsav, lowscor; /* number of saved runs, worst saved run */#ifndef TFAST unsigned char *aa0x, *aa0v; /* aa0x - 111122223333 */#else unsigned char *aa1x, *aa1v; /* aa1x - 111122223333 */#endif /* aa1v - computed codons */ struct sx_s *cur; struct wgt **weight0; struct wgt **weight1; struct wgtc **weight_c; int *waa; int *res; int max_res; st_ptr up, down, tp;};#define DROP_INTERN#include "drop_func.h"static int dmatchx(const unsigned char *aa0, int n0, const unsigned char *aa1, int n1, int hoff, int window, int **pam2, int gdelval, int ggapval, int gshift, struct f_struct *f_str);int shscore(unsigned char *aa0, int n0, int **pam2);int saatran(const unsigned char *ntseq, unsigned char *aaseq, int maxs, int frame);int spam (const unsigned char *aa0, const unsigned char *aa1, struct savestr *dmax, int **pam2, struct f_struct *f_str);int sconn (struct savestr **v, int n,int cgap, int pgap, struct f_struct *f_str);int lx_band(const unsigned char *prot_seq, int len_prot, const unsigned char *dna_prot_seq, int len_dna_prot, int **pam_matrix, int gopen, int gext, int gshift, int start_diag, int width, struct f_struct *f_str);static void update_code(char *al_str, int al_str_max, int op, int op_cnt, char *op_char);extern void w_abort (char *p, char *p1);extern void aagetmap(char *to, int n);/* initialize for fasta *//* modified 30-August-1999 by Zheng Zhang to work with an extended alphabet *//* Assume naa=47, and wgts[47][23] matches both upper and lower caseamoino acids with another amino acid. And also assume the DNA letterdoes not have upper/lower case difference. If you also allow DNAsequence to be upper/lower case letters, more needs be changed. Notonly here, but also in the alignment code, the way that pack a codoninto a number between 0-63 need be changed. *//* modified so that if **weightci==NULL, do not fiddle with characters */voidinit_weights(struct wgt ***weighti, struct wgtc ***weightci, int **wgts, int gshift, int gsubs, int naa){ int i, j, do_wgtc=0; int aa, b, a, x, y, z; int *wwt, e; struct wgt **weight; struct wgtc **weightc; char aacmap[64]; int temp[MAXLC+1][64]; /*change*/ char le[MAXLC+1][64]; if (naa > MAXLC) { fprintf(stderr,"*** dropfz2.c compilation problem %d > %d ***\n", naa, MAXLC); } if ((*weighti=(struct wgt **)calloc((size_t)(naa+1),sizeof(struct wgt *))) ==NULL) { fprintf(stderr," cannot allocate weights array: %d\n",naa); exit(1); } weight = *weighti; for (aa=0; aa <= naa; aa++) { if ((weight[aa]=(struct wgt *)calloc((size_t)256,sizeof(struct wgt))) ==NULL) { fprintf(stderr," cannot allocate weight[]: %d/%d\n",aa,naa); exit(1); } } if (weightci !=NULL) { if ((*weightci=(struct wgtc **)calloc((size_t)(naa+1), sizeof(struct wgtc *)))==NULL) { fprintf(stderr," cannot allocate weight_c array: %d\n",naa); exit(1); } weightc = *weightci; for (aa=0; aa <= naa; aa++) { if ((weightc[aa]=(struct wgtc *)calloc((size_t)256,sizeof(struct wgtc))) ==NULL) { fprintf(stderr," cannot allocate weightc[]: %d/%d\n",aa,naa); exit(1); } } do_wgtc = 1; } else do_wgtc = 0; aagetmap(aacmap,64); for (aa = 0; aa <= naa; aa++) { /* change*/ wwt = wgts[aa]; /* pam matrix */ for (i = 0; i < 64; i++) { /* i iterates through the codons */ x = -1000; /* large negative */ y = i; for (j = 0; j < 64; j++) { /* j iterates through the codons */ z = ((~i & j) | (i & ~j)); b = 0; /* score = 0 */ if (z % 4) b-= gsubs; if (z /16) b-= gsubs; if ((z /4) % 4) b -= gsubs; b += wwt[aascii[aacmap[j]]]; /* add the match score for char j*/ if (b > x) { x = b; /* x has the score */ y = j; /* y has the character */ } } /* if (y < 0 || y > 63) printf("%d %d %d %d ",aa, i, x, y); */ temp[aa][i] = x; le[aa][i] = y; } /* printf("\n"); */ } for (aa= 0; aa <= naa; aa++) { wwt = temp[aa]; for (i = 0; i < 256; i++) { for (x=-100,b = 0; b < 4; b++) { z = (i/ (1 << ((b+1)*2)))*(1<<(b*2))+(i%(1<<(b*2))); if (x < (e=wwt[z])) { x = e; if (do_wgtc) weightc[aa][i].c4 = aacmap[le[aa][z]]; } } weight[aa][i].iv=x-gshift; weight[aa][i].iii = wwt[i%64]; if (do_wgtc) { weightc[aa][i].c5 = aacmap[le[aa][i%64]]; weightc[aa][i].c3 = aacmap[i%64]; } x = i %16; for (y = -100, b = 0; b < 3; b++) { z = ((x >> (b*2)) << (b*2+2)) + (x % (1 << (b*2))); for (a = 0; a < 4; a++) { if ((e =wwt[z+(a<<(b*2))]) > y) { y = e; if (do_wgtc) weightc[aa][i].c2 = aacmap[le[aa][z+(a<<(b*2))]]; } } } weight[aa][i].ii = y-gshift; } } /*106=CGGG*/ for (aa = 0; aa <= naa; aa++) { weight[aa][106].iii = wgts[aa][23]; /* is 23 the code for 'X'?*/ weight[aa][106].iv = weight[aa][106].ii = weight[aa][106].iii-gshift; if (do_wgtc) { weightc[aa][106].c5 = weightc[aa][106].c4 = weightc[aa][106].c3 = weightc[aa][106].c2 = 'X'; } }}voidfree_weights(struct wgt ***weighti0, struct wgt ***weighti1, struct wgtc ***weightci, int naa){ int aa; struct wgt **weight0; struct wgt **weight1; struct wgtc **weightc; weight0 = *weighti0; weight1 = *weighti1; weightc = *weightci; for (aa=0; aa <= naa; aa++) {free(weight0[aa]);} for (aa=0; aa <= naa; aa++) {free(weight1[aa]);} for (aa=0; aa <= naa; aa++) {free(weightc[aa]);} free(weight0); free(weight1); free(weightc);}static voidpre_com(const unsigned char *aa0, int n0, unsigned char *aa0v){ int dnav, i; dnav = (hnt[aa0[0]]<<2) + hnt[aa0[1]]; for (i=2; i<n0; i++) { dnav = ((dnav<<2)+hnt[aa0[i]])&255; if (aa0[i] == NT_N || aa0[i-1]==NT_N || aa0[i-2] == NT_N) aa0v[i-2] = 106; else { if (dnav == 106/*CGGG*/) dnav = 42/*AGGG*/; aa0v[i-2]=dnav; } }}static voidpre_com_r(const unsigned char *aa0, int n0, unsigned char *aa0v){ int dnav, i, ir; dnav = (3-hnt[aa0[n0-1]]<<2) + 3-hnt[aa0[n0-2]]; for (i=2, ir=n0-3; i<n0; i++,ir--) { dnav = ((dnav<<2)+3-hnt[aa0[ir]])&255; if (aa0[ir] == NT_N || aa0[ir+1]==NT_N || aa0[ir+2] == NT_N) aa0v[i-2] = 106; else { if (dnav == 106) dnav = 42; aa0v[i-2]=dnav; } }}voidinit_work (unsigned char *aa0, int n0, struct pstruct *ppst, struct f_struct **f_arg){ int mhv, phv; int hmax; int i0, hv; int pamfact; int btemp; struct f_struct *f_str; struct bdstr *bss; /* these used to be globals, but do not need to be */ int ktup, fact, kt1, lkt; int maxn0; int *pwaa; int i, j, q; struct swstr *ss, *r_ss; int *waa; int *res; int nsq, ip, *hsq, naat;#ifndef TFAST int last_n0, itemp, dnav; unsigned char *fd, *fs, *aa0x, *aa0v; int n0x, n0x3;#endif if (nt[NT_N] != 'N') { fprintf(stderr," nt[NT_N] (%d) != 'X' (%c) - recompile\n",NT_N,nt[NT_N]); exit(1); } if (ppst->ext_sq_set) { nsq = ppst->nsqx; ip = 1; hsq = ppst->hsqx; } else { nsq = ppst->nsq; ip = 0; hsq = ppst->hsq; } f_str = (struct f_struct *)calloc(1,sizeof(struct f_struct)); btemp = 2 * ppst->param_u.fa.bestoff / 3 + n0 / ppst->param_u.fa.bestscale + ppst->param_u.fa.bkfact * (ppst->param_u.fa.bktup - ppst->param_u.fa.ktup); btemp = min (btemp, ppst->param_u.fa.bestmax); if (btemp > 3 * n0) btemp = 3 * shscore(aa0,n0,ppst->pam2[0]) / 5; ppst->param_u.fa.cgap = btemp + ppst->param_u.fa.bestoff / 3; if (ppst->param_u.fa.optcut_set != 1)#ifndef TFAST ppst->param_u.fa.optcut = (btemp*5)/4;#else ppst->param_u.fa.optcut = (btemp*4)/3;#endif#ifdef OLD_FASTA_GAP ppst->param_u.fa.pgap = ppst->gdelval + ppst->ggapval;#else ppst->param_u.fa.pgap = ppst->gdelval + 2*ppst->ggapval;#endif pamfact = ppst->param_u.fa.pamfact; ktup = ppst->param_u.fa.ktup; fact = ppst->param_u.fa.scfact * ktup;#ifndef TFAST /* before hashing, we must set up some space and translate the sequence */ maxn0 = n0 + 2; if ((aa0x =(unsigned char *)calloc((size_t)maxn0, sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa0x array %d\n", maxn0); exit (1); } aa0x++; f_str->aa0x = aa0x; if ((aa0v =(unsigned char *)calloc((size_t)maxn0, sizeof(unsigned char))) == NULL) { fprintf (stderr, "cannot allocate aa0v array %d\n", maxn0); exit (1); } aa0v++; f_str->aa0v = aa0v; /* make a precomputed codon number series */ pre_com(aa0, n0, aa0v); last_n0 = 0; for (itemp=0; itemp<3; itemp++) { n0x=saatran(aa0,&aa0x[last_n0],n0,itemp); /* for (i=0; i<n0x; i++) { fprintf(stderr,"%c",aa[aa0x[last_n0+i]]); if ((i%60)==59) fprintf(stderr,"\n"); } fprintf(stderr,"\n"); */ last_n0 += n0x+1; } /* fprintf(stderr,"\n"); */ n0x = n0; n0x3 = n0x/3; /* now switch aa0 and aa0x for hashing functions */ fs = aa0; aa0 = aa0x; aa0x = fs;#endif if (ppst->ext_sq_set) naat = MAXLC; else naat = MAXUC; init_weights(&f_str->weight0, NULL, ppst->pam2[ip],-ppst->gshift,-ppst->gsubs,naat); init_weights(&f_str->weight1, &f_str->weight_c, ppst->pam2[0],-ppst->gshift,-ppst->gsubs,naat); if (pamfact == -1) pamfact = 0; else if (pamfact == -2) pamfact = 1; for (i0 = 1, mhv = -1; i0 <= ppst->nsq; i0++) if (hsq[i0] < NMAP && hsq[i0] > mhv) mhv = ppst->hsq[i0]; if (mhv <= 0) { fprintf (stderr, " maximum hsq <=0 %d\n", mhv); exit (1); } for (f_str->kshft = 0; mhv > 0; mhv /= 2) f_str->kshft++;/* kshft = 2; */ kt1 = ktup - 1; hv = 1; for (i0 = 0; i0 < ktup; i0++) hv = hv << f_str->kshft; hmax = hv; f_str->hmask = (hmax >> f_str->kshft) - 1; if ((f_str->harr = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash array\n"); exit (1); } if ((f_str->pamh1 = (int *) calloc (ppst->nsq+1, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh1 array\n"); exit (1); } if ((f_str->pamh2 = (int *) calloc (hmax, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate pamh2 array\n"); exit (1); } if ((f_str->link = (int *) calloc (n0, sizeof (int))) == NULL) { fprintf (stderr, " cannot allocate hash link array"); exit (1); } for (i0 = 0; i0 < hmax; i0++) f_str->harr[i0] = -1; for (i0 = 0; i0 < n0; i0++) f_str->link[i0] = -1; /* encode the aa0 array */ phv = hv = 0; lkt = kt1; for (i0 = 0; i0 < min(n0,lkt); i0++) { if (hsq[aa0[i0]] >= NMAP) { hv=phv=0; lkt = i0+ktup; continue; } hv = (hv << f_str->kshft) + ppst->hsq[aa0[i0]]; phv += ppst->pam2[ip][aa0[i0]][aa0[i0]] * ktup; } for (; i0 < n0; i0++) { if (hsq[aa0[i0]] >= NMAP) { hv=phv=0; /* restart hv, phv calculation */ for (lkt = i0+kt1; (i0 < lkt || hsq[aa0[i0]]>=NMAP) && i0<n0; i0++) { if (hsq[aa0[i0]] >= NMAP) { hv=phv=0;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -