📄 dpalign.c
字号:
if (c == ' ') { /* within a gap */ nspaces++; /* size of gap */ } else { /* new position in motif */ spaces[k] = MAX(spaces[k], nspaces); /* max spaces in any alignment*/ k++; /* pointer in aligned motif */ nspaces = 0; } if (!c) break; /* end of aligned motif */ } /* position in aligned motif */ } /* alignment */ /* create the alignment */ for (i=nspaces=0; i<=w; i++) nspaces += spaces[i]; /* total spaces */ create_2array(alignment, char, n+1, w+nspaces+1); for (i=0; i<n+1; i++) { for (j=0; j<w+nspaces; j++) { alignment[i][j] = ' '; } alignment[i][j] = '\0'; } /* put motif in first line of alignment */ Resize(pos1, maxl+1, int); for (i=j=0; i<=w; i++, j++) { j += spaces[i]; /* insert spaces before */ pos1[i] = j; /* where motif i is in align */ alignment[0][j] = cons[i]; /* place letter */ } /* put sequences into alignment */ for (i=0; i<n; i++) { /* seq2 */ for (j=k=l=0; seq2[i][j]; j++, l++) { /* j: position in pairwise al. k: position in motif l: position in final algmnt. */ if (motif[i][j] != ' ') l = pos1[k++]; /* map motif_pos->final_pos */ alignment[i+1][l] = seq2[i][j]; } } /* seq2 */#ifdef DEBUG /* print alignment */ printf("alignment:\n"); printf("%10.10s .%s\n", "motif", alignment[0]); for (i=1; i<n+1; i++) printf("%10.10s .%s\n", names[i-1], alignment[i]);#endif /* free up all space */ free_2array(motif, n); myfree(seq2); /* sequences freed by motif */ myfree(pos1); myfree(spaces); return alignment;} /* dp_ma_motif_seqs *//******************************************************************************//* g_align Find the longest alignment of width at least minw with g or fewer gaps per column. Different values of g are tried [0..] until a g-alignment of width minw or greater is found. The legal alignments must start in position left or greater, and end in position right or less. Returns the starting point and width of the g-alignment, and sets off, the offset of the starting point of the g-alignment.*//******************************************************************************/extern int g_align( char **ma, /* the multiple alignment */ int nseq, /* number of sequences in alignment (including the key sequence) */ int ncol, /* number of columns in alignment */ int left, /* leftmost allowed start */ int right, /* rightmost allowed end */ int minw, /* minimum width */ int *off, /* offset of g-alignment rel. to left */ int *w /* width of g-alignment */){ int i, j, pos; int *ngaps=NULL; /* number of gaps in each column */ int lalign; /* length of g-aligment */ int g; /* allowed gaps per column */ int best_w=0, best_end=left; /* width and end of longest g-align */ /* too few sequences? */ if (nseq < 2) { *w = right-left+1; *off = 0; return(0); } /* create arrays for gaps/col */ Resize(ngaps, ncol, int); /* count the number of gapped sequences at each position; if there is a gap in the first sequence, subtract the number of gaps from nseq */ for (i=0; i<ncol; i++) { /* position in alignment */ ngaps[i] = 0; for (j=1; j<nseq; j++) if (ma[j][i] == ' ') ngaps[i]++; if (ma[0][i] == ' ') ngaps[i] = nseq - ngaps[i] - 1; } /* position in alignment */ /* find the longest alignment with g or fewer gaps per column and width at least minw */ for (g=0; g<nseq; g++) { /* g */ /* fill in the length of g-alignment ending at i */ best_end = left; /* assume start of alignm. */ lalign = (!left && ma[0][0] != ' ' && ngaps[0] <= g) ? 1 : 0; lalign = 0; for (i=pos=best_w=0; i<ncol; i++) { /* position in align & motif */ if (pos>=left && ma[0][i] != ' ') lalign++; /* alignment length */ if (ngaps[i] > g) lalign = 0; /* bad column--too many gaps */ if (pos>=left && lalign>best_w) { /* within motif and better */ best_end = pos; best_w = lalign; } if (ma[0][i] != ' ') pos++; /* next position in motif */ if (pos>right) break; /* at right edge */ } /* position in alignment & motif */ if (best_w >= minw) break; /* found a legal one */ } /* g */ best_w = MAX(best_w, minw); /* make sure at least min_w */ /* set up return values */ *w = best_w; *off = best_end-best_w+1-left; /* relative to leftmost start *//* printf("g_align: Old w %d nsites %d Best w = %d offset = %d with g %d\n", right-left+1, nseq, *w, *off, g);*/ /* free space */ myfree(ngaps); return(g);} /* g_align *//******************************************************************************//* dp_multi_align Perform a multiple alignment of a motif versus a set of sequences. Uses MEME data structures. Logodds matrix uses back as background frequencies. Returns an array of null terminated, aligned sequences.*//******************************************************************************/extern char **dp_multi_align( P_PROB sites, /* the sites */ int nsites, /* the number of sites */ int w, /* width of sites */ int flank, /* add flank cols on left+rgt */ DATASET *dataset /* the dataset */){ int i; SAMPLE **samples = dataset->samples; /* sequences */ int alength = dataset->alength; /* dataset */ double wg = dataset->wg; /* gap cost (initialization) */ double ws = dataset->ws; /* space cost (extension) */ BOOLEAN endgaps = dataset->endgaps; /* penalize end gaps if TRUE */ THETA lomap = dataset->lomap; /* seq to theta logodds map */ char **eseqs=NULL; /* encoded sequences */ int *lengths=NULL; /* lengths of sequences */ char **names=NULL; /* names of sequences */ LOGODDS lo=NULL; /* logodds matrix */ char *seq = NULL; /* ascii for first sequence */ char **ma; /* multiple aligment */ /* create the array of sequences */ Resize(eseqs, nsites, char *); Resize(lengths, nsites, int); Resize(names, nsites, char *); /* prepare input for dp_ma_motif_seqs */ for (i=0; i<nsites; i++) { /* site */ int x = sites[i].x; /* sequence of site */ int y = sites[i].y; /* position in sequence */ BOOLEAN ic = sites[i].ic; /* site on - strand? */ SAMPLE *s= samples[x]; /* sample */ int lseq = s->length; /* length of sequence */ char *eseq = ic ? s->resic : s->res; /* encoded sequence */ int left = MAX(0, y-flank); /* left edge of subsequence */ int right = MIN(lseq-1, y+w+flank-1); /* right edge of subsequence */ eseqs[i] = eseq+left; /* subsequence */ lengths[i] = right-left+1; /* subsequence length */ names[i] = s->sample_name; /* sequence name */ } /* site */ /* convert the first site to a logodds matrix */ create_2array(lo, double, lengths[0], alength+1); init_theta(lo, eseqs[0], lengths[0], lomap, alength); /* get the ascii version of the first site */ Resize(seq, lengths[0]+1, char); for (i=0; i<lengths[0]; i++) seq[i] = unhash(eseqs[0][i]); seq[i] = '\0'; /* get multiple alignment */ ma = dp_ma_motif_seqs(alength, lengths[0], lo, seq, nsites-1, eseqs+1, lengths+1, names+1, wg, ws, endgaps); /* free space */ free_2array(lo, lengths[0]); myfree(eseqs); myfree(lengths); myfree(names); myfree(seq); /* return multiple alignment */ return ma;} /* dp_multi_align */#ifdef DPALIGN_SO/******************************************************************************//* dpalign*//******************************************************************************/int main( int argc, char** argv){ int i, j; double wg=11, ws=1, m=1; BOOLEAN endgaps = FALSE; int alen; int w; char *alphabet = PROTEINB; int length; char *eseq=NULL; LOGODDS logodds; char *seq1, *seq2; char *pa; i = 1; argv[0] = "dpalign"; DO_STANDARD_COMMAND_LINE(2, USAGE(<seq1> <seq2> [options]); USAGE(\n\t<seq1>\t\tfirst sequence to align); USAGE(\t<seq2>\t\tsecond sequence to align); NON_SWITCH(1, \r, switch (i++) { case 1: seq1 = _OPTION_; break; case 2: seq2 = _OPTION_; break; default: COMMAND_LINE_ERROR; } ); DATA_OPTN(1, wg, <wg>, \tgap cost, wg = atof(_OPTION_)); DATA_OPTN(1, ws, <ws>, \tspace cost, ws = atof(_OPTION_)); DATA_OPTN(1, m, <ws>, \tmatch score, m = atof(_OPTION_)); FLAG_OPTN(1, eg, \tpenalize end gaps, endgaps = TRUE); USAGE(\tGlobally align two sequences.); USAGE(\n\tCopyright); USAGE(\t(1999) The Regents of the University of California); USAGE(\tAll Rights Reserved.); USAGE(\tAuthor: Timothy L. Bailey); ); /* Set up the hashing functions for mapping between letters or codons and alphabet positions. */ setup_hash_alph(PROTEINB); /* PROTEINB to position hash */ alen = strlen(alphabet); /* convert seq1 to logodds matrix */ w = strlen(seq1); create_2array(logodds, LOGODDSB, w, alen); for (i=0; i<w; i++) { for (j=0; j<alen; j++) { logodds(i,j) = hash(seq1[i]) == j ? m : -m; } } /* convert seq2 to integer encoding */ length = strlen(seq2); Resize(eseq, length, char); for (i=0; i<length; i++) eseq[i] = hash(seq2[i]); /* align the sequences */ pa = dp_align(alen, w, logodds, seq1, length, eseq, wg, ws, endgaps);} /* main */#endif
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -