dpalign.c

来自「EM算法的改进」· C语言代码 · 共 599 行 · 第 1/2 页
599 行
      if (c == ' ') {				/* within a gap */        nspaces++;				/* size of gap */      } else {					/* new position in motif */        spaces[k] = MAX(spaces[k], nspaces);	/* max spaces in any alignment*/        k++;					/* pointer in aligned motif */        nspaces = 0;      }      if (!c) break;				/* end of aligned motif */    } /* position in aligned motif */  } /* alignment */  /* create the alignment */  for (i=nspaces=0; i<=w; i++) nspaces += spaces[i];	/* total spaces */   create_2array(alignment, char, n+1, w+nspaces+1);  for (i=0; i<n+1; i++) {    for (j=0; j<w+nspaces; j++) {      alignment[i][j] = ' ';    }    alignment[i][j] = '\0';  }  /* put motif in first line of alignment */  Resize(pos1, maxl+1, int);  for (i=j=0; i<=w; i++, j++) {    j += spaces[i];				/* insert spaces before */    pos1[i] = j;				/* where motif i is in align */    alignment[0][j] = cons[i];			/* place letter */  }  /* put sequences into alignment */  for (i=0; i<n; i++) {				/* seq2 */    for (j=k=l=0; seq2[i][j]; j++, l++) {	/* j: position in pairwise al. 						   k: position in motif						   l: position in final algmnt.						*/      if (motif[i][j] != ' ') l = pos1[k++];	/* map motif_pos->final_pos */      alignment[i+1][l] = seq2[i][j];    }  } /* seq2 */#ifdef DEBUG  /* print alignment */  printf("alignment:\n");  printf("%10.10s .%s\n", "motif", alignment[0]);  for (i=1; i<n+1; i++) printf("%10.10s .%s\n", names[i-1], alignment[i]);#endif  /* free up all space */  free_2array(motif, n);  myfree(seq2);					/* sequences freed by motif */  myfree(pos1);  myfree(spaces);  return alignment;} /* dp_ma_motif_seqs *//******************************************************************************//*	g_align	Find the longest alignment of width at least minw with g or fewer gaps 	per column.  Different values of g are tried [0..] until a g-alignment 	of width minw or greater is found.  The legal alignments must	start in position left or greater, and end in position right or less.	Returns the starting point and width of the g-alignment, and sets	off, the offset of the starting point of the g-alignment.*//******************************************************************************/extern int g_align(  char **ma,				/* the multiple alignment */  int nseq,				/* number of sequences in alignment 					   (including the key sequence) */  int ncol,				/* number of columns in alignment */  int left,				/* leftmost allowed start */  int right,				/* rightmost allowed end */  int minw,				/* minimum width */  int *off,				/* offset of g-alignment rel. to left */  int *w				/* width of g-alignment */){  int i, j, pos;  int *ngaps=NULL;			/* number of gaps in each column */  int lalign;				/* length of g-aligment */  int g;				/* allowed gaps per column */  int best_w=0, best_end=left;		/* width and end of longest g-align */  /* too few sequences? */  if (nseq < 2) {    *w = right-left+1;    *off = 0;    return(0);  }   /* create arrays for gaps/col */  Resize(ngaps, ncol, int);  /* count the number of gapped sequences at each position; if there is     a gap in the first sequence, subtract the number of gaps from nseq  */   for (i=0; i<ncol; i++) {			/* position in alignment */    ngaps[i] = 0;    for (j=1; j<nseq; j++) if (ma[j][i] == ' ') ngaps[i]++;    if (ma[0][i] == ' ') ngaps[i] = nseq - ngaps[i] - 1;  } /* position in alignment */  /*     find the longest alignment with g or fewer gaps per column and width    at least minw  */  for (g=0; g<nseq; g++) {			/* g */    /* fill in the length of g-alignment ending at i */    best_end = left;				/* assume start of alignm. */    lalign = (!left && ma[0][0] != ' ' && ngaps[0] <= g) ? 1 : 0;    lalign = 0;    for (i=pos=best_w=0; i<ncol; i++) {		/* position in align & motif */      if (pos>=left && ma[0][i] != ' ') lalign++;	/* alignment length */      if (ngaps[i] > g) lalign = 0;		/* bad column--too many gaps */      if (pos>=left && lalign>best_w) {		/* within motif and better */	best_end = pos;	best_w = lalign;      }      if (ma[0][i] != ' ') pos++;		/* next position in motif */      if (pos>right) break;			/* at right edge */    } /* position in alignment & motif */    if (best_w >= minw) break;			/* found a legal one */  } /* g */  best_w = MAX(best_w, minw);			/* make sure at least min_w */  /* set up return values */  *w = best_w;  *off = best_end-best_w+1-left;		/* relative to leftmost start *//*  printf("g_align: Old w %d nsites %d Best w = %d offset = %d with g %d\n",     right-left+1, nseq, *w, *off, g);*/  /* free space */  myfree(ngaps);  return(g);} /* g_align *//******************************************************************************//*	dp_multi_align	Perform a multiple alignment of a motif versus a set of sequences.	Uses MEME data structures.  Logodds matrix uses back as	background frequencies.	Returns an array of null terminated, aligned sequences.*//******************************************************************************/extern char **dp_multi_align(  P_PROB sites,					/* the sites */  int nsites,					/* the number of sites */  int w,					/* width of sites */  int flank,					/* add flank cols on left+rgt */  DATASET *dataset 				/* the dataset */){  int i;  SAMPLE **samples = dataset->samples;		/* sequences */  int alength = dataset->alength;		/* dataset */  double wg = dataset->wg;			/* gap cost (initialization) */   double ws = dataset->ws; 			/* space cost (extension) */   BOOLEAN endgaps = dataset->endgaps; 		/* penalize end gaps if TRUE */  THETA lomap = dataset->lomap;			/* seq to theta logodds map */  char **eseqs=NULL;				/* encoded sequences */  int *lengths=NULL;				/* lengths of sequences */  char **names=NULL;				/* names of sequences */  LOGODDS lo=NULL;				/* logodds matrix */  char *seq = NULL;				/* ascii for first sequence  */  char **ma;					/* multiple aligment */  /* create the array of sequences */  Resize(eseqs, nsites, char *);  Resize(lengths, nsites, int);   Resize(names, nsites, char *);  /* prepare input for dp_ma_motif_seqs */  for (i=0; i<nsites; i++) {			/* site */    int x = sites[i].x;				/* sequence of site */    int y = sites[i].y;				/* position in sequence */    BOOLEAN ic = sites[i].ic;			/* site on - strand? */    SAMPLE *s= samples[x];			/* sample */    int lseq = s->length;			/* length of sequence */    char *eseq = ic ? s->resic : s->res;	/* encoded sequence */    int left = MAX(0, y-flank);			/* left edge of subsequence */    int right = MIN(lseq-1, y+w+flank-1);	/* right edge of subsequence */    eseqs[i] = eseq+left;			/* subsequence */    lengths[i] = right-left+1;			/* subsequence length */    names[i] = s->sample_name;			/* sequence name */  } /* site */  /* convert the first site to a logodds matrix */  create_2array(lo, double, lengths[0], alength+1);  init_theta(lo, eseqs[0], lengths[0], lomap, alength);  /* get the ascii version of the first site */  Resize(seq, lengths[0]+1, char);  for (i=0; i<lengths[0]; i++) seq[i] = unhash(eseqs[0][i]);  seq[i] = '\0';  /* get multiple alignment */  ma = dp_ma_motif_seqs(alength, lengths[0], lo, seq,     nsites-1, eseqs+1, lengths+1, names+1, wg, ws, endgaps);  /* free space */  free_2array(lo, lengths[0]);  myfree(eseqs);  myfree(lengths);  myfree(names);  myfree(seq);  /* return multiple alignment */  return ma;} /* dp_multi_align */#ifdef DPALIGN_SO/******************************************************************************//*	dpalign*//******************************************************************************/int main(  int argc,  char** argv){  int i, j;  double wg=11, ws=1, m=1;  BOOLEAN endgaps = FALSE;  int alen;  int w;  char *alphabet = PROTEINB;  int length;  char *eseq=NULL;  LOGODDS logodds;  char *seq1, *seq2;  char *pa;  i = 1;  argv[0] = "dpalign";  DO_STANDARD_COMMAND_LINE(2,    USAGE(<seq1> <seq2> [options]);    USAGE(\n\t<seq1>\t\tfirst sequence to align);    USAGE(\t<seq2>\t\tsecond sequence to align);    NON_SWITCH(1, \r,      switch (i++) {        case 1: seq1 = _OPTION_; break;        case 2: seq2 = _OPTION_; break;        default: COMMAND_LINE_ERROR;      }    );    DATA_OPTN(1, wg, <wg>, \tgap cost, wg = atof(_OPTION_));    DATA_OPTN(1, ws, <ws>, \tspace cost, ws = atof(_OPTION_));    DATA_OPTN(1, m, <ws>, \tmatch score, m = atof(_OPTION_));    FLAG_OPTN(1, eg, \tpenalize end gaps, endgaps = TRUE);    USAGE(\tGlobally align two sequences.);    USAGE(\n\tCopyright);    USAGE(\t(1999) The Regents of the University of California);    USAGE(\tAll Rights Reserved.);    USAGE(\tAuthor: Timothy L. Bailey);  );  /*    Set up the hashing functions for mapping between letters or codons and    alphabet positions.  */  setup_hash_alph(PROTEINB);                    /* PROTEINB to position hash */  alen = strlen(alphabet);  /* convert seq1 to logodds matrix */  w = strlen(seq1);  create_2array(logodds, LOGODDSB, w, alen);  for (i=0; i<w; i++) {    for (j=0; j<alen; j++) {      logodds(i,j) = hash(seq1[i]) == j ? m : -m;     }  }  /* convert seq2 to integer encoding */  length = strlen(seq2);  Resize(eseq, length, char);  for (i=0; i<length; i++) eseq[i] = hash(seq2[i]);  /* align the sequences */  pa = dp_align(alen, w, logodds, seq1, length, eseq, wg, ws, endgaps);} /* main */#endif
dpalign.c - 源码说明

本页面展示了「EM算法的改进」中的 dpalign.c 源码文件，采用 C语言编程语言编写，共 599 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与法的改进相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?