mast.c

来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页

C
1,794
字号
      /* 	set flag if sequence is known positive and doing ROC       */      if (kmotifs) {	if (hash_lookup(sample_name, 1, motif[0].ht)) {	  seq->known_pos = TRUE;	} else if (hash_lookup(sample_name, 0, motif[0].ht)) {	 /* skip sequences labeled col=0 in .tag file; "?/P" in Prosite */	  (*n_hits)--;	} else {	  seq->known_pos = FALSE;	}      } /* ROC */    } /* hit */    /*       print progress report    */    if (status && *nseqs % 100 == 0)       fprintf(stderr, "\rsequences: %6d ", *nseqs);    /*      flip strand flag    */    if (stype==Separate) strand *= -1;    /*      free up space     */    if (strand != -1) {				/* don't free if first strand */      if (!kmotifs) myfree(sample_name);	/* don't free if doing ROC */      myfree(sequence);      myfree(id);	    }  } /* read_sequence */  /*    recalculate the p-values based on actual sequence composition  */  for (i=0; use_seq_comp && i < (*n_hits); i++) {    int j;    double **pv = NULL;    long length = seqs[i].length;    SORT_SEQ *seq = seqs + i;     double *best_score = seq->score;    int *best_loc = seq->loc;    /* skip if E-value too big */    if (seq->Pvalue * (*nseqs) > e_max) continue;	    /*      calculate p-values of all integer score values in range [0...w*RANGE]     */    Resize(pv, nmotifs, double *);    for (j=0; j<nmotifs; j++) {      /* pv[j] = calc_cdf(los[j], range, seq->comp);*/      pv[j] = calc_pssm_cdf(los[j]->w, los[j]->alen, range, los[j]->logodds, seq->comp);    }    /*       recalculate the combined p-value for this sequence using the actual      sequence composition as the background model    */    seq->Pvalue = calc_p_value(NULL, length, stype, nmotifs, los, best_score,       best_loc, range, pv, sonly, lump, norder, debug);#ifdef DEBUG    if (isnan(seq->Pvalue)) {      fprintf(stderr, "nan: i %d id %s pvalue %f\n", i, seq->id, seq->Pvalue);       for (j=0; j<alen; j++) fprintf(stderr, "%c %f\n", unhash(j), seq->comp[j]);      abort();    }#endif    /*       print progress report    */    if (status && i && i % 100 == 0)       fprintf(stderr, "\rrecalc p-value sequences: %6d ", i);    /*       free pv space if E-value too big, otherwise      save composition-based pv distribution    */    if (seq->Pvalue * (*nseqs) > e_max) {      for (j=0; j<nmotifs; j++) myfree(pv[j]);      myfree(pv);    } else {      seq->pv = pv;    }  } /* recalculate p-values */  /*     sort the sequences by p-value ascending order   */  qsort((char *)seqs, *n_hits, (int)sizeof(SORT_SEQ), p_compare);  /*     bail if no sequences were read succesfully   */  if (*nseqs == 0) {    fprintf(stderr, "Quitting due to errors or empty database.\n");     exit(1);  }  return seqs;} /* get_scores *//**********************************************************************//*	get_motifs	Read in the log-odds matrices (motifs) from standard input,         remove unused motifs and interpret the ordering and spacing diagram.	Exits if there is an error in the input.	Returns number of motifs in the (compacted) los array.	Updates los as well as globals order and space.	Updates s2b.*//**********************************************************************/static int get_motifs(  BOOLEAN xlate_dna,		/* database is DNA and motifs protein */  char *nlogodds,		/* name of log-odds matrix */  char *alphabet,		/* alphabet of logodds matrices */  char *blast_alphabet,		/* corresponding BLAST alphabet */  int *p[MAXASCII],		/* alphabet permutation/substitution matrix */  BOOLEAN shuffle,		/* shuffle the motif columns */  double min_ev,		/* minimum E-value of motifs to use */  int umotifs, 			/* number motifs to be used */  int mlist[MAXLO],		/* list of motifs given in -m option */  BOOLEAN motifs[MAXLO],	/* list of motifs to output */  LO *los[MAXLO],		/* array of logodds matrices */  double *f,			/* array of null letter probabilities */  int range 			/* set logodds matrices to [0..range] */){  int i, imotif, cnt;  int nmotifs;			/* number of log-odds matrices in los */  /*     read in log-odds matrices   */  nmotifs = read_log_odds(xlate_dna, nlogodds, alphabet, blast_alphabet,    p, range, los, f);  /*    check that at least one motif was read  */  if (nmotifs == 0) {    fprintf(stderr, "No scoring matrices found.\n");    exit(1);   }  /*     clear the list of flags showing motifs to use  */  for (i=0; i<MAXLO; i++) motifs[i] = FALSE;  /*    set flags of motifs to use; check for valid motif numbers   */  if (umotifs > 0) {			/* using specified motifs */    for (i=0; i<umotifs; i++) {      int m = mlist[i];      if (m < 1 || m > nmotifs) {        fprintf(stderr, "Motif %d in -m option not in legal range: 1 to %d.\n",          m, nmotifs);         exit(1);      }      motifs[m-1] = TRUE;    }  } else {				/* using motifs with E-value < min_ev */    for (i=0; i<nmotifs; i++) {      motifs[i] = (los[i]->ev < min_ev);      umotifs++;    }  }  /*    flag motifs with no information  */  for (i=0; i<nmotifs; i++) {    if (motifs[i] && !los[i]->scale) {		/* motif has no info */      motifs[i] = FALSE;      umotifs--;    }  }  /*    check that valid motifs remain  */  if (!umotifs) {    fprintf(stderr, "No scoring matrices contained any information.\n");    exit(1);   }  /*     Parse the ordering and spacing diagram.    Variables norder, order and space are globals defined in diagram.h   */  if (diagram == NULL) {   norder = 0;					/* no diagram */  } else {					/* have a diagram */    BOOLEAN fail = FALSE;			/* ok */    BOOLEAN used[MAXLO];			/* for checking use once */    dptr = 0;					/* diag read pointer */    norder = 0;					/* lnumber of motifs in diag */    for (i=0; i<MAXLO; i++) space[i] = 0;	/* set all spacers to zero */     if (yyparse()) exit(1);			/* parse diagram; sets globals						   norder, order, space */    /* check that no unused motifs are in motif diagram */    for (i=0; i<norder; i++) {      int m = order[i];				/* motif number (plus 1) */      if (m < 1 				/* number too small */          || m > nmotifs			/* number too large */          || !motifs[m-1]			/* motif not being used */      ) {	fprintf(stderr, "Unknown or unused motif %d given in motif diagram.\n",          m);        fail = TRUE;      }    }    /* check that each motif only used once in diagram */    for (i=0; i<MAXLO; i++) used[i] = FALSE;     for (i=0; i<norder; i++) {      int m = order[i];      if (used[m]) {	fprintf(stderr,           "Motif %d used more than once in motif diagram:\n  %s\n", m, diagram);	fail = TRUE;      } else {	used[m] = TRUE;      }    }        /* quit if there was a problem in the diagram */    if (fail) exit(1);    /* change format for spacers to be relative to start of prior motif */    space[0] = -1;				/* ignore first spacer */    for (i=1; i<norder; i++) {      int m = order[i-1];			/* previous motif in diagram  */      space[i] += los[m-1]->w; 			/* internal format is -1 */    }  }    /*     remove any motifs that are not to be used   */  for (imotif=0, cnt=0; imotif<nmotifs; imotif++) {	/* motif */    if (motifs[imotif]) {      los[cnt++] = los[imotif];			/* logodds matrix */    }  } /* motif */  nmotifs = cnt;				/* number of used motifs */  /*     convert spacing diagram to internal motif names   */  for (i=0; i<norder; i++) {    for (imotif=0; imotif < nmotifs; imotif++) {      if (order[i] == los[imotif]->name) break;	/* found motif name */    }    order[i] = imotif;				/* internal name for motif */  }  /* shuffle the columns of each motif matrix if requested */  if (shuffle) shuffle_cols(los, nmotifs);  /* compute the pairwise motif correlations (similarities) */  motif_corr(nmotifs, los);   /* print motif correlations */  if (debug) {    for (i=1; i<nmotifs; i++) {      int j;      printf("SCAL %2d", i+1);      for (j=0; j<i; j++) {        printf(" %8.2f", los[i]->corr[j]);      }      printf("\n");    }  }  /* return number of motifs remaining in (compacted) los array */  return nmotifs;} /* get_motifs *//**********************************************************************//*	print_annotation	Print the annotation section of MAST output.	Format:		<Sequence Description line>+		<length> <combine p-value> <E-value>		<block diagram>		[[<strand>]<motif number>[<frame>]		 <position p-value>		 <best possible match>		 <per-letter positive score markers>		 [protein translation of motif if xlate_dna]		 <sequence>		]+        Hits:		strong 		p-value < thresh 		weak		otherwise*//**********************************************************************/static void print_annotation(  BOOLEAN dna,				/* database is DNA */  STYPE stype,				/* handling of DNA strands */  BOOLEAN xlate_dna,			/* database is DNA and motifs protein */  double thresh,			/* threshold for strong hits */  FILE *note_file,			/* table of annotated sequences */  LO *los[],				/* array of pointers to lo matrices */  char *sequence,			/* sequence of sample */  long length,				/* length of sample */  char *sample_name,			/* name of sample */  char *id,				/* identifier text for sample */  int strand,				/* strand of DNA or 0 if protein */  double pvalue,			/* combined p-value of sequence */  double evalue,			/* combined E-value of sequence */  TILING tiling				/* tiling and diagram of sequence */){  long i, j, k;  int gb;				/* no good break yet */  int nw;				/* width of sequence number */  int lw;				/* amount of seq per line */  char *s;				/* pointer to start of id */  int idlines = 0;			/* number of id lines printed */  char *protalph = PROTEINB;		/* protein alphabet */  char *bstring = NULL;			/* hit motif number string */  char *pstring = NULL;			/* hit p-value string */  char *cstring = NULL;			/* hit consensus string */  char *mstring = NULL;			/* hit match string */  char *xstring = NULL;			/* hit match string */  char *sstr = stype!=Separate ? "" :     (strand==-1 ? " (- strand)" : " (+ strand)");  int inc = (xlate_dna ? 3 : 1);	/* increment in sequence */  /*    Allocate space for annotation strings.  */  Resize(bstring, 2*length, char);  Resize(pstring, 2*length, char);  Resize(cstring, 2*length, char);  Re

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?