mast.c

来自「EM算法的改进」· C语言 代码 · 共 1,794 行 · 第 1/5 页

C
1,794
字号
      fs1, fs2, m_thresh, w_thresh, fs3, fs4);    printf("%s\n\n", stars);  } /* doc */  /*    print table of diagrams headings and data  */  {    char *st1 = stype==Separate ? "STRAND" : "";	/* strand heading */    char *st2 = stype==Separate ? "------" : "";	/* strand underline */    int nl = (*st1=='\0') ? MMSN : MMSN-4;		/* length of name */    char *f = "%-*s%s %8s  %s\n";			/* format */    printf(f, nl, "SEQUENCE NAME", st1, "E-VALUE ", "MOTIF DIAGRAM");    printf(f, nl, "-------------", st2, "--------", "-------------");    copy_file(diag_file, stdout);    printf("\n%s\n\n", stars);  }  /*    print table of annotated sequences documentation and data  */  if (note_file) {    printf("\n\n%s\n", stars);    printf("SECTION III: ANNOTATED SEQUENCES\n");    printf("%s\n", stars);    if (doc) {      char *fs1 = xlate_dna ? "/frame" : "";		/* frame string 3 */      char *fs2 = stype==Combine ?           " (a minus sign indicates that\n\t  the occurrence is on the reverse complement strand)" :           "";      char *fs3 = xlate_dna && stype==Combine ?           " (or its reverse)" :         stype==Combine ?           " (or its reverse complement)" :           "";      char *fs4 = xlate_dna ? "," : ", and";		/* frame string 4 */      char *fs5 = xlate_dna && stype==Combine ? 	  ", and\n\t   o the protein translation of the match (or its reverse).\n" :          xlate_dna ? 	  ", and\n\t   o the protein translation of the match.\n" :          ".\n";      char *ptype = use_seq_p ? "SEQUENCE" : "POSITION";      printf("\t- The positions and p-values of the non-overlapping motif occurrences\n""\t  are shown above the actual sequence for each of the high-scoring\n""\t  sequences from Section I.\n""\t- A motif occurrence is defined as a position in the sequence whose\n""\t  match to the motif has %s p-value less than %g as \n""\t  defined in Section II.\n""\t- For each sequence, the first line specifies the name of the sequence.\n""\t- The second (and possibly more) lines give a description of the \n""\t  sequence.\n""\t- Following the description line(s) is a line giving the length, \n""\t  combined p-value, and E-value of the sequence as defined in Section I.\n""\t- The next line reproduces the motif diagram from Section II.\n""\t- The entire sequence is printed on the following lines.\n""\t- Motif occurrences are indicated directly above their positions in the\n""\t  sequence on lines showing\n""\t   o the motif number%s of the occurrence%s,\n""\t   o the position p-value of the occurrence,\n""\t   o the best possible match to the motif%s%s\n""\t   o columns whose match to the motif has a positive score (indicated \n""\t     by a plus sign)%s",      ptype, w_thresh, fs1, fs2, fs3, fs4, fs5);      printf("%s\n", stars);    } /* doc */    copy_file(note_file, stdout);    printf("\n%s\n\n", stars);  } /* annotation table */  /* display elapsed time */  fflush(stdout);  system("echo ''; echo CPU: `hostname`;");  printf("Time %f secs.\n\n", myclock()/1E6);} /* print_mast_results *//**********************************************************************//*	make_mast_tables	Make the three MAST tables in temporary files.	Returns the number of sequences less than e_max.*//**********************************************************************/static int make_mast_tables(  BOOLEAN dna,				/* database is DNA */  STYPE stype,				/* handling of different strands */  BOOLEAN xlate_dna,			/* database is DNA and motifs protein */  BOOLEAN use_seq_comp,			/* compensate for seq composition */  BOOLEAN best_motifs, 			/* diagrams have only best motif */  int kmotifs,	 			/* number of known motifs */  MOTIF motif[],			/* data returned by read_motifs */  char *motif_file,			/* motif file name */  FILE *fdata,				/* the database file */  int nseqs,				/* number of sequences scored */  SORT_SEQ *seqs,			/* sortable array of seq/score records*/  int n_hits,				/* size of seqs array */  FILE* hit_file,			/* table of hits */  FILE* diag_file,			/* table of motif diagrams */  FILE* note_file,			/* table of annotated sequences */  int rank,				/* rank of first result to print */  int smax,				/* maximum sequences to print*/  double e_max,				/* maximum E-value to print */  double m_thresh,			/* maximum motif p-value to print */  double w_thresh,	 		/* maximum motif p-value-- weak hits */  BOOLEAN use_seq_p,			/* use sequence not position p-values */  BOOLEAN hit_list,                     /* create hit list instead of diagram */  LO *los[],				/* array of pointers to lo matrices */  int nmotifs 				/* number motifs read */ ){  int i;  int seqno; 				/* seq. number in sorted sequences */  long length;				/* length of sample */  char *sample_name;			/* name of sample */  char *sequence;			/* sequence of sample */  char *id;				/* identifier text for sample */  int pseqs = 0;			/* number of sequences printed */  if (hit_list) {    printf("# All non-overlapping hits in all sequences with E-values <= %g.\n", e_max);    printf("# sequence_name motif hit_start hit_end hit_p-value\n");  }  for (seqno=0; seqno < n_hits; seqno++) {    SORT_SEQ *seq = seqs + seqno;		/* next seq */    TILING tiling;				/* tiling of sequence */    int strand = seq->strand;			/* -1 neg, 0 both, +1 pos */    double **pv = seq->pv;			/* pvalue tables */    double pvalue = seq->Pvalue;		/* combined p-value of seq  */    double evalue = nseqs * pvalue;		/* combined E-value of seq  */    char name[1000];				/* name with gi|...| removed */    /* done if E-value above threshold; rest have larger p-values */    if (evalue > e_max) break;    /* done if maximum number of sequences reached */    if (smax && pseqs >= smax) break;    /* skip if rank not reached */    if (seqno+1 < rank) continue;    pseqs++;    fseek(fdata, seq->fp, 0);			/* go to start of sequence */    read_sequence(fdata, &sample_name, &id, &sequence, &length);    strncpy(name, sample_name, 999);    /*      convert to reverse complement if negative strand of DNA    */    if (strand == -1) invcomp_dna(sequence, length);    /*       score, tile and diagram the sequence with each of the motifs    */    tiling = score_tile_diagram(sequence, length, los, nmotifs, dna, stype,       xlate_dna, best_motifs, FALSE, pv, m_thresh, w_thresh, use_seq_p, hit_list, name);    /*       Print the hit in the hits section.     */    if (hit_file != NULL) {      char *kp = !kmotifs ? "" : (seq->known_pos ? "+ " : "- ");  /* +/- */      int nl = !kmotifs ? MMSN : MMSN - 2;		/* length of name */      char *frame = xlate_dna ? best_frame(nmotifs, length, tiling) : "";      char *st = stype==Separate ? (strand==-1 ? " -" : " +") : "";       int il = (*st=='\0') ? MAXID : MAXID-2;		/* length of id */      char *elipsis = strlen(id)>il ? "... " : "   ";      if (*frame!='\0') il -= 2;      fprintf(hit_file, "%s%-*.*s %-*.*s%3s%s%s  %8.2g %6ld\n", kp, nl, nl,         name, il, il, id, elipsis, st, frame, evalue, length);    } /* print hit */    /*       Print the motif diagram in the diagrams section.    */    if (diag_file != NULL) {      char hdr[80];						/* header */      char *st = stype!=Separate ? "" : (strand==-1 ? " -" : " +");       sprintf(hdr, "%-*.*s%s %8.2g  ", MMSN, MMSN, name, st, evalue);      print_diagram(tiling.diagram, hdr, diag_file);    }    /*       Print the hit list straight to standard output.    */    if (hit_list) {			/* print hits straight to stdout */      if (tiling.diagram) printf("%s", tiling.diagram);      //print_diagram(tiling.diagram, "", stdout);    }    /*       Print annotated sequence in the annotation section.    */    if (note_file) print_annotation(dna, stype, xlate_dna, m_thresh, note_file,      los, sequence, length, sample_name, id, strand, pvalue, evalue, tiling);    /*       free space     */    myfree(sample_name);    myfree(id);    myfree(sequence);    free_tiling(tiling);    if (use_seq_comp) {      for (i=0; i<nmotifs; i++) myfree(seq->pv[i]);      myfree(seq->pv);    }  } /* sequence */  return(pseqs);} /* make_mast_tables *//**********************************************************************//*	get_scores	Calculate the score and p-value for each sequence in the	database and sort them by p-value in ascending order. 	Returns sorted array of sequence-score records.*//**********************************************************************/static SORT_SEQ *get_scores(  BOOLEAN dna,		/* database is DNA */  STYPE stype,		/* handling of DNA strands */  BOOLEAN xlate_dna,	/* database is DNA and motifs protein */  double f[],		/* null letter probability distribution */  FILE *fdata,		/* the database */  FILE *fsave,		/* good sequences if database on stdin */  LO *los[],		/* array of pointers to log-odds matrices */  int nmotifs,		/* number of log-odds matrices in los */  int range, 		/* set entries in matrices to [0..range] */  double **pv,		/* p-value tables for each motif */  BOOLEAN sonly,	/* calculate p-value of observed spacings only */  BOOLEAN lump,		/* combine spacings into one p-value */  BOOLEAN use_seq_comp,	/* adjust E-value for actual sequence composition */  double e_max,		/* maximum sequence E-value to print */  int kmotifs,		/* number of known motifs*/  MOTIF motif[],	/* data returned by read_motifs */  BOOLEAN status,	/* show progress */  int min_seqs,		/* lower bound on nseqs */  int *nseqs,		/* total sequences in database */  double *residues,	/* total number of residues in seqs */  int *n_hits		/* sequences less than e_max */){  int i;   int imotif;  long fp = 0;				/* pointer in datafile */  char *sample_name;			/* name of sample */  char *sequence;			/* sequence of sample */  char *id;				/* identifier text for sample */  long length;				/* length of the sequence */  double *best_score = NULL;		/* best score per motif */  int *best_loc = NULL;			/* best location per motif */  SORT_SEQ *seqs = NULL;		/* sortable array of seq/score records*/  double pvalue;			/* expected number of sequences >= x */  SCORE **scores;			/* scores for each motif vs seq. pos. */  int strand = (stype==Separate || stype==Norc) ? 1 : 0;	/* current */  int alen = los[0]->alen;		/* length of motif alphabet */  BOOLEAN saved = FALSE;		/* sequence already saved */  *n_hits = *nseqs = *residues = 0;	/* actual and saved sequences */  while (1) {    /*       read next sequence unless doing - strand; break on EOF or error     */    if (strand != -1) {					/* not - strand */      fp = fsave ? ftell(fsave) : ftell(fdata);		/* save seq position */      if (!read_sequence(fdata, &sample_name, &id, &sequence, &length)) break;      saved = FALSE;					/* new sequence */    }    /*      create space for best scores and locations per motif unless they exist    */    if (!best_score) Resize(best_score, nmotifs, double);    if (!best_loc) Resize(best_loc, nmotifs, int);    /*       update size of database     */    if (stype==Separate && strand==1) {	/* treat each DNA sequence as two */      (*nseqs)+=2;			/* number of sequences in database */      *residues += 2*length;		/* number of residues in database */    } else if (stype!=Separate) {	/* treat each sequence as one */      (*nseqs)++;			/* number of sequences in database */      *residues += length;		/* number of residues in database */    }    /*      convert DNA to negative strand if just did the positive strand    */    if (strand == -1) invcomp_dna(sequence, length);    /*       compute the motif scores    */    scores = score_sequence(stype, xlate_dna, sequence, length, nmotifs, los);    /*      find best scoring position for each motif for the sequence    */    for (imotif=0; imotif<nmotifs; imotif++) {	/* motif */      long seq_pos;      int ws = los[imotif]->ws;			/* width of motif in sequence */      best_score[imotif] = LITTLE;      best_loc[imotif] = 0;      if (ws > length) continue;		/* skip if motif too long */      for (seq_pos=0; seq_pos<=length-ws; seq_pos++) {	/* position in sequence */	double s = scores[imotif][seq_pos].score;	if (s > best_score[imotif]) {	  best_score[imotif] = s;	  best_loc[imotif] = seq_pos;	}      } /* position */    } /* motif */    free_2array(scores, nmotifs);		/* free scores array */    /*       calculate the combined p-value for this sequence    */    pvalue = calc_p_value(sample_name, length, stype, nmotifs, los, best_score,       best_loc, range, pv, sonly, lump, norder, debug);    /*       save the sequence if its E-value may be under threshold     */    if (pvalue*(MAX(min_seqs, *nseqs)) < e_max) {       SORT_SEQ *seq;      /*         create sequence record for sorting       */      if (*n_hits % RCHUNK == 0) Resize(seqs, *n_hits+RCHUNK, SORT_SEQ);      seq = &seqs[(*n_hits)++];		/* record for sorting */      seq->id = sample_name;		/* name of sequence */      seq->fp = fp;			/* start of sequence record */      seq->length = length;		/* length of sequence */      seq->Pvalue = pvalue;		/* p-value of product of p-values */      seq->strand = strand;		/* strand of DNA or 0 if both */      seq->score = best_score;		/* best score per motif */      seq->loc = best_loc;		/* locations of best scores */      seq->comp = use_seq_comp ? get_seq_comp(xlate_dna, sequence, alen) : NULL;      seq->pv = pv;			/* pvalue tables for each motif */      best_score = NULL;      best_loc = NULL;			/* don't reuse the space! */      /*        write the sequence to the fsave file if reading standard input      */      if (fdata == stdin && !saved) {        fprintf(fsave, ">%s %s\n%s\n", sample_name, id, sequence);        saved = TRUE;			/* don't save this seq again */      }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?