mast-util.c

来自「EM算法的改进」· C语言代码 · 共 815 行 · 第 1/2 页
815 行
    char *h = ((j==0) ? hdr : " ");		/* current header */    dlen = PAGEWIDTH - hlen - 6;    if (remain <= PAGEWIDTH - hlen) dlen = remain;    fprintf(file, "%-*.*s%.*s", hlen, hlen, h, dlen, dia+j);    j += dlen;    /* continue printing until a good breaking point */    while (j < dia_len && dia[j-1] != '_' && dia[j-1] != ',') putc(dia[j++], file);    putc('\n', file);  }} /* print_diagram *//**********************************************************************//*	score_it    	Score the sequence with each motif.	Returns the array of scores:		scores[i][j].score = score of motif i at position j		scores[i][j].ic = score on - strand (set to FALSE here)*//**********************************************************************/static SCORE **score_it(  BOOLEAN xlate_dna,	/* database is DNA and motifs protein */  LO *los[], 		/* array of pointers to log-odds matrices */  int nmotifs, 		/* number of motifs */  char *sequence,	/* sequence */  long length 		/* length of the sequence */){  int imotif;  long j, k;  SCORE **scores = NULL;	/* scores[i][j].score motif i at offset j */  int *hash_seq = NULL;		/* hashed sequence */  /*     Hash sequence to index in alphabet.  */  hash_seq = dhash_it(xlate_dna, los[0]->alen, sequence, length);  /*     Create scores array.   */  Resize(scores, nmotifs, SCORE *);  /*     Score the sequence with each motif.   */  for (imotif=0; imotif<nmotifs; imotif++) {	/* motif */    LO *lo = los[imotif];		/* array of motifs */    LOGODDS2 logodds2 = lo->logodds2;	/* motif matrix */    int r = (lo->w+1)/2;		/* number of rows in the matrix */    int ws = lo->ws;			/* width the motif in the sequence */    int inc1 = 1;			/* increment to next site */    int inc2 = 2*(xlate_dna ? 3 : 1);	/* increment to next hashed column */    /* create array of scores unless sequence shorter than motif */    scores[imotif] = NULL;    if (ws > length) {      continue;				/* skip this motif */    } else {      Resize(scores[imotif], length, SCORE);    }    /*       Score each subsequence with the current motif.    */    for (j=0; j<=length-ws; j+=inc1) {	/* position */      int *h;				/* pointer in hash_seq */      int score = 0;			/* subsequence score */      /* motif score (positive motif) */      for (k=0, h=hash_seq+j; k<r; k++, h+=inc2) score += logodds2(k, *h);      /* 3-class score */      if (lo->pair) {        int neg = 0;			/* subsequence score, neg motif */        double score3;			/* 3-class bit score */        for (k=r, h=hash_seq+j; k<2*r; k++, h+=inc2) neg += logodds2(k, *h);        score3 = score3class(score, neg, lo);        score = bit_to_scaled(score3, 1, lo->scale3, lo->offset3);      }      scores[imotif][j].score = score;      scores[imotif][j].ic = FALSE;    } /* position */  } /* imotif */  myfree(hash_seq);  return scores;} /* score_it *//**********************************************************************//*	best_hit	Determine whether a given motif occurrence has the lowest p-value	for that motif.*//**********************************************************************/static BOOLEAN best_hit(  int index, 		/* Position of the given motif. */  int motif,		/* Motif number. */  int *hits,		/* Motif indices. */  double *pvalues,	/* Array of pvalues. */  long length		/* Length of pvalue array. */){  long i;  double lowest_pvalue = 1.0;  int best_index = 0;  for (i = 0; i < length; i++) {    int m = abs(hits[i])-1;			/* motif of hit */    if ((m == motif) && (pvalues[i] < lowest_pvalue)) {      lowest_pvalue = pvalues[i];      best_index = i;    }  }  return (best_index == index);} /* best_hit */ /**********************************************************************//*	make_block	Create a block string:		[smf(p)] or <smf(p)>			s 	strand (optional)			m 	motif			f 	frame (optional)			(p)	p-value (optional)	*//**********************************************************************/extern void make_block(  int m,			/* motif number */  char *strand,			/* strand */  int f,			/* frame number; f=0 not translating DNA */  double thresh,		/* strong motif threshold */  double p,			/* p-value */  BOOLEAN print_p,		/* print p-value in block if TRUE */  char *block			/* put block string here */)	{  char left = p < thresh ? '[' : '<';  char right = p < thresh ? ']' : '>';  BOOLEAN xlate_dna = (f != 0);  char *fnames = "abc";					/* frame 1=a, 2=b, 3=c*/  if (print_p) {					/* print p-value */    char *bfmt = f ? "%c%s%d%c(%8.2e)%c" : "%c%s%d(%8.2e)%c";    if (xlate_dna) {					/* str., motif, frame */      sprintf(block, bfmt, left, strand, m, fnames[f-1], p, right);    } else {						/* strand, motif */      sprintf(block, bfmt, left, strand, m, p, right);    }  } else {						/* don't print p-value */    char *bfmt = f ? "%c%s%d%c%c" : "%c%s%d%c";    if (xlate_dna) {					/* str., motif, frame */      sprintf(block, bfmt, left, strand, m, fnames[f-1], right);    } else {						/* strand, motif */      sprintf(block, bfmt, left, strand, m, right);    }  }} /* make_block *//**********************************************************************//*	score_tile_diagram	Score a sequence, tile it with motif occurrences and create	a block diagram string.	Returns the tiling containing the hits, their position p-values,	and the p-value of the product of p-values for the best hits.*//**********************************************************************/extern TILING score_tile_diagram(  char *sequence,			/* sequence to score and tile */  long length,				/* length of sequence */  LO *los[],				/* array of pointers to lo matrices */  int nmotifs,				/* number motifs read */  BOOLEAN dna,                          /* database is DNA */  STYPE stype,				/* handling of different strands */  BOOLEAN xlate_dna,			/* database is DNA and motifs protein */  BOOLEAN best_motifs,                  /* show only best motifs in diagrams */  BOOLEAN print_p,			/* print p-value in block */  double **pv,				/* p-value tables for each motif */  double m_thresh,			/* maximum motif p-value to print */  double w_thresh,			/* max. motif p-value for weak hits */  BOOLEAN use_seq_p, 			/* use sequence not position p-values */  BOOLEAN hit_list, 			/* create hit list instead of diagram */  char *name				/* name of sequence; only used if hit_list=TRUE */){  SCORE **scores;			/* scores for each motif vs seq. pos. */  TILING tiling;			/* tiling and diagram of sequence */  /*    score the sequence with each of the motifs  */  scores = score_sequence(stype, xlate_dna, sequence, length, nmotifs, los);  /*    mark the non-overlapping motif positions  */  tiling = tile_sequence(stype, pv, w_thresh, use_seq_p, los, nmotifs, length,    scores);  /*    add the block diagram to the tiling structure  */  tiling.diagram = create_diagram(dna, stype, xlate_dna, best_motifs, print_p,    m_thresh, nmotifs, los, length, hit_list, name, tiling);  free_2array(scores, nmotifs);  return(tiling);} /* score_tile_diagram *//**********************************************************************//*	qfast		Calculate the p-value of the product of uniform [0,1] random	variables.*//**********************************************************************/extern double qfast(  int n,			/* number of random variables in product */  double k			/* product of random variables */){  int i = 1;  double mlnk, term, phi;   if (n == 0) return 1.0;	/* worst possible p-value */  if (k == 0) return 0.0;	/* p-value is 0 */  mlnk = -log(k);   phi = term = k;  for (i=1; i<n; i++) {    term *= mlnk/i;    phi += term;  }  return phi;} /* qfast *//**********************************************************************//*	init_mast_background	Determine the BLAST alphabet corresponding to the input alphabet.	Set up the hashing functions for the alphabet(s).	Read in the background frequencies for letters (or use	non-redundant database frequencies if no file name given).	The background frequency file should contain frequencies	for the DNA0 or PROTEIN0 alphabet in the format required	by read_markov model.	Sets		blast_alphabet :	the corresponding BLAST alphabet		p :			permutation/substitution matrix					from alphabet to blast_alphabet	Returns the background frequencies for the BLAST alphabet.*//**********************************************************************/extern double *init_mast_background(  char *bfile,				/* name of background file */  char *alphabet,			/* motif alphabet */  STYPE stype,				/* handling of DNA strands */  BOOLEAN translate_dna,		/* DNA sequences and protein motifs */  char **blast_alphabet,		/* corresponding BLAST alphabet */  int *p[MAXASCII]			/* permutation/substitution matrix */){  int i;  int alen;  double *back=NULL, *tmp;  char *bfile_alpha;				/* alphabet in bfile */  int order;					/* order of background */  /* average reverse complement frequencies */  BOOLEAN rc = (stype==Separate || stype==Combine);	  /*     Set up the hashing functions for mapping between letters or codons and    alphabet positions.  */  setup_hash_alph(DNAB);			/* DNAB to position hashing */  setup_hash_alph(PROTEINB);			/* PROTEINB to position hash */  /*     Determine the alphabet used by the motifs.    Make a permutation/substitution mapping from that alphabet to one    of the BLAST alphabets.  */  *blast_alphabet = get_blast_alphabet(alphabet, p);  alen = strlen(*blast_alphabet);  /*    Set up to translate DNAB to PROTEINB if requested.  */  if (translate_dna) {    if (strcmp(*blast_alphabet, PROTEINB)) {	/* motifs must be protein */      fprintf(stderr, "\nThe -dna switch requires protein motifs.  ");      fprintf(stderr, "Your motif(s) use a DNA alphabet.\n");      exit(1);    }    setup_hash_dnab2protb();			/* DNAB to PROTEINB hashing */  }  /*     Determine the type of motif alphabet and set it as current  */  if (!strcmp(*blast_alphabet, DNAB)) {	/* searching DNA with DNA motifs */    tmp =  ntfreq;    bfile_alpha = DNA0;    setalph(0);				/* DNAB alphabet */  } else {    if (translate_dna) {		/* searching DNA with protein motifs */      tmp = frame0;    } else {				/* searching protein with protein mtfs*/      tmp = nrfreq;      rc = FALSE;			/* no reverse complement! */    }    bfile_alpha = PROTEIN0;    setalph(1);				/* PROTEINB alphabet */  }  /*    Read in the background probabilities from a Markov model file     using the basic alphabet.  Convert to the BLAST alphabet.  */  Resize(back, alen, double);			/* create results array */  if (bfile) {                                  /* use bfile frequencies */    int alenb = strlen(bfile_alpha);		/* length of '0' alphabet */    tmp = read_markov_model(bfile, NULL, bfile_alpha, FALSE, rc, &order);    for (i=0; i<alen; i++) back[i] = 0;    for (i=0; i<alenb; i++) back[hash(bfile_alpha[i])] = tmp[i];  } else {    for (i=0; i<alen; i++) back[i] = tmp[i];  }  return(back);} /* init_mast_background *//**********************************************************************//*	free_tiling	Free a tiling structure.*//**********************************************************************/extern void free_tiling(  TILING tiling) {    myfree(tiling.pvalues);    myfree(tiling.diagram);} /* free_tiling *//**********************************************************************//*	get_seq_comp	Get the letter frequencies in a sequence.	Doesn't take into account reverse complement.*//**********************************************************************/double *get_seq_comp(  BOOLEAN xlate_dna,			/* translate DNA */  char *sequence,			/* ASCII sequence */  int alen				/* length of alphabet */){  int i, n;  double *freq = NULL;  /* create the frequency array */   Resize(freq, alen, double);  for (i=0; i<alen; i++) freq[i] = 0;  /*for (i=0; i<alen; i++) freq[i] = 1;*/		/* add one prior */  /* count the number of letters of each type */  for (n=0; sequence[n]; n++) {    i = chash(xlate_dna, FALSE, sequence+n);	/* hash letter */    freq[i]++;  }  /* convert counts to frequencies */  for (i=0; i<alen; i++) freq[i] = n ? freq[i]/n : 1.0;  /*for (i=0; i<alen; i++) freq[i] = n ? freq[i]/(n+alength) : 1.0;*/ 	/* prior */  /* return the frequency matrix */  return(freq);} /*  get_seq_comp */
mast-util.c - 源码说明

本页面展示了「EM算法的改进」中的 mast-util.c 源码文件，采用 C语言编程语言编写，共 815 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与法的改进相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?