📄 histogram.c
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* histogram.c * SRE, Sat Jan 20 16:16:17 1996 * * Accumulation, printing, and fitting of score histograms * from database searches. * * RCS $Id: histogram.c,v 1.17 1999/12/04 22:42:28 eddy Exp $ ************************************************************ * Basic API: * * struct histogram_s *h; * * h = AllocHistogram(min_hint, max_hint, lumpsize); * * while (getting scores x) AddToHistogram(h, x); * * ExtremeValueFitHistogram(h, high_hint); * PrintASCIIHistogram(fp, h); * FreeHistogram(h); */#include <stdio.h>#include <string.h>#include <limits.h>#include <float.h>#include <math.h>#include "squid.h"#include "config.h"#include "structs.h"#include "funcs.h"#ifdef MEMDEBUG#include "dbmalloc.h"#endif/* Function: AllocHistogram() * * Purpose: Allocate and return a histogram structure. * min and max are your best guess. They need * not be absolutely correct; the histogram * will expand dynamically to accomodate scores * that exceed these suggested bounds. The amount * that the histogram grows by is set by "lumpsize". * * Args: min: minimum score (integer) * max: maximum score (integer) * lumpsize: when reallocating histogram, pad the reallocation * by this much (saves excessive reallocation) */struct histogram_s *AllocHistogram(int min, int max, int lumpsize){ struct histogram_s *h; int newsize; int i; newsize = max - min + 1; h = (struct histogram_s *) MallocOrDie(sizeof(struct histogram_s)); h->min = min; h->max = max; h->total = 0; h->lowscore = INT_MAX; h->highscore = INT_MIN; h->lumpsize = lumpsize; h->histogram = (int *) MallocOrDie (sizeof(int) * newsize); for (i = 0; i < newsize; i++) h->histogram[i] = 0; h->expect = NULL; h->fit_type = HISTFIT_NONE; return h;}/* Function: FreeHistogram() * * Purpose: free a histogram structure. */voidFreeHistogram(struct histogram_s *h){ free(h->histogram); if (h->expect != NULL) free(h->expect); free(h);} /* Function: UnfitHistogram() * * Purpose: Free only the theoretical fit part of a histogram. */voidUnfitHistogram(struct histogram_s *h){ if (h->expect != NULL) free(h->expect); h->expect = NULL; h->fit_type = HISTFIT_NONE;}/* Function: AddToHistogram() * * Purpose: Bump the appropriate counter in a histogram * structure, given a score. The score is * rounded off from float precision to the * next lower integer. */voidAddToHistogram(struct histogram_s *h, float sc){ int score; int moveby; int prevsize; int newsize; int i; /* Adding to a histogram conflicts with existing fit: * prohibit this. */ if (h->fit_type != HISTFIT_NONE) Die("AddToHistogram(): Can't add to a fitted histogram\n"); /* histogram bins are defined as: score >= bin value, < bin+1 * -1.9 -> -2 -0.4 -> -1 1.9 -> 1 * -2.1 -> -3 0.4 -> 0 2.1 -> 2 */ score = (int) floor(sc); /* Check to see if we must reallocate the histogram. */ if (score < h->min) { prevsize = h->max - h->min + 1; moveby = (h->min - score) + h->lumpsize; newsize = prevsize + moveby; h->min -= moveby; h->histogram = (int *) ReallocOrDie(h->histogram, sizeof(int) * newsize); memmove(h->histogram+moveby, h->histogram, sizeof(int) * prevsize); for (i = 0; i < moveby; i++) h->histogram[i] = 0; } else if (score > h->max) { prevsize = h->max - h->min + 1; h->max = h->lumpsize + score; newsize = h->max - h->min + 1; h->histogram = (int *) ReallocOrDie(h->histogram, sizeof(int) * newsize); for (i = prevsize; i < newsize; i++) h->histogram[i] = 0; } /* Bump the correct bin. * The bin number is score - h->min */ h->histogram[score - h->min]++; h->total++; if (score < h->lowscore) h->lowscore = score; if (score > h->highscore) h->highscore = score; SQD_DPRINTF3(("AddToHistogram(): added %.1f; rounded to %d; in bin %d (%d-%d)\n", sc, score, score-h->min, h->min, h->max)); return;}/* Function: PrintASCIIHistogram() * * Purpose: Print a "prettified" histogram to a file pointer. * Deliberately a look-and-feel clone of Bill Pearson's * excellent FASTA output. * * Args: fp - open file to print to (stdout works) * h - histogram to print */ voidPrintASCIIHistogram(FILE *fp, struct histogram_s *h){ int units; int maxbar; int num; int i, idx; char buffer[81]; /* output line buffer */ int pos; /* position in output line buffer */ int lowbound, lowcount; /* cutoffs on the low side */ int highbound, highcount; /* cutoffs on the high side */ int emptybins = 3; /* Find out how we'll scale the histogram. * We have 59 characters to play with on a * standard 80-column terminal display: * leading "%5d %6d %6d|" occupies 20 chars. * Save the peak position, we'll use it later. */ maxbar = 0; for (i = h->lowscore - h->min; i <= h->highscore - h->min; i++) if (h->histogram[i] > maxbar) { maxbar = h->histogram[i]; /* max height */ lowbound = i + h->min; /* peak position */ } /* Truncate histogram display on both sides, ad hoc fashion. * Start from the peak; then move out until we see <emptybins> empty bins, * and stop. */ highbound = lowbound; /* start at peak position */ for (num = 0; lowbound > h->lowscore; lowbound--) { i = lowbound - h->min; if (h->histogram[i] > 0) { num = 0; continue; } /* reset */ if (++num == emptybins) { lowbound += emptybins; break; } /* stop */ } for (num = 0; highbound < h->highscore; highbound++) { i = highbound - h->min; if (h->histogram[i] > 0) { num = 0; continue; } /* reset */ if (++num == emptybins) { highbound -= emptybins; break; } /* stop */ } /* collect counts outside of bounds */ for (lowcount = 0, i = h->lowscore - h->min; i <= lowbound - h->min; i++) lowcount += h->histogram[i]; for (highcount = 0, i = h->highscore - h->min; i >= highbound - h->min; i--) highcount += h->histogram[i]; /* maxbar might need raised now; then set our units */ if (lowcount > maxbar) maxbar = lowcount; if (highcount > maxbar) maxbar = highcount; units = ((maxbar-1)/ 59) + 1; /* Print the histogram */ fprintf(fp, "%5s %6s %6s (one = represents %d sequences)\n", "score", "obs", "exp", units); fprintf(fp, "%5s %6s %6s\n", "-----", "---", "---"); buffer[80] = '\0'; buffer[79] = '\n'; for (i = h->lowscore; i <= h->highscore; i++) { memset(buffer, ' ', 79 * sizeof(char)); idx = i - h->min; /* Deal with special cases at edges */ if (i < lowbound) continue; else if (i > highbound) continue; else if (i == lowbound && i != h->lowscore) { sprintf(buffer, "<%4d %6d %6s|", i+1, lowcount, "-"); if (lowcount > 0) { num = 1+(lowcount-1) / units; if (num > 60) Die("oops"); for (pos = 20; num > 0; num--) buffer[pos++] = '='; } fputs(buffer, fp); continue; } else if (i == highbound && i != h->highscore) { sprintf(buffer, ">%4d %6d %6s|", i, highcount, "-"); if (highcount > 0) { num = 1+(highcount-1) / units; for (pos = 20; num > 0; num--) buffer[pos++] = '='; } fputs(buffer, fp); continue; } /* Deal with most cases */ if (h->fit_type != HISTFIT_NONE) sprintf(buffer, "%5d %6d %6d|", i, h->histogram[idx], (int) h->expect[idx]); else sprintf(buffer, "%5d %6d %6s|", i, h->histogram[idx], "-"); buffer[20] = ' '; /* sprintf writes a null char */ /* Mark the histogram bar for observed hits */ if (h->histogram[idx] > 0) { num = 1 + (h->histogram[idx]-1) / units; for (pos = 20; num > 0; num--) buffer[pos++] = '='; } /* Mark the theoretically expected value */ if (h->fit_type != HISTFIT_NONE && (int) h->expect[idx] > 0) { pos = 20 + (int)(h->expect[idx]-1) / units; if (pos >= 78) pos = 78; /* be careful of buffer bounds */ buffer[pos] = '*'; } /* Print the line */ fputs(buffer, fp); } /* Print details about the statistics */ switch (h->fit_type) { case HISTFIT_NONE: fprintf(fp, "\n\n%% No statistical fit available\n"); break; case HISTFIT_EVD: fprintf(fp, "\n\n%% Statistical details of theoretical EVD fit:\n"); fprintf(fp, " mu = %10.4f\n", h->param[EVD_MU]); fprintf(fp, " lambda = %10.4f\n", h->param[EVD_LAMBDA]); fprintf(fp, "chi-sq statistic = %10.4f\n", h->chisq); fprintf(fp, " P(chi-square) = %10.4g\n", h->chip); break; case HISTFIT_GAUSSIAN: fprintf(fp, "\n\n%% Statistical details of theoretical Gaussian fit:\n"); fprintf(fp, " mean = %10.4f\n", h->param[GAUSS_MEAN]); fprintf(fp, " sd = %10.4f\n", h->param[GAUSS_SD]); fprintf(fp, "chi-sq statistic = %10.4f\n", h->chisq); fprintf(fp, " P(chi-square) = %10.4g\n", h->chip); break; } return;} /* Function: PrintXMGRHistogram() * Date: SRE, Wed Nov 12 11:02:00 1997 [St. Louis] * * Purpose: Print an XMGR data file that contains two data sets: * - xy data for the observed histogram * - xy data for the theoretical histogram */voidPrintXMGRHistogram(FILE *fp, struct histogram_s *h){ int sc; /* integer score in histogram structure */ double val; /* First data set is the observed histogram */ for (sc = h->lowscore; sc <= h->highscore; sc++) if (h->histogram[sc - h->min] > 0) fprintf(fp, "%-6d %f\n", sc, (float) h->histogram[sc - h->min]/ (float) h->total); fprintf(fp, "&\n"); /* Second data set is the theoretical histogram */ if (h->fit_type != HISTFIT_NONE) { for (sc = h->lowscore; sc <= h->highscore; sc++) { val = (1. - ExtremeValueP((float)sc+1, h->param[EVD_MU], h->param[EVD_LAMBDA]))- (1. - ExtremeValueP((float)sc, h->param[EVD_MU], h->param[EVD_LAMBDA])); fprintf(fp, "%-6d %f\n", sc, val); } fprintf(fp, "&\n"); }}/* Function: PrintXMGRDistribution() * Date: SRE, Wed Nov 12 11:02:09 1997 [St. Louis] * * Purpose: Print an XMGR data file that contains two data sets: * - xy data for the observed distribution P(S<x) * - xy data for the theoretical distribution P(S<x) */voidPrintXMGRDistribution(FILE *fp, struct histogram_s *h){ int sc; /* integer score in histogram structure */ int cum; /* cumulative count */ double val; /* First data set is the observed distribution; * histogram bin x contains # of scores between x and x+1, * hence the sc+1 offset. */ for (cum = 0, sc = h->lowscore; sc <= h->highscore; sc++) { cum += h->histogram[sc - h->min]; fprintf(fp, "%-6d %f\n", sc + 1, (float) cum / (float) h->total); } fprintf(fp, "&\n"); /* Second data set is the theoretical histogram */ if (h->fit_type != HISTFIT_NONE) { for (sc = h->lowscore; sc <= h->highscore; sc++) { val = (1. - ExtremeValueP((float) sc, h->param[EVD_MU], h->param[EVD_LAMBDA])); fprintf(fp, "%-6d %f\n", sc, val); } fprintf(fp, "&\n"); }}/* Function: PrintXMGRRegressionLine() * Date: SRE, Wed Nov 12 11:02:19 1997 [St. Louis] * * Purpose: Print an XMGR data file that contains two data sets: * - xy data for log log transform of observed distribution P(S<x) * - xy data for log log transform of theoretical distribution P(S<x) */voidPrintXMGRRegressionLine(FILE *fp, struct histogram_s *h){ int sc; /* integer score in histogram structure */ int cum; double val; /* log log transform */ /* First data set is the observed distribution; * histogram bin x contains # of scores between x and x+1, * hence the sc+1 offset. */ for (cum = 0, sc = h->lowscore; sc <= h->highscore; sc++) { cum += h->histogram[sc - h->min]; val = log (-1. * log((double) cum / (double) h->total)); if (cum < h->total) fprintf(fp, "%-6d %f\n", sc + 1, val); } fprintf(fp, "&\n"); /* Second data set is the theoretical histogram */ if (h->fit_type != HISTFIT_NONE) { for (sc = h->lowscore; sc <= h->highscore; sc++) { val = log(-1. * log(1. - ExtremeValueP((float) sc, h->param[EVD_MU], h->param[EVD_LAMBDA]))); fprintf(fp, "%-6d %f\n", sc, val); } fprintf(fp, "&\n"); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -