📄 histogram.c
字号:
/* Function: EVDBasicFit() * Date: SRE, Wed Nov 12 11:02:27 1997 [St. Louis] * * Purpose: Fit a score histogram to the extreme value * distribution. Set the parameters lambda * and mu in the histogram structure. Fill in the * expected values in the histogram. Calculate * a chi-square test as a measure of goodness of fit. * * This is the basic version of ExtremeValueFitHistogram(), * in a nonrobust form: simple linear regression with no * outlier pruning. * * Methods: Uses a linear regression fitting method [Collins88,Lawless82] * * Args: h - histogram to fit * * Return: (void) */voidEVDBasicFit(struct histogram_s *h){ float *d; /* distribution P(S < x) */ float *x; /* x-axis of P(S<x) for Linefit() */ int hsize; int sum; int sc, idx; /* loop indices for score or score-h->min */ float slope, intercept; /* m,b fit from Linefit() */ float corr; /* correlation coeff of line fit, not used */ float lambda, mu; /* slope, intercept converted to EVD params */ /* Allocations for x, y axes * distribution d runs from min..max with indices 0..max-min * i.e. score - min = index into d, x, histogram, and expect */ hsize = h->highscore - h->lowscore + 1; d = (float *) MallocOrDie(sizeof(float) * hsize); x = (float *) MallocOrDie(sizeof(float) * hsize); for (idx = 0; idx < hsize; idx++) d[idx] = x[idx] = 0.; /* Calculate P(S < x) distribution from histogram. * note off-by-one of sc, because histogram bin contains scores between * x and x+1. */ sum = 0; for (sc = h->lowscore; sc <= h->highscore; sc++) { sum += h->histogram[sc - h->min]; d[sc - h->lowscore] = (float) sum / (float) h->total; x[sc - h->lowscore] = (float) (sc + 1); } /* Do a linear regression fit to the log[-log(P(S<x))] "line". * we have log[-log(1-P(S>x))] = -lambda * x + lambda * mu * so lambda = -m and mu = b/lambda */ /* convert y axis to log[-log(P(S<x))] */ for (sc = h->lowscore; sc < h->highscore; sc++) d[sc - h->lowscore] = log(-1. * log(d[sc - h->lowscore])); /* do the linear regression */ Linefit(x, d, hsize-1, &intercept, &slope, &corr); /* calc mu, lambda */ lambda = -1. * slope; mu = intercept / lambda; /* Set the EVD parameters in the histogram; * pass 2 for additional lost degrees of freedom because we fit mu, lambda. */ ExtremeValueSetHistogram(h, mu, lambda, h->lowscore, h->highscore, 2); free(x); free(d); return;}/* Function: ExtremeValueFitHistogram() * Date: SRE, Sat Nov 15 17:16:15 1997 [St. Louis] * * Purpose: Fit a score histogram to the extreme value * distribution. Set the parameters lambda * and mu in the histogram structure. Calculate * a chi-square test as a measure of goodness of fit. * * Methods: Uses a maximum likelihood method [Lawless82]. * Lower outliers are removed by censoring the data below the peak. * Upper outliers are removed iteratively using method * described by [Mott92]. * * Args: h - histogram to fit * censor - TRUE to censor data left of the peak * high_hint - score cutoff; above this are `real' hits that aren't fit * * Return: 1 if fit is judged to be valid. * else 0 if fit is invalid (too few seqs.) */intExtremeValueFitHistogram(struct histogram_s *h, int censor, float high_hint) { float *x; /* array of EVD samples to fit */ int *y; /* histogram counts */ int n; /* number of observed samples */ int z; /* number of censored samples */ int hsize; /* size of histogram */ float lambda, mu; /* new estimates of lambda, mu */ int sc; /* loop index for score */ int lowbound; /* lower bound of fitted region*/ int highbound; /* upper bound of fitted region*/ int new_highbound; int iteration; /* Determine lower bound on fitted region; * if we're censoring the data, choose the peak of the histogram. * if we're not, then we take the whole histogram. */ lowbound = h->lowscore; if (censor) { int max = -1; for (sc = h->lowscore; sc <= h->highscore; sc++) if (h->histogram[sc - h->min] > max) { max = h->histogram[sc - h->min]; lowbound = sc; } } /* Determine initial upper bound on fitted region. */ highbound = MIN(high_hint, h->highscore); /* Now, iteratively converge on our lambda, mu: */ for (iteration = 0; iteration < 100; iteration++) { /* Construct x, y vectors. */ x = NULL; y = NULL; hsize = highbound - lowbound + 1; if (hsize < 5) goto FITFAILED; /* require at least 5 bins or we don't fit */ x = MallocOrDie(sizeof(float) * hsize); y = MallocOrDie(sizeof(int) * hsize); n = 0; for (sc = lowbound; sc <= highbound; sc++) { x[sc-lowbound] = (float) sc + 0.5; /* crude, but tests OK */ y[sc-lowbound] = h->histogram[sc - h->min]; n += h->histogram[sc - h->min]; } if (n < 100) goto FITFAILED; /* require fitting to at least 100 points */ /* If we're censoring, estimate z, the number of censored guys * left of the bound. Our initial estimate is crudely that we're * missing e^-1 of the total distribution (which would be exact * if we censored exactly at mu; but we censored at the observed peak). * Subsequent estimates are more exact based on our current estimate of mu. */ if (censor) { if (iteration == 0) z = MIN(h->total-n, (int) (0.58198 * (float) n)); else { double psx; psx = EVDDistribution((float) lowbound, mu, lambda); z = MIN(h->total-n, (int) ((double) n * psx / (1. - psx))); } } /* Do an ML fit */ if (censor) { if (! EVDCensoredFit(x, y, hsize, z, (float) lowbound, &mu, &lambda)) goto FITFAILED; } else if (! EVDMaxLikelyFit(x, y, hsize, &mu, &lambda)) goto FITFAILED; /* Find the Eval = 1 point as a new highbound; * the total number of samples estimated to "belong" to the EVD is n+z */ new_highbound = (int) (mu - (log (-1. * log((double) (n+z-1) / (double)(n+z))) / lambda)); free(x); free(y); if (new_highbound >= highbound) break; highbound = new_highbound; } /* Set the histogram parameters; * - we fit from lowbound to highbound; thus we lose 2 degrees of freedom * for fitting mu, lambda, but we get 1 back because we're unnormalized * in this interval, hence we pass 2-1 = 1 as ndegrees. */ ExtremeValueSetHistogram(h, mu, lambda, lowbound, highbound, 1); return 1;FITFAILED: UnfitHistogram(h); if (x != NULL) free(x); if (y != NULL) free(y); return 0;} /* Function: ExtremeValueSetHistogram() * * Purpose: Instead of fitting the histogram to an EVD, * simply set the EVD parameters from an external source. * * Args: h - the histogram to set * mu - mu location parameter * lambda - lambda scale parameter * lowbound - low bound of the histogram that was fit * highbound- high bound of histogram that was fit * ndegrees - extra degrees of freedom to subtract in X^2 test: * typically 0 if mu, lambda are parametric, * else 2 if mu, lambda are estimated from data */voidExtremeValueSetHistogram(struct histogram_s *h, float mu, float lambda, float lowbound, float highbound, int ndegrees){ int sc; int hsize, idx; int nbins; float delta; UnfitHistogram(h); h->fit_type = HISTFIT_EVD; h->param[EVD_LAMBDA] = lambda; h->param[EVD_MU] = mu; hsize = h->max - h->min + 1; h->expect = (float *) MallocOrDie(sizeof(float) * hsize); for (idx = 0; idx < hsize; idx++) h->expect[idx] = 0.; /* Calculate the expected values for the histogram. */ for (sc = h->min; sc <= h->max; sc++) h->expect[sc - h->min] = ExtremeValueE((float)(sc), h->param[EVD_MU], h->param[EVD_LAMBDA], h->total) - ExtremeValueE((float)(sc+1), h->param[EVD_MU], h->param[EVD_LAMBDA], h->total); /* Calculate the goodness-of-fit (within whole region) */ h->chisq = 0.; nbins = 0; for (sc = lowbound; sc <= highbound; sc++) if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5) { delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; h->chisq += delta * delta / h->expect[sc-h->min]; nbins++; } /* Since we fit the whole histogram, there is at least * one constraint on chi-square: the normalization to h->total. */ if (nbins > 1 + ndegrees) h->chip = (float) IncompleteGamma((double)(nbins-1-ndegrees)/2., (double) h->chisq/2.); else h->chip = 0.; }/* Function: GaussianFitHistogram() * * Purpose: Fit a score histogram to a Gaussian distribution. * Set the parameters mean and sd in the histogram * structure, as well as a chi-squared test for * goodness of fit. * * Args: h - histogram to fit * high_hint - score cutoff; above this are `real' hits that aren't fit * * Return: 1 if fit is judged to be valid. * else 0 if fit is invalid (too few seqs.) */intGaussianFitHistogram(struct histogram_s *h, float high_hint){ float sum; float sqsum; float delta; int sc; int nbins; int hsize, idx; /* Clear any previous fitting from the histogram. */ UnfitHistogram(h); /* Determine if we have enough hits to fit the histogram; * arbitrarily require 1000. */ if (h->total < 1000) { h->fit_type = HISTFIT_NONE; return 0; } /* Simplest algorithm for mean and sd; * no outlier detection yet (not even using high_hint) * * Magic 0.5 correction is because our histogram is for * scores between x and x+1; we estimate the expectation * (roughly) as x + 0.5. */ sum = sqsum = 0.; for (sc = h->lowscore; sc <= h->highscore; sc++) { delta = (float) sc + 0.5; sum += (float) h->histogram[sc-h->min] * delta; sqsum += (float) h->histogram[sc-h->min] * delta * delta; } h->fit_type = HISTFIT_GAUSSIAN; h->param[GAUSS_MEAN] = sum / (float) h->total; h->param[GAUSS_SD] = sqrt((sqsum - (sum*sum/(float)h->total)) / (float)(h->total-1)); /* Calculate the expected values for the histogram. * Note that the magic 0.5 correction appears again. * Calculating difference between distribution functions for Gaussian * would be correct but hard. */ hsize = h->max - h->min + 1; h->expect = (float *) MallocOrDie(sizeof(float) * hsize); for (idx = 0; idx < hsize; idx++) h->expect[idx] = 0.; for (sc = h->min; sc <= h->max; sc++) { delta = (float) sc + 0.5 - h->param[GAUSS_MEAN]; h->expect[sc - h->min] = (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) * (exp(-1.* delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD])))); } /* Calculate the goodness-of-fit (within region that was fitted) */ h->chisq = 0.; nbins = 0; for (sc = h->lowscore; sc <= h->highscore; sc++) if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5) { delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; h->chisq += delta * delta / h->expect[sc-h->min]; nbins++; } /* -1 d.f. for normalization; -2 d.f. for two free parameters */ if (nbins > 3) h->chip = (float) IncompleteGamma((double)(nbins-3)/2., (double) h->chisq/2.); else h->chip = 0.; return 1;}/* Function: GaussianSetHistogram() * * Purpose: Instead of fitting the histogram to a Gaussian, * simply set the Gaussian parameters from an external source. */voidGaussianSetHistogram(struct histogram_s *h, float mean, float sd){ int sc; int hsize, idx; int nbins; float delta; UnfitHistogram(h); h->fit_type = HISTFIT_GAUSSIAN; h->param[GAUSS_MEAN] = mean; h->param[GAUSS_SD] = sd; /* Calculate the expected values for the histogram. */ hsize = h->max - h->min + 1; h->expect = (float *) MallocOrDie(sizeof(float) * hsize); for (idx = 0; idx < hsize; idx++) h->expect[idx] = 0.; /* Note: ideally we'd use the Gaussian distribution function * to find the histogram occupancy in the window sc..sc+1. * However, the distribution function is hard to calculate. * Instead, estimate the histogram by taking the density at sc+0.5. */ for (sc = h->min; sc <= h->max; sc++) { delta = ((float)sc + 0.5) - h->param[GAUSS_MEAN]; h->expect[sc - h->min] = (float) h->total * ((1. / (h->param[GAUSS_SD] * sqrt(2.*3.14159))) * (exp(-1.*delta*delta / (2. * h->param[GAUSS_SD] * h->param[GAUSS_SD])))); } /* Calculate the goodness-of-fit (within whole region) */ h->chisq = 0.; nbins = 0; for (sc = h->lowscore; sc <= h->highscore; sc++) if (h->expect[sc-h->min] >= 5. && h->histogram[sc-h->min] >= 5) { delta = (float) h->histogram[sc-h->min] - h->expect[sc-h->min]; h->chisq += delta * delta / h->expect[sc-h->min]; nbins++; } /* -1 d.f. for normalization */ if (nbins > 1) h->chip = (float) IncompleteGamma((double)(nbins-1)/2., (double) h->chisq/2.); else h->chip = 0.; }/* Function: EVDDensity() * Date: SRE, Sat Nov 15 19:37:52 1997 [St. Louis] * * Purpose: Return the extreme value density P(S=x) at * a given point x, for an EVD controlled by * parameters mu and lambda. */doubleEVDDensity(float x, float mu, float lambda){ return (lambda * exp(-1. * lambda * (x - mu) - exp(-1. * lambda * (x - mu))));}/* Function: EVDDistribution() * Date: SRE, Tue Nov 18 08:02:22 1997 [St. Louis] * * Purpose: Returns the extreme value distribution P(S < x) * evaluated at x, for an EVD controlled by parameters * mu and lambda. */doubleEVDDistribution(float x, float mu, float lambda){ return (exp(-1. * exp(-1. * lambda * (x - mu))));}/* Function: ExtremeValueP() * * Purpose: Calculate P(S>x) according to an extreme
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -