📄 mdist.pl
字号:
#!/usr/bin/perl# $Header: /home/cvs/cvm-book1/mdist/mdist.pl,v 1.8 2002/12/02 15:06:49 cvm Exp $# Cary Millsap (cary.millsap@hotsos.com)# Copyright (c) 2002 by Hotsos Enterprises, Ltd. All rights reserved.use strict;use warnings;use Getopt::Long;use Statistics::Distributions qw(chisqrdistr chisqrprob);my $VERSION = do { my @r=(q$Revision: 1.8 $=~/\d+/g); sprintf "%d."."%02d"x$#r,@r };my $DATE = do { my @d=(q$Date: 2002/12/02 15:06:49 $=~/\d{4}\D\d{2}\D\d{2}/g); sprintf $d[0] }; my $PROGRAM = "Test for Fit to Exponential Distribution";my %OPT;sub x2($$) { my ($mu, $p) = @_; # The p that &Statistics::Distributions::chisqrdistr expects is the # complement of what we find in [Knuth (1981) 41], Mathematica, or # [CRC (1991) 515]. return chisqrdistr($mu, 1-$p);}sub CDFx2($$) { my ($n, $x2) = @_; # The p that &Statistics::Distributions::chisqrprob returns is the # complement of what we find in [Knuth (1981) 41], Mathematica, or # [CRC (1991) 515]. return 1 - chisqrprob($n, $x2);}sub mdist(%) { my %arg = ( list => [], # list of values to test mean => undef, # expected mean of distribution quantiles => undef, # number of quantiles for test @_, # input args override defaults ); # Assign the list. If there aren't at least 50 observations in the # list, then the chi-square test is not valid [Olkin et al. (1994) # 613]. my @list = @{$arg{list}}; die "Not enough data (need at least 50 observations)\n" unless @list >= 50; # Compute number of quantiles and the number of expected observations # for each quantile. If there aren't at least 5 observations expected # in each quantile, then we have too many quantiles [Knuth (1981) 42] # and [Olkin et al. (1994) 613]. my $quantiles = $arg{quantiles} ? $arg{quantiles} : 4; my $m = @list/$quantiles; # we expect quantiles to have identical areas die "Too many quantiles (using $quantiles quantiles reqiures at least ". 5*$quantiles ." observations)\n" unless $m >= 5; # Assign the mean and chi-square degrees of freedom. If no mean was # passed in, then estimate it. But if we estimate the mean, then we # lose an additional chi-square degree of freedom. my $mean = $arg{mean}; my $n_loss = 1; # lose one degree of freedom for guessing quantiles if (!defined $mean) { my $s = 0; $s += $_ for @list; # sum the observed values $mean = $s/@list; # compute the sample mean $n_loss++; # lose additional d.o.f. for estimating the mean } my $n = $quantiles - $n_loss; # chi-square degrees of freedom die "Not enough quantiles for $n_loss lost degrees of freedom (need at least ". ($n_loss+1) ." quantiles)\n" unless $n >= 1; # Dump values computed thus far. if ($OPT{debug}>=1) { print "list = (", join(", ", @list), ")\n"; printf "quantiles = %d\n", $quantiles; printf "mean = %s\n", $mean; } # Compute interior quantile boundaries. N.B.: The definition of # quantile boundaries is what makes this test for exponential # distribution different from a test for some other distribution. If # the input list is exponentially distributed, then we expect for # there to be $m observations in each quantile, with quantile # boundaries defined at -$mean*log(1-$i/$quantiles) for each # i=1..$quantiles-1. my @q; # list of interior quantile boundaries for (my $i=1; $i<=$quantiles-1; $i++) { $q[$i] = -$mean*log(1-$i/$quantiles); } # Compute frequency of observed values [Knuth (1981) 40]. Setting # $Y[0]=undef makes array content begin at $Y[1], which simplifies # array indexing throughout. my @Y = (undef, (0) x $quantiles); for my $e (@list) { print "e=$e\n" if $OPT{debug}>=3; for (my $i=1; $i<=$quantiles; $i++) { print " i=$i (before): q[$i]=$q[$i]\n" if $OPT{debug}>=3; if ($i == $quantiles) { $Y[-1]++; print " Y[-1]->$Y[-1]\n" if $OPT{debug}>=3; last } if ($e <= $q[$i]) { $Y[$i]++; print " Y[$i]->$Y[$i]\n" if $OPT{debug}>=3; last } } } # Populate list containing frequency of expected values per quantile # [Knuth (1981) 40]. Using a data structure for this is unnecessarily # complicated for this test, but it might make the program easier to # adapt to tests for other distributions in other applications. (We # could have simply used $m anyplace we mention $np[$anything].) my @np = (undef, ($m) x $quantiles); # Dump data structure contents if debugging. if ($OPT{debug}>=1) { print "mean = $mean\n"; print "q = (", join(", ", @q[1 .. $quantiles-1]), ")\n"; print "Y = (", join(", ", @Y[1 .. $quantiles] ), ")\n"; print "np = (", join(", ", @np[1 .. $quantiles] ), ")\n"; } # Compute the chi-square statistic [Knuth (1981) 40]. my $V = 0; $V += ($Y[$_] - $np[$_])**2 / $np[$_] for (1 .. $quantiles); # Compute verdict as a function of where V fits in the appropriate # degrees-of-freedom row of the chi-square statistical table. From # [Knuth (1981) 43-44]. my $verdict; my $p = CDFx2($n, $V); if ($p < 0.01) { $verdict = "Reject" } elsif ($p < 0.05) { $verdict = "Suspect" } elsif ($p < 0.10) { $verdict = "Almost suspect" } elsif ($p <= 0.90) { $verdict = "Accept" } elsif ($p <= 0.95) { $verdict = "Almost suspect" } elsif ($p <= 0.99) { $verdict = "Suspect" } else { $verdict = "Reject" } # Return a hash containing the verdict and key statistics. return (verdict=>$verdict, mean=>$mean, n=>$n, V=>$V, p=>$p);}%OPT = ( # default values mean => undef, quantiles => undef, debug => 0, version => 0, help => 0,);GetOptions( "mean=f" => \$OPT{mean}, "quantiles=i" => \$OPT{quantiles}, "debug=i" => \$OPT{debug}, "version" => \$OPT{version}, "help" => \$OPT{help},);if ($OPT{version}) { print "$VERSION\n"; exit }if ($OPT{help}) { print "Type 'perldoc $0' for help\n"; exit }my $file = shift; $file = "&STDIN" if !defined $file;open FILE, "<$file" or die "can't read '$file' ($!)";my @list;while (defined (my $line = <FILE>)) { next if $line =~ /^#/; next if $line =~ /^\s*$/; chomp $line; for ($line) { s/[^0-9.]/ /g; s/^\s*//g; s/\s*$//g; } push @list, split(/\s+/, $line);}close FILE;print join(", ", @list), "\n" if $OPT{debug};my %r = mdist(list=>[@list], mean=>$OPT{mean}, quantiles=>$OPT{quantiles});print " Hypothesis: Data are exponentially distributed with mean $r{mean}\n";printf "Test result: n=%d V=%.2f p=%.3f\n", $r{n}, $r{V}, $r{p};print " Verdict: $r{verdict}\n";__END__=head1 NAMEmdist - test data for fit to exponential distribution with specified mean=head1 SYNOPSISmdist [--mean=I<m>] [--quantiles=I<q>] [I<file>]=head1 DESCRIPTIONB<mdist> tests whether a random variable appears to be exponentiallydistributed with mean I<m>. This information is useful in determining, forexample, whether a given list of operationally collected interarrivaltimes or service times is suitable input for the M/M/m queueing theorymodel.B<mdist> reads I<file> for a list of observed values. If no input file isspecified, then B<mdist> takes its input from STDIN. The input mustcontain at least 50 observations.The program prints the test hypothesis, the test results, and a verdict: $ perl mdist.pl 001.d Hypothesis: Data are exponentially distributed with mean 0.000959673232 Test result: n=2 V=0.72 p=0.302 Verdict: Accept The test result statistics [Knuth (1981) 39-45] include:=over 4=item I<n>Degrees of freedom from the chi-square test.=item I<V>The "chi-square" statistic.=item I<p>The probability at which I<V> is expected to occur in a chi-squaredistribution with degrees of freedom equal to I<n>.=backB<mdist> uses a I<q>-quantile chi-square test for exponentialdistribution, adapted from [Allen (1994) 224-225] and [Knuth (1981)39-40]. Allen provides the strategy for divvying the data into quantilesand testing whether the frequency in each quantile matches our expectationof the exponential distribution. Knuth provides the general chi-squaretesting strategy that produces a verdict of "Accept", "Almost suspect","Suspect", or "Reject".=head2 Options=over 4=item B<--mean=>I<m>The hypothesized expected value of the random variable (i.e., thehypothesized mean of the exponential distribution). If no I<m> isspecified, then B<mdist> will use the sample mean of the input andadjust the chi-square degrees of freedom parameter appropriately.=item B<--quantiles=>I<q>The number of quantiles to use in the chi-square test for goodness-of-fit.The number of quantiles must equal at least 2 if a mean is specified, and3 if the mean will be estimated. The number of quantiles must be smallenough that the number of observations divided by I<q> must be at least 5.The default is I<q>=4.=back=head1 AUTHORCary Millsap (cary.millsap@hotsos.com)=head1 BUGSInstead of estimating the distribution mean by computing the sample mean,we should probalby use the minimum chi-squared estimation techniquedescribed in [Olkin et al. (1994) 617-623].=head1 SEE ALSOAllen, A. O. 1994. Computer Performance Analysis with Mathematica. BostonMA: AP ProfessionalCRC 1991. Standard Mathematical Tables and Formulae, 29ed. Boca Raton FL:CRC PressKnuth, D. E. 1981. The Art of Computer Programming, Vol 2: SeminumericalAlgorithms. Reading MA: Addison WesleyOlkin, I.; Gleser, L. J.; Derman, C. 1994. Probability Models andApplications, 2ed. New York NY: MacmillanWolfram, S. 1999. Mathematica. Champaign IL: Wolfram=head1 COPYRIGHTCopyright (c) 2002 by Hotsos Enterprises, Ltd. All rights reserved.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -