📄 mdist.pl

📁 Oracle optimizing performance things
💻 PL
字号:
#!/usr/bin/perl# $Header: /home/cvs/cvm-book1/mdist/mdist.pl,v 1.8 2002/12/02 15:06:49 cvm Exp $# Cary Millsap (cary.millsap@hotsos.com)# Copyright (c) 2002 by Hotsos Enterprises, Ltd. All rights reserved.use strict;use warnings;use Getopt::Long;use Statistics::Distributions qw(chisqrdistr chisqrprob);my $VERSION = do { my @r=(q$Revision: 1.8 $=~/\d+/g); sprintf "%d."."%02d"x$#r,@r };my $DATE    = do { my @d=(q$Date: 2002/12/02 15:06:49 $=~/\d{4}\D\d{2}\D\d{2}/g); sprintf $d[0] }; my $PROGRAM = "Test for Fit to Exponential Distribution";my %OPT;sub x2($$) {	my ($mu, $p) = @_;	# The p that &Statistics::Distributions::chisqrdistr expects is the	# complement of what we find in [Knuth (1981) 41], Mathematica, or	# [CRC (1991) 515].	return chisqrdistr($mu, 1-$p);}sub CDFx2($$) {	my ($n, $x2) = @_;	# The p that &Statistics::Distributions::chisqrprob returns is the	# complement of what we find in [Knuth (1981) 41], Mathematica, or	# [CRC (1991) 515].	return 1 - chisqrprob($n, $x2);}sub mdist(%) {	my %arg = (		list		=> [],					# list of values to test		mean		=> undef,				# expected mean of distribution		quantiles	=> undef,				# number of quantiles for test		@_,									# input args override defaults	);	# Assign the list. If there aren't at least 50 observations in the	# list, then the chi-square test is not valid [Olkin et al. (1994)	# 613].	my @list = @{$arg{list}};	die "Not enough data (need at least 50 observations)\n" unless @list >= 50;		# Compute number of quantiles and the number of expected observations	# for each quantile. If there aren't at least 5 observations expected	# in each quantile, then we have too many quantiles [Knuth (1981) 42]	# and [Olkin et al. (1994) 613].	my $quantiles = $arg{quantiles} ? $arg{quantiles} : 4;	my $m = @list/$quantiles;				# we expect quantiles to have identical areas	die "Too many quantiles (using $quantiles quantiles reqiures at least ". 5*$quantiles ." observations)\n" unless $m >= 5;	# Assign the mean and chi-square degrees of freedom. If no mean was	# passed in, then estimate it. But if we estimate the mean, then we	# lose an additional chi-square degree of freedom.	my $mean = $arg{mean};	my $n_loss = 1;							# lose one degree of freedom for guessing quantiles	if (!defined $mean) {		my $s = 0; $s += $_ for @list;		# sum the observed values		$mean = $s/@list;					# compute the sample mean		$n_loss++;							# lose additional d.o.f. for estimating the mean	}	my $n = $quantiles - $n_loss;			# chi-square degrees of freedom	die "Not enough quantiles for $n_loss lost degrees of freedom (need at least ". ($n_loss+1) ." quantiles)\n" unless $n >= 1;	# Dump values computed thus far.	if ($OPT{debug}>=1) {		print  "list      = (", join(", ", @list), ")\n";		printf "quantiles = %d\n", $quantiles;		printf "mean      = %s\n", $mean;	}	# Compute interior quantile boundaries. N.B.: The definition of	# quantile boundaries is what makes this test for exponential	# distribution different from a test for some other distribution.  If	# the input list is exponentially distributed, then we expect for	# there to be $m observations in each quantile, with quantile	# boundaries defined at -$mean*log(1-$i/$quantiles) for each	# i=1..$quantiles-1.	my @q;									# list of interior quantile boundaries	for (my $i=1; $i<=$quantiles-1; $i++) {		$q[$i] = -$mean*log(1-$i/$quantiles);	}	# Compute frequency of observed values [Knuth (1981) 40]. Setting	# $Y[0]=undef makes array content begin at $Y[1], which simplifies	# array indexing throughout.	my @Y = (undef, (0) x $quantiles);	for my $e (@list) {		print "e=$e\n" if $OPT{debug}>=3;		for (my $i=1; $i<=$quantiles; $i++) {			print "  i=$i (before): q[$i]=$q[$i]\n" if $OPT{debug}>=3;			if ($i == $quantiles) { $Y[-1]++; print "    Y[-1]->$Y[-1]\n" if $OPT{debug}>=3; last }			if ($e <= $q[$i])     { $Y[$i]++; print "    Y[$i]->$Y[$i]\n" if $OPT{debug}>=3; last }		}	}	# Populate list containing frequency of expected values per quantile	# [Knuth (1981) 40]. Using a data structure for this is unnecessarily	# complicated for this test, but it might make the program easier to	# adapt to tests for other distributions in other applications. (We	# could have simply used $m anyplace we mention $np[$anything].)	my @np = (undef, ($m) x $quantiles);	# Dump data structure contents if debugging.	if ($OPT{debug}>=1) {		print "mean = $mean\n";		print "q  = (", join(", ",  @q[1 .. $quantiles-1]), ")\n";		print "Y  = (", join(", ",  @Y[1 .. $quantiles]  ), ")\n";		print "np = (", join(", ", @np[1 .. $quantiles]  ), ")\n";	}	# Compute the chi-square statistic [Knuth (1981) 40].	my $V = 0;	$V += ($Y[$_] - $np[$_])**2 / $np[$_] for (1 .. $quantiles);		# Compute verdict as a function of where V fits in the appropriate	# degrees-of-freedom row of the chi-square statistical table. From	# [Knuth (1981) 43-44].	my $verdict;	my $p = CDFx2($n, $V);	if    ($p <  0.01) { $verdict = "Reject" }	elsif ($p <  0.05) { $verdict = "Suspect" }	elsif ($p <  0.10) { $verdict = "Almost suspect" }	elsif ($p <= 0.90) { $verdict = "Accept" }	elsif ($p <= 0.95) { $verdict = "Almost suspect" }	elsif ($p <= 0.99) { $verdict = "Suspect" }	else               { $verdict = "Reject" }	# Return a hash containing the verdict and key statistics.	return (verdict=>$verdict, mean=>$mean, n=>$n, V=>$V, p=>$p);}%OPT = (									# default values	mean			=> undef,	quantiles		=> undef,	debug			=> 0,	version			=> 0,	help			=> 0,);GetOptions(	"mean=f"		=> \$OPT{mean},	"quantiles=i"	=> \$OPT{quantiles},	"debug=i"		=> \$OPT{debug},	"version"		=> \$OPT{version},	"help"			=> \$OPT{help},);if ($OPT{version}) { print "$VERSION\n"; exit }if ($OPT{help})    { print "Type 'perldoc $0' for help\n"; exit }my $file = shift; $file = "&STDIN" if !defined $file;open FILE, "<$file" or die "can't read '$file' ($!)";my @list;while (defined (my $line = <FILE>)) {	next if $line =~ /^#/;	next if $line =~ /^\s*$/;	chomp $line;	for ($line) {		s/[^0-9.]/ /g;		s/^\s*//g;		s/\s*$//g;	}	push @list, split(/\s+/, $line);}close FILE;print join(", ", @list), "\n" if $OPT{debug};my %r = mdist(list=>[@list], mean=>$OPT{mean}, quantiles=>$OPT{quantiles});print  " Hypothesis: Data are exponentially distributed with mean $r{mean}\n";printf "Test result: n=%d V=%.2f p=%.3f\n", $r{n}, $r{V}, $r{p};print  "    Verdict: $r{verdict}\n";__END__=head1 NAMEmdist - test data for fit to exponential distribution with specified mean=head1 SYNOPSISmdist [--mean=I<m>] [--quantiles=I<q>] [I<file>]=head1 DESCRIPTIONB<mdist> tests whether a random variable appears to be exponentiallydistributed with mean I<m>. This information is useful in determining, forexample, whether a given list of operationally collected interarrivaltimes or service times is suitable input for the M/M/m queueing theorymodel.B<mdist> reads I<file> for a list of observed values. If no input file isspecified, then B<mdist> takes its input from STDIN. The input mustcontain at least 50 observations.The program prints the test hypothesis, the test results, and a verdict:  $ perl mdist.pl 001.d   Hypothesis: Data are exponentially distributed with mean 0.000959673232  Test result: n=2 V=0.72 p=0.302      Verdict: Accept  The test result statistics [Knuth (1981) 39-45] include:=over 4=item I<n>Degrees of freedom from the chi-square test.=item I<V>The "chi-square" statistic.=item I<p>The probability at which I<V> is expected to occur in a chi-squaredistribution with degrees of freedom equal to I<n>.=backB<mdist> uses a I<q>-quantile chi-square test for exponentialdistribution, adapted from [Allen (1994) 224-225] and [Knuth (1981)39-40]. Allen provides the strategy for divvying the data into quantilesand testing whether the frequency in each quantile matches our expectationof the exponential distribution. Knuth provides the general chi-squaretesting strategy that produces a verdict of "Accept", "Almost suspect","Suspect", or "Reject".=head2 Options=over 4=item B<--mean=>I<m>The hypothesized expected value of the random variable (i.e., thehypothesized mean of the exponential distribution). If no I<m> isspecified, then B<mdist> will use the sample mean of the input andadjust the chi-square degrees of freedom parameter appropriately.=item B<--quantiles=>I<q>The number of quantiles to use in the chi-square test for goodness-of-fit.The number of quantiles must equal at least 2 if a mean is specified, and3 if the mean will be estimated. The number of quantiles must be smallenough that the number of observations divided by I<q> must be at least 5.The default is I<q>=4.=back=head1 AUTHORCary Millsap (cary.millsap@hotsos.com)=head1 BUGSInstead of estimating the distribution mean by computing the sample mean,we should probalby use the minimum chi-squared estimation techniquedescribed in [Olkin et al. (1994) 617-623].=head1 SEE ALSOAllen, A. O. 1994. Computer Performance Analysis with Mathematica. BostonMA: AP ProfessionalCRC 1991. Standard Mathematical Tables and Formulae, 29ed. Boca Raton FL:CRC PressKnuth, D. E. 1981. The Art of Computer Programming, Vol 2: SeminumericalAlgorithms. Reading MA: Addison WesleyOlkin, I.; Gleser, L. J.; Derman, C. 1994. Probability Models andApplications, 2ed. New York NY: MacmillanWolfram, S. 1999. Mathematica. Champaign IL: Wolfram=head1 COPYRIGHTCopyright (c) 2002 by Hotsos Enterprises, Ltd. All rights reserved.
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -