📄 rainbow-stats.pl

📁 贝叶斯学习算法分类文本。基于朴素贝叶斯分类器的文本分类的通用算法
💻 PL
字号:
#!/usr/bin/perl# The above line is modified by ./Makefile to match the system's# installed location for Perl.# Script to process the output from Andrew's rainbow program and produce# useful summaries of the results. Feed the results intot stdin and # all the summaries will arrive on stdout# If you pass the `-s' command line argument, print only the accuracy# average and standard deviation.# setup some default values$total_accuracy = 0.0;# When this is zero, only print accuracy average and std.dev.$verbosity = 1;# Prune this regex from the end of classnames.$prune_from_classname = "";if ($#ARGV >= 0 && $ARGV[0] eq "-s") {    $verbosity = 0;    shift;}if ($#ARGV >= 0 && $ARGV[0] eq "-p") {    $prune_from_classname = $ARGV[1];    printf "Pruning `%s' from classnames\n", $prune_from_classname;    shift; shift;}# Read in the first #$line = <>;$trial = 0;while (&read_trial() != 0) {    # OK - Lets start with accuracy    &calculate_accuracy();    # Now, how about a confusion matrix.    &confusion();    $trial++;}# Maybe some summary?# We've had $trial trials&overall_accuracy();exit;# Function to read in the results for one trial into three arrays - @ids, # @actual_classifications and @predicted_classifications# What is the English description of these?# @ids - # @actual_classifications - # @predicted_classifications - sub read_trial {        undef @ids;    undef @actual_classifications;    undef @predicted_classifications;    undef %classes_to_codes;    undef @codes_to_classes;    while (($line = <>) && ($line !~ /^\#[0-9]+$/)) {		chop $line;	@line = split(' ', $line);	# Remove the filename from @line and append it to @ids	push(@ids, shift @line);	if (length ($prune_from_classname) > 0) {	    # Remove $prune_from_classname from end of the actual classname	    #printf ("Before: %s  ", $line[0]);	    $pruning_regex = sprintf ("^(.+)%s\$", $prune_from_classname);	    $line[0] =~ s,$pruning_regex,\1,;	    #printf ("After: %s\n", $line[0]);	    # Remove $prune_from_classname from end of the predicted classnames	    $pruning_regex = 		sprintf ("^(.+)%s(:[\.0-9e+\-]+)\$", $prune_from_classname);	    for ($i = 1; $i < @line; $i++) {	        #printf ("Before: %s  ", $line[$i]);		$line[$i] =~ s,$pruning_regex,\1\2,;	        #printf ("After: %s\n", $line[$i]);	    }	}	# Ensure we have a code for the actual class	if (grep(/^$line[0]$/, @codes_to_classes) == 0) {	    $classes_to_codes{$line[0]} = @codes_to_classes;	    push(@codes_to_classes, $line[0]);	}	push(@actual_classifications, shift @line);			    	push(@predicted_classifications, [ @line ]);	# Make sure we have codes for everything	foreach $pred (@line) {	    $pred =~ /^(.+):[\.0-9e+\-]+$/;	    if (grep(/^$1$/, @codes_to_classes) == 0) {		$classes_to_codes{$1} = @codes_to_classes;		push(@codes_to_classes, $1);	    }	}    }    if (@ids > 0) {	return 1;    } else {	return 0;    }}# Function to take the three arrays and calculate the accuracy of the# runsub calculate_accuracy {    if ($verbosity > 0) {	print "Trial $trial\n\n";    }    # Initialize the variables in which we'll gather stats    $correct = 0;    $total = 0;    for ($i = 0; $i < @ids; $i++) {	$predicted_classifications[$i][0] =~ /^(.+):[\.0-9e+\-]+$/;	if ($actual_classifications[$i] eq $1) {	    $correct++;	}	$total++;    }    $accuracy = ($correct * 100) / $total;    $trial_accuracy[$trial] = $accuracy;    $total_accuracy += $accuracy;    if ($verbosity > 0) {	printf ("Correct: %d out of %d (%.2f percent accuracy)\n",		$correct, $total, $accuracy);    }}sub overall_accuracy {    # Calculte the overall (overall) accuracy    $overall_accuracy = $total_accuracy / $trial;    # Calculate the standard deviation of Overall Accuracy    $overall_accuracy_stddev = 0;    for ($i = 0; $i < $trial; $i++) {	$diff_from_mean = $overall_accuracy - $trial_accuracy[$i];	$overall_accuracy_stddev += $diff_from_mean * $diff_from_mean;    }    $overall_accuracy_stddev = sqrt ($overall_accuracy_stddev / $trial);    if ($verbosity > 0) {	printf ("Percent_Accuracy  average %.2f stderr %.2f\n", 		$overall_accuracy, 		$overall_accuracy_stddev / sqrt($trial));    } else {	printf ("%.2f %.2f\n", 		$overall_accuracy, 		$overall_accuracy_stddev / sqrt($trial));    }}# Function to produce a confusion matrix from the datasub confusion {        undef @confusion;    my $total_predicted;    if (! $verbosity > 0) {	return;    }    print "\n - Confusion details, row is actual, column is predicted\n";    # Loop over all the examples    for ($i = 0; $i < @ids; $i++) {	$actual = $actual_classifications[$i];	$actual_code = $classes_to_codes{$actual};	$predicted_classifications[$i][0] =~ /^(.+):[\.0-9e+\-]+$/;	$predicted_code = $classes_to_codes{$1};	$confusion[$actual_code][$predicted_code] += 1;    }    # Get the maximum classname length, so we know how much space    # to allow for it in the formatting.    $max_classname_length = length ("classname");    for ($i = 0; $i < @codes_to_classes; $i++) {	$classname_length = length ($codes_to_classes[$i]);	if ($classname_length > $max_classname_length) {	    $max_classname_length = $classname_length;	}    }    # Print out a header for the matrix    printf ("   %${max_classname_length}s ", "classname");    for ($i = 0; $i < @codes_to_classes; $i++) {	printf ("%3d ", $i);	    }    print " :total\n";    # Now print out the matrix    for ($i = 0; $i < @codes_to_classes; $i++) {	printf ("%2d %${max_classname_length}s ", 		$i, $codes_to_classes[$i]);	$total_predicted = 0;	for ($j = 0; $j < @codes_to_classes; $j++) {	    if ($confusion[$i][$j] == 0) {		printf ("%3s ", ".");	    } else {		printf ("%3d ", $confusion[$i][$j]);	    }	    $total_predicted += $confusion[$i][$j];	}	if ($total_predicted > 0) {	    printf (" :%3d %6.2f%%", 		    $total_predicted, 		    100 * $confusion[$i][$i] / $total_predicted);	} else {	    printf (" :%3s", ".");	}	print "\n";    }    print "\n";}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -