📄 corpus.pm
字号:
#calculate first two moments for subset scores for each n-gram precision, and t statistic
my $fullCorpusBLEU = $self->{'bleuScores'}->{$sysname}->{$factorName}->[0]; #an arrayref
my @means = (0) x 4;
my @devs = (0) x 4;
my $t = []; #t statistics for all n-gram orders
if(!exists $self->{'bleuConfidence'}->{$sysname}) {$self->{'bleuConfidence'}->{$sysname} = {};}
$self->{'bleuConfidence'}->{$sysname}->{$factorName} = [[], []]; #lower-bound p-values for whole corpus vs. subset average; confidence intervals for all n-gram orders
for(my $i = 0; $i < 4; $i++) #run through n-gram orders
{
for(my $j = 0; $j < $k; $j++) #run through subsets
{
$means[$i] += $subsetStats->[$j]->[2 * $i] / $subsetStats->[$j]->[2 * $i + 1]; #matching / total n-grams
}
$means[$i] /= $k;
for(my $j = 0; $j < $k; $j++) #run through subsets
{
$devs[$i] += ($subsetStats->[$j]->[2 * $i] / $subsetStats->[$j]->[2 * $i + 1] - $means[$i]) ** 2;
}
$devs[$i] = sqrt($devs[$i] / ($k - 1));
$t->[$i] = ($fullCorpusBLEU->[$i + 1] / 100 - $means[$i]) / $devs[$i];
push @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0]}, getLowerBoundPValue($t->[$i]); #p-value for overall score vs. subset average
push @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1]},
[$means[$i] - $criticalTStat * $devs[$i] / sqrt($k), $means[$i] + $criticalTStat * $devs[$i] / sqrt($k)]; #the confidence interval
}
return $self->{'bleuConfidence'}->{$sysname}->{$factorName};
}
#arguments: system name, factor name
#return: perplexity of language model (specified in a config file) wrt given system output
sub calcPerplexity
{
my ($self, $sysname, $factorName) = @_;
print STDERR "ppl $sysname $factorName\n";
#check in-memory cache first
if(exists $self->{'perplexity'}->{$sysname} && exists $self->{'perplexity'}->{$sysname}->{$factorName})
{
return $self->{'perplexity'}->{$sysname}->{$factorName};
}
warn "calcing perplexity\n";
$self->ensureFilenameDefined($sysname);
my $sysoutFilename;
if($sysname eq 'truth' || $sysname eq 'input') {$sysoutFilename = $self->{"${sysname}Filename"};}
else {$sysoutFilename = $self->{'sysoutFilenames'}->{$sysname};}
my $lmFilename;
if($sysname eq 'input') {$lmFilename = $self->{'inputLMs'}->{$factorName};}
else {$lmFilename = $self->{'outputLMs'}->{$factorName};}
my $tmpfile = ".tmp" . time;
my $cmd = "perl ./extract-factors.pl $sysoutFilename " . $self->{'factorIndices'}->{$factorName} . " > $tmpfile";
`$cmd`; #extract just the factor we're interested in; ngram doesn't understand factored notation
my @output = `./ngram -lm $lmFilename -ppl $tmpfile`; #run the SRI n-gram tool
`rm -f $tmpfile`;
$output[1] =~ /ppl1=\s*([0-9\.]+)/;
$self->{'perplexity'}->{$sysname}->{$factorName} = $1;
return $self->{'perplexity'}->{$sysname}->{$factorName};
}
#run a paired t test and a sign test on BLEU statistics for subsets of both systems' outputs
#arguments: system name 1, system name 2, factor name
#return: arrayref of [arrayref of confidence levels for t test at which results differ, arrayref of index (0/1) of better system by t test,
# arrayref of confidence levels for sign test at which results differ, arrayref of index (0/1) of better system by sign test],
# where each inner arrayref has one element per n-gram order considered
sub statisticallyCompareSystemResults
{
my ($self, $sysname1, $sysname2, $factorName) = @_;
#check in-memory cache first
if(exists $self->{'comparisonStats'}->{$sysname1} && exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}
&& exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName})
{
return $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName};
}
warn "comparing sysoutputs\n";
$self->ensureFilenameDefined($sysname1);
$self->ensureFilenameDefined($sysname2);
$self->ensureFactorPosDefined($factorName);
#make sure we have tallied results for both systems
if(!exists $self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}) {$self->statisticallyTestBLEUResults($sysname1, $factorName);}
if(!exists $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}) {$self->statisticallyTestBLEUResults($sysname2, $factorName);}
if(!exists $self->{'comparisonStats'}->{$sysname1}) {$self->{'comparisonStats'}->{$sysname1} = {};}
if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2} = {};}
if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [];}
my ($tConfidences, $tWinningIndices, $signConfidences, $signWinningIndices) = ([], [], [], []);
for(my $i = 0; $i < 4; $i++) #loop over n-gram order
{
#t-test stats
my ($mean, $dev) = (0, 0); #of the difference between the first and second systems' precisions
#sign-test stats
my ($nPlus, $nMinus) = (0, 0);
my $j;
for($j = 0; $j < scalar(@{$self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}}); $j++)
{
my ($stats1, $stats2) = ($self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}->[$j], $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}->[$j]);
my ($prec1, $prec2) = ($stats1->[2 * $i] / $stats1->[2 * $i + 1], $stats2->[2 * $i] / $stats2->[2 * $i + 1]); #n-gram precisions
$mean += $prec1 - $prec2;
if($prec1 > $prec2) {$nPlus++;} else {$nMinus++;}
}
$mean /= $j;
for($j = 0; $j < scalar(@{$self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}}); $j++)
{
my ($stats1, $stats2) = ($self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}->[$j], $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}->[$j]);
my ($prec1, $prec2) = ($stats1->[2 * $i] / $stats1->[2 * $i + 1], $stats2->[2 * $i] / $stats2->[2 * $i + 1]); #n-gram precisions
$dev += ($prec1 - $prec2 - $mean) ** 2;
}
$dev = sqrt($dev / (($j - 1) * $j)); #need the extra j because the variance of Xbar is 1/n the variance of X
#t test
my $t = $mean / $dev; #this isn't the standard form; remember the difference of the means is equal to the mean of the differences
my $cc = getUpperBoundPValue($t);
print STDERR "comparing at n=$i: mu $mean, sigma $dev, t $t -> conf >= " . (1 - $cc) . "\n";
push @$tConfidences, $cc;
push @$tWinningIndices, ($mean > 0) ? 0 : 1;
#sign test
my %binomialCoefficients; #map (n+ - n-) to a coefficient; compute on the fly!
for(my $k = 0; $k <= $nPlus + $nMinus; $k++)
{
$binomialCoefficients{$k} = binCoeff($nPlus + $nMinus, $k);
}
my $sumCoeffs = 0;
foreach my $coeff (values %binomialCoefficients) #get a lower bound on the probability mass inside (n+ - n-)
{
if($coeff > $binomialCoefficients{$nPlus}) {$sumCoeffs += $coeff;}
}
push @$signConfidences, $sumCoeffs;
push @$signWinningIndices, ($nPlus > $nMinus) ? 0 : 1;
}
$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [$tConfidences, $tWinningIndices, $signConfidences, $signWinningIndices];
return $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName};
}
#write HTML to be displayed to compare the various versions we have of each sentence in the corpus;
#allow to filter which versions will be displayed
#(we don't write the whole page, just the contents of the body)
#arguments: filehandleref to which to write, regex to filter filename extensions to be included
#return: none
sub writeComparisonPage
{
my ($self, $fh, $filter) = @_;
my @filteredExtensions = grep($filter, ('e', 'f', keys %{$self->{'sysoutFilenames'}}));
my %openedFiles = $self->openFiles(@filteredExtensions);
my $id = 1; #sentence ID string
while(my %lines = $self->readLineFromFiles(%openedFiles))
{
$self->printSingleSentenceComparison($fh, $id, %lines);
$id++;
}
$self->closeFiles(%openedFiles);
}
##########################################################################################################
##### INTERNAL ###################################################################################
##########################################################################################################
#destructor!
#arguments: none
#return: none
sub DESTROY
{
my $self = shift;
$self->writeCacheFile();
}
#write all scores in memory to disk
#arguments: none
#return: none
sub writeCacheFile
{
my $self = shift;
if(!open(CACHEFILE, ">" . $self->{'cacheFilename'}))
{
warn "Corpus::writeCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for write\n";
return;
}
#store file changetimes to disk
print CACHEFILE "File changetimes\n";
my $ensureCtimeIsOutput = sub
{
my $ext = shift;
#check for a previously read value
if(exists $self->{'fileCtimes'}->{$ext} && $self->cacheIsCurrentForFile($ext)) {print CACHEFILE "$ext " . $self->{'fileCtimes'}->{$ext} . "\n";}
else {print CACHEFILE "$ext " . time . "\n";} #our info must just have been calculated
};
if(exists $self->{'truthFilename'}) {&$ensureCtimeIsOutput('e');}
if(exists $self->{'inputFilename'}) {&$ensureCtimeIsOutput('f');}
foreach my $factorName (keys %{$self->{'phraseTableFilenames'}}) {&$ensureCtimeIsOutput("pt_$factorName");}
foreach my $sysname (keys %{$self->{'sysoutFilenames'}}) {&$ensureCtimeIsOutput($sysname);}
#store bleu scores to disk
print CACHEFILE "\nBLEU scores\n";
foreach my $sysname (keys %{$self->{'bleuScores'}})
{
foreach my $factorName (keys %{$self->{'bleuScores'}->{$sysname}})
{
print CACHEFILE "$sysname $factorName " . join(' ', @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]});
foreach my $sentenceBLEU (@{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]})
{
print CACHEFILE ";" . join(' ', @$sentenceBLEU);
}
print CACHEFILE "\n";
}
}
#store t statistics for overall BLEU score and subsets in k-fold cross-validation
print CACHEFILE "\nBLEU statistics\n";
foreach my $sysname (keys %{$self->{'bleuConfidence'}})
{
foreach my $factorName (keys %{$self->{'bleuConfidence'}->{$sysname}})
{
print CACHEFILE "$sysname $factorName " . join(' ', @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0]});
foreach my $subsetConfidence (@{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1]})
{
print CACHEFILE ";" . join(' ', @$subsetConfidence);
}
print CACHEFILE "\n";
}
}
#store statistics comparing system outputs
print CACHEFILE "\nStatistical comparisons\n";
foreach my $sysname1 (keys %{$self->{'comparisonStats'}})
{
foreach my $sysname2 (keys %{$self->{'comparisonStats'}->{$sysname1}})
{
foreach my $factorName (keys %{$self->{'comparisonStats'}->{$sysname1}->{$sysname2}})
{
print CACHEFILE "$sysname1 $sysname2 $factorName " . join(';', map {join(' ', @$_)} @{$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}}) . "\n";
}
}
}
#store unknown-token counts to disk
print CACHEFILE "\nUnknown-token counts\n";
foreach my $factorName (keys %{$self->{'unknownCount'}})
{
print CACHEFILE $factorName . " " . $self->{'phraseTableFilenames'}->{$factorName} . " " . $self->{'unknownCount'}->{$factorName} . " " . $self->{'tokenCount'}->{'input'} . "\n";
}
#store WER, PWER to disk
print CACHEFILE "\nWER scores\n";
my $printWERFunc =
sub
{
my $werType = shift;
foreach my $sysname (keys %{$self->{$werType}})
{
foreach my $factorName (keys %{$self->{$werType}->{$sysname}})
{
my ($totalWER, $sentenceWERs, $errorWords) = @{$self->{$werType}->{$sysname}->{$factorName}};
print CACHEFILE "$werType $sysname $factorName $totalWER " . join(' ', @$sentenceWERs);
foreach my $indices (@$errorWords)
{
print CACHEFILE ";" . join(' ', @$indices);
}
print CACHEFILE "\n";
}
}
};
&$printWERFunc('sysoutWER');
&$printWERFunc('sysoutPWER');
#store corpus perplexities to disk
print CACHEFILE "\nPerplexity\n";
foreach my $sysname (keys %{$self->{'perplexity'}})
{
foreach my $factorName (keys %{$self->{'perplexity'}->{$sysname}})
{
print CACHEFILE "$sysname $factorName " . $self->{'perplexity'}->{$sysname}->{$factorName} . "\n";
}
}
print "\nNN/ADJ WER/PWER\n";
foreach my $sysname (keys %{$self->{'nnAdjWERPWER'}})
{
print CACHEFILE "$sysname " . join(' ', @{$self->{'nnAdjWERPWER'}->{$sysname}}) . "\n";
}
print "\n";
close(CACHEFILE);
}
#load all scores present in the cache file into the appropriate fields of $self
#arguments: none
#return: none
sub loadCacheFile
{
my $self = shift;
if(!open(CACHEFILE, "<" . $self->{'cacheFilename'}))
{
warn "Corpus::loadCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for read\n";
return;
}
my $mode = 'none';
while(my $line = <CACHEFILE>)
{
next if $line =~ /^[ \t\n\r\x0a]*$/; #anyone know why char 10 (0x0a) shows up on empty lines, at least on solaris?
chomp $line;
#check for start of section
if($line =~ /File changetimes/) {$mode = 'ctime';}
elsif($line =~ /BLEU scores/) {$mode = 'bleu';}
elsif($line =~ /BLEU statistics/) {$mode = 'bstats';}
elsif($line =~ /Statistical comparisons/) {$mode = 'cmp';}
elsif($line =~ /Unknown-token counts/) {$mode = 'unk';}
elsif($line =~ /WER scores/) {$mode = 'wer';}
elsif($line =~ /Perplexity/) {$mode = 'ppl';}
elsif($line =~ /NN\/ADJ WER\/PWER/) {$mode = 'nawp';}
#get data when in a mode already
elsif($mode eq 'ctime')
{
local ($fileExtension, $ctime) = split(/\s+/, $line);
$self->{'fileCtimes'}->{$fileExtension} = $ctime;
}
elsif($mode eq 'bleu')
{
local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};}
if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest);
print STDERR "bleu 1: " . join(', ', @{shift @stats}) . "\n";
print STDERR "bleu 2: " . join(' ', map {"{" . join(', ', @$_) . "}"} @stats) . "\n";
# $self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = shift @stats;
# $self->{'bleuScores'}->{$sysname}->{$factorName}->[1] = \@stats;
}
elsif($mode eq 'bstats')
{
local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3);
next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
if(!exists $self->{'bleuConfidence'}->{$sysname}) {$self->{'bleuConfidence'}->{$sysname} = {};}
if(!exists $self->{'bleuConfidence'}->{$sysname}->{$factorName}) {$self->{'bleuConfidence'}->{$sysname}->{$factorName} = [[], []];}
my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest);
$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0] = shift @stats;
$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1] = \@stats;
}
elsif($mode eq 'cmp')
{
local ($sysname1, $sysname2, $factorName, $rest) = split(/\s+/, $line, 4);
next if !$self->cacheIsCurrentForFile($sysname1) || !$self->cacheIsCurrentForFile($sysname2) || !$self->cacheIsCurrentForFile('e');
if(!exists $self->{'comparisonStats'}->{$sysname1}) {$self->{'comparisonStats'}->{$sysname1} = {};}
if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2} = {};}
if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [];}
my @stats = map {my @x = split(' ', $_); \@x} split(/;/, $rest);
$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = \@stats;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -