📄 corpus.pm.svn-base
字号:
#calculate first two moments for subset scores for each n-gram precision, and t statistic my $fullCorpusBLEU = $self->{'bleuScores'}->{$sysname}->{$factorName}->[0]; #an arrayref my @means = (0) x 4; my @devs = (0) x 4; my $t = []; #t statistics for all n-gram orders if(!exists $self->{'bleuConfidence'}->{$sysname}) {$self->{'bleuConfidence'}->{$sysname} = {};} $self->{'bleuConfidence'}->{$sysname}->{$factorName} = [[], []]; #lower-bound p-values for whole corpus vs. subset average; confidence intervals for all n-gram orders for(my $i = 0; $i < 4; $i++) #run through n-gram orders { for(my $j = 0; $j < $k; $j++) #run through subsets { $means[$i] += $subsetStats->[$j]->[2 * $i] / $subsetStats->[$j]->[2 * $i + 1]; #matching / total n-grams } $means[$i] /= $k; for(my $j = 0; $j < $k; $j++) #run through subsets { $devs[$i] += ($subsetStats->[$j]->[2 * $i] / $subsetStats->[$j]->[2 * $i + 1] - $means[$i]) ** 2; } $devs[$i] = sqrt($devs[$i] / ($k - 1)); $t->[$i] = ($fullCorpusBLEU->[$i + 1] / 100 - $means[$i]) / $devs[$i]; push @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0]}, getLowerBoundPValue($t->[$i]); #p-value for overall score vs. subset average push @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1]}, [$means[$i] - $criticalTStat * $devs[$i] / sqrt($k), $means[$i] + $criticalTStat * $devs[$i] / sqrt($k)]; #the confidence interval } return $self->{'bleuConfidence'}->{$sysname}->{$factorName};}#arguments: system name, factor name#return: perplexity of language model (specified in a config file) wrt given system outputsub calcPerplexity{ my ($self, $sysname, $factorName) = @_; print STDERR "ppl $sysname $factorName\n"; #check in-memory cache first if(exists $self->{'perplexity'}->{$sysname} && exists $self->{'perplexity'}->{$sysname}->{$factorName}) { return $self->{'perplexity'}->{$sysname}->{$factorName}; } warn "calcing perplexity\n"; $self->ensureFilenameDefined($sysname); my $sysoutFilename; if($sysname eq 'truth' || $sysname eq 'input') {$sysoutFilename = $self->{"${sysname}Filename"};} else {$sysoutFilename = $self->{'sysoutFilenames'}->{$sysname};} my $lmFilename; if($sysname eq 'input') {$lmFilename = $self->{'inputLMs'}->{$factorName};} else {$lmFilename = $self->{'outputLMs'}->{$factorName};} my $tmpfile = ".tmp" . time; my $cmd = "perl ./extract-factors.pl $sysoutFilename " . $self->{'factorIndices'}->{$factorName} . " > $tmpfile"; `$cmd`; #extract just the factor we're interested in; ngram doesn't understand factored notation my @output = `./ngram -lm $lmFilename -ppl $tmpfile`; #run the SRI n-gram tool `rm -f $tmpfile`; $output[1] =~ /ppl1=\s*([0-9\.]+)/; $self->{'perplexity'}->{$sysname}->{$factorName} = $1; return $self->{'perplexity'}->{$sysname}->{$factorName};}#run a paired t test and a sign test on BLEU statistics for subsets of both systems' outputs#arguments: system name 1, system name 2, factor name#return: arrayref of [arrayref of confidence levels for t test at which results differ, arrayref of index (0/1) of better system by t test,# arrayref of confidence levels for sign test at which results differ, arrayref of index (0/1) of better system by sign test], # where each inner arrayref has one element per n-gram order consideredsub statisticallyCompareSystemResults{ my ($self, $sysname1, $sysname2, $factorName) = @_; #check in-memory cache first if(exists $self->{'comparisonStats'}->{$sysname1} && exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2} && exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}) { return $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}; } warn "comparing sysoutputs\n"; $self->ensureFilenameDefined($sysname1); $self->ensureFilenameDefined($sysname2); $self->ensureFactorPosDefined($factorName); #make sure we have tallied results for both systems if(!exists $self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}) {$self->statisticallyTestBLEUResults($sysname1, $factorName);} if(!exists $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}) {$self->statisticallyTestBLEUResults($sysname2, $factorName);} if(!exists $self->{'comparisonStats'}->{$sysname1}) {$self->{'comparisonStats'}->{$sysname1} = {};} if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2} = {};} if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [];} my ($tConfidences, $tWinningIndices, $signConfidences, $signWinningIndices) = ([], [], [], []); for(my $i = 0; $i < 4; $i++) #loop over n-gram order { #t-test stats my ($mean, $dev) = (0, 0); #of the difference between the first and second systems' precisions #sign-test stats my ($nPlus, $nMinus) = (0, 0); my $j; for($j = 0; $j < scalar(@{$self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}}); $j++) { my ($stats1, $stats2) = ($self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}->[$j], $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}->[$j]); my ($prec1, $prec2) = ($stats1->[2 * $i] / $stats1->[2 * $i + 1], $stats2->[2 * $i] / $stats2->[2 * $i + 1]); #n-gram precisions $mean += $prec1 - $prec2; if($prec1 > $prec2) {$nPlus++;} else {$nMinus++;} } $mean /= $j; for($j = 0; $j < scalar(@{$self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}}); $j++) { my ($stats1, $stats2) = ($self->{'subsetBLEUstats'}->{$sysname1}->{$factorName}->[$j], $self->{'subsetBLEUstats'}->{$sysname2}->{$factorName}->[$j]); my ($prec1, $prec2) = ($stats1->[2 * $i] / $stats1->[2 * $i + 1], $stats2->[2 * $i] / $stats2->[2 * $i + 1]); #n-gram precisions $dev += ($prec1 - $prec2 - $mean) ** 2; } $dev = sqrt($dev / (($j - 1) * $j)); #need the extra j because the variance of Xbar is 1/n the variance of X #t test my $t = $mean / $dev; #this isn't the standard form; remember the difference of the means is equal to the mean of the differences my $cc = getUpperBoundPValue($t); print STDERR "comparing at n=$i: mu $mean, sigma $dev, t $t -> conf >= " . (1 - $cc) . "\n"; push @$tConfidences, $cc; push @$tWinningIndices, ($mean > 0) ? 0 : 1; #sign test my %binomialCoefficients; #map (n+ - n-) to a coefficient; compute on the fly! for(my $k = 0; $k <= $nPlus + $nMinus; $k++) { $binomialCoefficients{$k} = binCoeff($nPlus + $nMinus, $k); } my $sumCoeffs = 0; foreach my $coeff (values %binomialCoefficients) #get a lower bound on the probability mass inside (n+ - n-) { if($coeff > $binomialCoefficients{$nPlus}) {$sumCoeffs += $coeff;} } push @$signConfidences, $sumCoeffs; push @$signWinningIndices, ($nPlus > $nMinus) ? 0 : 1; } $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [$tConfidences, $tWinningIndices, $signConfidences, $signWinningIndices]; return $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName};}#write HTML to be displayed to compare the various versions we have of each sentence in the corpus;#allow to filter which versions will be displayed#(we don't write the whole page, just the contents of the body)#arguments: filehandleref to which to write, regex to filter filename extensions to be included#return: nonesub writeComparisonPage{ my ($self, $fh, $filter) = @_; my @filteredExtensions = grep($filter, ('e', 'f', keys %{$self->{'sysoutFilenames'}})); my %openedFiles = $self->openFiles(@filteredExtensions); my $id = 1; #sentence ID string while(my %lines = $self->readLineFromFiles(%openedFiles)) { $self->printSingleSentenceComparison($fh, $id, %lines); $id++; } $self->closeFiles(%openedFiles);}############################################################################################################### INTERNAL ##############################################################################################################################################################################################destructor!#arguments: none#return: nonesub DESTROY{ my $self = shift; $self->writeCacheFile();}#write all scores in memory to disk#arguments: none#return: nonesub writeCacheFile{ my $self = shift; if(!open(CACHEFILE, ">" . $self->{'cacheFilename'})) { warn "Corpus::writeCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for write\n"; return; } #store file changetimes to disk print CACHEFILE "File changetimes\n"; my $ensureCtimeIsOutput = sub { my $ext = shift; #check for a previously read value if(exists $self->{'fileCtimes'}->{$ext} && $self->cacheIsCurrentForFile($ext)) {print CACHEFILE "$ext " . $self->{'fileCtimes'}->{$ext} . "\n";} else {print CACHEFILE "$ext " . time . "\n";} #our info must just have been calculated }; if(exists $self->{'truthFilename'}) {&$ensureCtimeIsOutput('e');} if(exists $self->{'inputFilename'}) {&$ensureCtimeIsOutput('f');} foreach my $factorName (keys %{$self->{'phraseTableFilenames'}}) {&$ensureCtimeIsOutput("pt_$factorName");} foreach my $sysname (keys %{$self->{'sysoutFilenames'}}) {&$ensureCtimeIsOutput($sysname);} #store bleu scores to disk print CACHEFILE "\nBLEU scores\n"; foreach my $sysname (keys %{$self->{'bleuScores'}}) { foreach my $factorName (keys %{$self->{'bleuScores'}->{$sysname}}) { print CACHEFILE "$sysname $factorName " . join(' ', @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]}); foreach my $sentenceBLEU (@{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}) { print CACHEFILE ";" . join(' ', @$sentenceBLEU); } print CACHEFILE "\n"; } } #store t statistics for overall BLEU score and subsets in k-fold cross-validation print CACHEFILE "\nBLEU statistics\n"; foreach my $sysname (keys %{$self->{'bleuConfidence'}}) { foreach my $factorName (keys %{$self->{'bleuConfidence'}->{$sysname}}) { print CACHEFILE "$sysname $factorName " . join(' ', @{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0]}); foreach my $subsetConfidence (@{$self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1]}) { print CACHEFILE ";" . join(' ', @$subsetConfidence); } print CACHEFILE "\n"; } } #store statistics comparing system outputs print CACHEFILE "\nStatistical comparisons\n"; foreach my $sysname1 (keys %{$self->{'comparisonStats'}}) { foreach my $sysname2 (keys %{$self->{'comparisonStats'}->{$sysname1}}) { foreach my $factorName (keys %{$self->{'comparisonStats'}->{$sysname1}->{$sysname2}}) { print CACHEFILE "$sysname1 $sysname2 $factorName " . join(';', map {join(' ', @$_)} @{$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}}) . "\n"; } } } #store unknown-token counts to disk print CACHEFILE "\nUnknown-token counts\n"; foreach my $factorName (keys %{$self->{'unknownCount'}}) { print CACHEFILE $factorName . " " . $self->{'phraseTableFilenames'}->{$factorName} . " " . $self->{'unknownCount'}->{$factorName} . " " . $self->{'tokenCount'}->{'input'} . "\n"; } #store WER, PWER to disk print CACHEFILE "\nWER scores\n"; my $printWERFunc = sub { my $werType = shift; foreach my $sysname (keys %{$self->{$werType}}) { foreach my $factorName (keys %{$self->{$werType}->{$sysname}}) { my ($totalWER, $sentenceWERs, $errorWords) = @{$self->{$werType}->{$sysname}->{$factorName}}; print CACHEFILE "$werType $sysname $factorName $totalWER " . join(' ', @$sentenceWERs); foreach my $indices (@$errorWords) { print CACHEFILE ";" . join(' ', @$indices); } print CACHEFILE "\n"; } } }; &$printWERFunc('sysoutWER'); &$printWERFunc('sysoutPWER'); #store corpus perplexities to disk print CACHEFILE "\nPerplexity\n"; foreach my $sysname (keys %{$self->{'perplexity'}}) { foreach my $factorName (keys %{$self->{'perplexity'}->{$sysname}}) { print CACHEFILE "$sysname $factorName " . $self->{'perplexity'}->{$sysname}->{$factorName} . "\n"; } } print "\nNN/ADJ WER/PWER\n"; foreach my $sysname (keys %{$self->{'nnAdjWERPWER'}}) { print CACHEFILE "$sysname " . join(' ', @{$self->{'nnAdjWERPWER'}->{$sysname}}) . "\n"; } print "\n"; close(CACHEFILE);}#load all scores present in the cache file into the appropriate fields of $self#arguments: none#return: nonesub loadCacheFile{ my $self = shift; if(!open(CACHEFILE, "<" . $self->{'cacheFilename'})) { warn "Corpus::loadCacheFile(): can't open '" . $self->{'cacheFilename'} . "' for read\n"; return; } my $mode = 'none'; while(my $line = <CACHEFILE>) { next if $line =~ /^[ \t\n\r\x0a]*$/; #anyone know why char 10 (0x0a) shows up on empty lines, at least on solaris? chomp $line; #check for start of section if($line =~ /File changetimes/) {$mode = 'ctime';} elsif($line =~ /BLEU scores/) {$mode = 'bleu';} elsif($line =~ /BLEU statistics/) {$mode = 'bstats';} elsif($line =~ /Statistical comparisons/) {$mode = 'cmp';} elsif($line =~ /Unknown-token counts/) {$mode = 'unk';} elsif($line =~ /WER scores/) {$mode = 'wer';} elsif($line =~ /Perplexity/) {$mode = 'ppl';} elsif($line =~ /NN\/ADJ WER\/PWER/) {$mode = 'nawp';} #get data when in a mode already elsif($mode eq 'ctime') { local ($fileExtension, $ctime) = split(/\s+/, $line); $self->{'fileCtimes'}->{$fileExtension} = $ctime; } elsif($mode eq 'bleu') { local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3); next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e'); if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};} if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];} my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest); print STDERR "bleu 1: " . join(', ', @{shift @stats}) . "\n"; print STDERR "bleu 2: " . join(' ', map {"{" . join(', ', @$_) . "}"} @stats) . "\n"; # $self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = shift @stats; # $self->{'bleuScores'}->{$sysname}->{$factorName}->[1] = \@stats; } elsif($mode eq 'bstats') { local ($sysname, $factorName, $rest) = split(/\s+/, $line, 3); next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e'); if(!exists $self->{'bleuConfidence'}->{$sysname}) {$self->{'bleuConfidence'}->{$sysname} = {};} if(!exists $self->{'bleuConfidence'}->{$sysname}->{$factorName}) {$self->{'bleuConfidence'}->{$sysname}->{$factorName} = [[], []];} my @stats = map {my @tmp = split(/\s+/, $_); \@tmp;} split(/;/, $rest); $self->{'bleuConfidence'}->{$sysname}->{$factorName}->[0] = shift @stats; $self->{'bleuConfidence'}->{$sysname}->{$factorName}->[1] = \@stats; } elsif($mode eq 'cmp') { local ($sysname1, $sysname2, $factorName, $rest) = split(/\s+/, $line, 4); next if !$self->cacheIsCurrentForFile($sysname1) || !$self->cacheIsCurrentForFile($sysname2) || !$self->cacheIsCurrentForFile('e'); if(!exists $self->{'comparisonStats'}->{$sysname1}) {$self->{'comparisonStats'}->{$sysname1} = {};} if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2} = {};} if(!exists $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName}) {$self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = [];} my @stats = map {my @x = split(' ', $_); \@x} split(/;/, $rest); $self->{'comparisonStats'}->{$sysname1}->{$sysname2}->{$factorName} = \@stats;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -