📄 corpus.pm
字号:
#package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole
#Evan Herbst, 7 / 25 / 06
package Corpus;
BEGIN
{
push @INC, "../perllib"; #for Error.pm
}
use Error;
return 1;
###########################################################################################################################
##### 'our' variables are available outside the package #####
#all factor names used should be in this list, just in case
our @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph');
#constructor
#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), formatted string with various config info (-info_line)
sub new
{
my $class = shift;
my %args = @_; #turn the remainder of @_ into a hash
my ($corpusName, $refFileDescs, $infoLine) = ($args{'-name'}, $args{'-descriptions'}, $args{'-info_line'});
my ($factorList, $inputLingmodels, $outputLingmodels) = split(/\s*:\s*/, $infoLine);
my $self = {};
$self->{'corpusName'} = $corpusName;
$self->{'truth'} = []; #arrayref of arrayrefs of factors
$self->{'input'} = []; #same; also same for any system outputs that get loaded
$self->{'tokenCount'} = {}; #sysname => number of tokens in file
$self->{'truthFilename'} = "";
$self->{'inputFilename'} = "";
$self->{'sysoutFilenames'} = {}; #hashref of (string => string) for (system name, filename)
$self->{'phraseTableFilenames'} = {}; #factor name => filename
$self->{'fileCtimes'} = {}; #file ID of some kind => changetime in seconds
$self->{'factorIndices'} = {}; #factor name => index
my @factors = split(/\s+/, $factorList);
for(my $i = 0; $i < scalar(@factors); $i++)
{
$self->{'factorIndices'}->{$factors[$i]} = $i;
}
$self->{'inputLMs'} = {}; #factor name => lingmodel filename
$self->{'outputLMs'} = {};
foreach my $lmInfo (split(/\s*,\s*/, $inputLingmodels))
{
my @tokens = split(/\s+/, $lmInfo);
$self->{'inputLMs'}->{$tokens[0]} = $tokens[1];
}
foreach my $lmInfo (split(/\s*,\s*/, $outputLingmodels))
{
my @tokens = split(/\s+/, $lmInfo);
$self->{'outputLMs'}->{$tokens[0]} = $tokens[1];
}
$self->{'phraseTables'} = {}; #factor name (from @FACTORNAMES) => hashref of source phrases to anything; used for unknown-word counting
$self->{'unknownCount'} = {}; #factor name => count of unknown tokens in input
$self->{'sysoutWER'} = {}; #system name => (factor name => arrayref with system output total WER and arrayref of WER scores for individual sysout sentences wrt truth)
$self->{'sysoutPWER'} = {}; #similarly
$self->{'nnAdjWERPWER'} = {}; #system name => arrayref of [normalized WER, normalized PWER]
$self->{'perplexity'} = {}; #system name => (factor name => perplexity raw score)
$self->{'fileDescriptions'} = {}; #filename associated with us => string description of file
$self->{'bleuScores'} = {}; #system name => (factor name => arrayref of (overall score, arrayref of per-sentence scores) )
$self->{'bleuConfidence'} = {}; #system name => (factor name => arrayrefs holding statistical test data on BLEU scores)
$self->{'subsetBLEUstats'} = {}; #system name => (factor name => n-gram precisions and lengths for independent corpus subsets)
$self->{'comparisonStats'} = {}; #system name 1 => (system name 2 => (factor name => p-values, and indices of better system, for all tests used))
$self->{'cacheFilename'} = "cache/$corpusName.cache"; #all memory of various scores is stored here
bless $self, $class;
$self->locateFiles($refFileDescs); #find all relevant files in the current directory; set filenames and descriptions
$self->loadCacheFile();
print STDERR "on load:\n";
$self->printDetails();
return $self;
}
#arguments: filename
#return: description string
#throw if filename doesn't belong to this corpus
sub getFileDescription
{
my ($self, $filename) = @_;
if(!defined($self->{'fileDescriptions'}->{$filename}))
{
throw Error::Simple(-text => "Corpus::getFileDescription(): invalid filename '$filename'\n");
}
return $self->{'fileDescriptions'}->{$filename};
}
#arguments: none
#return: list of system names (NOT including 'input', 'truth' and other special cases)
sub getSystemNames
{
my $self = shift;
return keys %{$self->{'sysoutFilenames'}};
}
#calculate the number of unknown factor values for the given factor in the input file
#arguments: factor name
#return: unknown factor count, total factor count (note the total doesn't depend on the factor)
#throw if we don't have an input file or a phrase table for the given factor defined or if there's no index known for the given factor
sub calcUnknownTokens
{
my ($self, $factorName) = @_;
#check in-memory cache first
if(exists $self->{'unknownCount'}->{$factorName} && exists $self->{'tokenCount'}->{'input'})
{
return ($self->{'unknownCount'}->{$factorName}, $self->{'tokenCount'}->{'input'});
}
warn "calcing unknown tokens\n";
$self->ensureFilenameDefined('input');
$self->ensurePhraseTableDefined($factorName);
$self->ensureFactorPosDefined($factorName);
$self->loadSentences('input', $self->{'inputFilename'});
$self->loadPhraseTable($factorName);
#count unknown and total words
my ($unknownTokens, $totalTokens) = (0, 0);
my $factorIndex = $self->{'factorIndices'}->{$factorName};
foreach my $sentence (@{$self->{'input'}})
{
$totalTokens += scalar(@$sentence);
foreach my $word (@$sentence)
{
if(!defined($self->{'phraseTables'}->{$factorName}->{$word->[$factorIndex]}))
{
$unknownTokens++;
}
}
}
$self->{'unknownCount'}->{$factorName} = $unknownTokens;
$self->{'tokenCount'}->{'input'} = $totalTokens;
return ($unknownTokens, $totalTokens);
}
#arguments: system name
#return: (WER, PWER) for nouns and adjectives in given system wrt truth
#throw if given system or truth is not set or if index of 'surf' or 'pos' hasn't been specified
sub calcNounAdjWER_PWERDiff
{
my ($self, $sysname) = @_;
#check in-memory cache first
if(exists $self->{'nnAdjWERPWER'}->{$sysname})
{
return @{$self->{'nnAdjWERPWER'}->{$sysname}};
}
warn "calcing NN/JJ PWER/WER\n";
$self->ensureFilenameDefined('truth');
$self->ensureFilenameDefined($sysname);
$self->ensureFactorPosDefined('surf');
$self->ensureFactorPosDefined('pos');
$self->loadSentences('truth', $self->{'truthFilename'});
$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
#find nouns and adjectives and score them
my ($werScore, $pwerScore) = (0, 0);
my $nnNadjTags = $self->getPOSTagList('nounAndAdj');
for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)
{
my @nnAdjEWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
my @nnAdjSWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
my ($sentWer, $tmp) = $self->sentenceWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});
$werScore += $sentWer;
($sentWer, $tmp) = $self->sentencePWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});
$pwerScore += $sentWer;
}
#unhog memory
$self->releaseSentences('truth');
$self->releaseSentences($sysname);
$self->{'nnAdjWERPWER'}->{$sysname} = [$werScore / $self->{'tokenCount'}->{'truth'}, $pwerScore / $self->{'tokenCount'}->{'truth'}];
return @{$self->{'nnAdjWERPWER'}->{$sysname}};
}
#calculate detailed WER statistics and put them into $self
#arguments: system name, factor name to consider (default 'surf', surface form)
#return: overall surface WER for given system (w/o filtering)
#throw if given system or truth is not set or if index of factor name hasn't been specified
sub calcOverallWER
{
my ($self, $sysname, $factorName) = (shift, shift, 'surf');
if(scalar(@_) > 0) {$factorName = shift;}
#check in-memory cache first
if(exists $self->{'sysoutWER'}->{$sysname}->{$factorName})
{
return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0];
}
warn "calcing WER\n";
$self->ensureFilenameDefined('truth');
$self->ensureFilenameDefined($sysname);
$self->ensureFactorPosDefined($factorName);
$self->loadSentences('truth', $self->{'truthFilename'});
$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
my ($wer, $swers, $indices) = $self->corpusWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});
$self->{'sysoutWER'}->{$sysname}->{$factorName} = [$wer, $swers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence
#unhog memory
$self->releaseSentences('truth');
$self->releaseSentences($sysname);
return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};
}
#calculate detailed PWER statistics and put them into $self
#arguments: system name, factor name to consider (default 'surf')
#return: overall surface PWER for given system (w/o filtering)
#throw if given system or truth is not set or if index of factor name hasn't been specified
sub calcOverallPWER
{
my ($self, $sysname, $factorName) = (shift, shift, 'surf');
if(scalar(@_) > 0) {$factorName = shift;}
#check in-memory cache first
if(exists $self->{'sysoutPWER'}->{$sysname}->{$factorName})
{
return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0];
}
warn "calcing PWER\n";
$self->ensureFilenameDefined('truth');
$self->ensureFilenameDefined($sysname);
$self->ensureFactorPosDefined($factorName);
$self->loadSentences('truth', $self->{'truthFilename'});
$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
my ($pwer, $spwers, $indices) = $self->corpusPWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});
$self->{'sysoutPWER'}->{$sysname}->{$factorName} = [$pwer, $spwers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence
#unhog memory
$self->releaseSentences('truth');
$self->releaseSentences($sysname);
return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};
}
#arguments: system name, factor name to consider (default 'surf')
#return: array of (BLEU score, n-gram precisions, brevity penalty)
sub calcBLEU
{
my ($self, $sysname, $factorName) = (shift, shift, 'surf');
if(scalar(@_) > 0) {$factorName = shift;}
#check in-memory cache first
if(exists $self->{'bleuScores'}->{$sysname} && exists $self->{'bleuScores'}->{$sysname}->{$factorName})
{
return $self->{'bleuScores'}->{$sysname}->{$factorName};
}
warn "calcing BLEU\n";
$self->ensureFilenameDefined('truth');
$self->ensureFilenameDefined($sysname);
$self->ensureFactorPosDefined($factorName);
$self->loadSentences('truth', $self->{'truthFilename'});
$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
#score structure: various total scores, arrayref of by-sentence score arrays
if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};}
if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $totCLength, $totRLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
my $factorIndex = $self->{'factorIndices'}->{$factorName};
for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)
{
my ($truthSentence, $sysoutSentence) = ($self->{'truth'}->[$i], $self->{$sysname}->[$i]);
my ($unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength) =
$self->sentenceBLEU($truthSentence, $sysoutSentence, $factorIndex, 0); #last argument is whether to debug-print
push @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}, [$unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength];
$good1 += $unigood; $tot1 += $unicount;
$good2 += $bigood; $tot2 += $bicount;
$good3 += $trigood; $tot3 += $tricount;
$good4 += $quadrugood; $tot4 += $quadrucount;
$totCLength += $cLength;
$totRLength += $rLength;
}
my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength);
my ($pct1, $pct2, $pct3, $pct4) = ($tot1 == 0 ? -1 : $good1 / $tot1, $tot2 == 0 ? -1 : $good2 / $tot2,
$tot3 == 0 ? -1 : $good3 / $tot3, $tot4 == 0 ? -1 : $good4 / $tot4);
my ($logsum, $logcount) = (0, 0);
if($tot1 > 0) {$logsum += my_log($pct1); $logcount++;}
if($tot2 > 0) {$logsum += my_log($pct2); $logcount++;}
if($tot3 > 0) {$logsum += my_log($pct3); $logcount++;}
if($tot4 > 0) {$logsum += my_log($pct4); $logcount++;}
my $bleu = $brevity * exp($logsum / $logcount);
$self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, 100 * $pct1, 100 * $pct2, 100 * $pct3, 100 * $pct4, $brevity];
#unhog memory
$self->releaseSentences('truth');
$self->releaseSentences($sysname);
return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]};
}
#do t-tests on the whole-corpus n-gram precisions vs. the average precisions over a set number of disjoint subsets
#arguments: system name, factor name BLEU was run on (default 'surf')
#return: arrayref of [arrayref of p-values for overall precision vs. subset average, arrayrefs of [(lower, upper) 95% credible intervals for true overall n-gram precisions]]
#
#written to try to save memory
sub statisticallyTestBLEUResults
{
my ($self, $sysname, $factorName) = (shift, shift, 'surf');
if(scalar(@_) > 0) {$factorName = shift;}
#check in-memory cache first
if(exists $self->{'bleuConfidence'}->{$sysname} && exists $self->{'bleuConfidence'}->{$sysname}->{$factorName})
{
return $self->{'bleuConfidence'}->{$sysname}->{$factorName};
}
warn "performing consistency tests\n";
my $k = 30; #HARDCODED NUMBER OF SUBSETS (WE DO k-FOLD CROSS-VALIDATION); IF YOU CHANGE THIS YOU MUST ALSO CHANGE getApproxPValue() and $criticalTStat
my $criticalTStat = 2.045; #hardcoded value given alpha (.025 here) and degrees of freedom (= $k - 1) ########################################
$self->ensureFilenameDefined('truth');
$self->ensureFilenameDefined($sysname);
$self->ensureFactorPosDefined($factorName);
#ensure we have full-corpus BLEU results
if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName})
{
$self->calcBLEU($sysname, $factorName);
}
if(!exists $self->{'subsetBLEUstats'}->{$sysname}) {$self->{'subsetBLEUstats'}->{$sysname} = {};}
if(!exists $self->{'subsetBLEUstats'}->{$sysname}->{$factorName}) {$self->{'subsetBLEUstats'}->{$sysname}->{$factorName} = [];}
#calculate n-gram precisions for each small subset
my @sentenceStats = @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]};
for(my $i = 0; $i < $k; $i++)
{
my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
for(my $j = $i; $j < scalar(@sentenceStats); $j += $k) #subset #K consists of every Kth sentence
{
$good1 += $sentenceStats[$j]->[0]; $tot1 += $sentenceStats[$j]->[1];
$good2 += $sentenceStats[$j]->[2]; $tot2 += $sentenceStats[$j]->[3];
$good3 += $sentenceStats[$j]->[4]; $tot3 += $sentenceStats[$j]->[5];
$good4 += $sentenceStats[$j]->[6]; $tot4 += $sentenceStats[$j]->[7];
$sysoutLength += $sentenceStats[$j]->[8];
$truthLength += $sentenceStats[$j]->[9];
}
push @{$self->{'subsetBLEUstats'}->{$sysname}->{$factorName}}, [$good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength];
}
my $subsetStats = $self->{'subsetBLEUstats'}->{$sysname}->{$factorName};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -