📄 corpus.pm.svn-base
字号:
#package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole#Evan Herbst, 7 / 25 / 06package Corpus;BEGIN{ push @INC, "../perllib"; #for Error.pm}use Error;return 1;################################################################################################################################ 'our' variables are available outside the package ######all factor names used should be in this list, just in caseour @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph');#constructor#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), formatted string with various config info (-info_line)sub new{ my $class = shift; my %args = @_; #turn the remainder of @_ into a hash my ($corpusName, $refFileDescs, $infoLine) = ($args{'-name'}, $args{'-descriptions'}, $args{'-info_line'}); my ($factorList, $inputLingmodels, $outputLingmodels) = split(/\s*:\s*/, $infoLine); my $self = {}; $self->{'corpusName'} = $corpusName; $self->{'truth'} = []; #arrayref of arrayrefs of factors $self->{'input'} = []; #same; also same for any system outputs that get loaded $self->{'tokenCount'} = {}; #sysname => number of tokens in file $self->{'truthFilename'} = ""; $self->{'inputFilename'} = ""; $self->{'sysoutFilenames'} = {}; #hashref of (string => string) for (system name, filename) $self->{'phraseTableFilenames'} = {}; #factor name => filename $self->{'fileCtimes'} = {}; #file ID of some kind => changetime in seconds $self->{'factorIndices'} = {}; #factor name => index my @factors = split(/\s+/, $factorList); for(my $i = 0; $i < scalar(@factors); $i++) { $self->{'factorIndices'}->{$factors[$i]} = $i; } $self->{'inputLMs'} = {}; #factor name => lingmodel filename $self->{'outputLMs'} = {}; foreach my $lmInfo (split(/\s*,\s*/, $inputLingmodels)) { my @tokens = split(/\s+/, $lmInfo); $self->{'inputLMs'}->{$tokens[0]} = $tokens[1]; } foreach my $lmInfo (split(/\s*,\s*/, $outputLingmodels)) { my @tokens = split(/\s+/, $lmInfo); $self->{'outputLMs'}->{$tokens[0]} = $tokens[1]; } $self->{'phraseTables'} = {}; #factor name (from @FACTORNAMES) => hashref of source phrases to anything; used for unknown-word counting $self->{'unknownCount'} = {}; #factor name => count of unknown tokens in input $self->{'sysoutWER'} = {}; #system name => (factor name => arrayref with system output total WER and arrayref of WER scores for individual sysout sentences wrt truth) $self->{'sysoutPWER'} = {}; #similarly $self->{'nnAdjWERPWER'} = {}; #system name => arrayref of [normalized WER, normalized PWER] $self->{'perplexity'} = {}; #system name => (factor name => perplexity raw score) $self->{'fileDescriptions'} = {}; #filename associated with us => string description of file $self->{'bleuScores'} = {}; #system name => (factor name => arrayref of (overall score, arrayref of per-sentence scores) ) $self->{'bleuConfidence'} = {}; #system name => (factor name => arrayrefs holding statistical test data on BLEU scores) $self->{'subsetBLEUstats'} = {}; #system name => (factor name => n-gram precisions and lengths for independent corpus subsets) $self->{'comparisonStats'} = {}; #system name 1 => (system name 2 => (factor name => p-values, and indices of better system, for all tests used)) $self->{'cacheFilename'} = "cache/$corpusName.cache"; #all memory of various scores is stored here bless $self, $class; $self->locateFiles($refFileDescs); #find all relevant files in the current directory; set filenames and descriptions $self->loadCacheFile(); print STDERR "on load:\n"; $self->printDetails(); return $self;}#arguments: filename#return: description string#throw if filename doesn't belong to this corpussub getFileDescription{ my ($self, $filename) = @_; if(!defined($self->{'fileDescriptions'}->{$filename})) { throw Error::Simple(-text => "Corpus::getFileDescription(): invalid filename '$filename'\n"); } return $self->{'fileDescriptions'}->{$filename};}#arguments: none#return: list of system names (NOT including 'input', 'truth' and other special cases)sub getSystemNames{ my $self = shift; return keys %{$self->{'sysoutFilenames'}};}#calculate the number of unknown factor values for the given factor in the input file#arguments: factor name#return: unknown factor count, total factor count (note the total doesn't depend on the factor)#throw if we don't have an input file or a phrase table for the given factor defined or if there's no index known for the given factorsub calcUnknownTokens{ my ($self, $factorName) = @_; #check in-memory cache first if(exists $self->{'unknownCount'}->{$factorName} && exists $self->{'tokenCount'}->{'input'}) { return ($self->{'unknownCount'}->{$factorName}, $self->{'tokenCount'}->{'input'}); } warn "calcing unknown tokens\n"; $self->ensureFilenameDefined('input'); $self->ensurePhraseTableDefined($factorName); $self->ensureFactorPosDefined($factorName); $self->loadSentences('input', $self->{'inputFilename'}); $self->loadPhraseTable($factorName); #count unknown and total words my ($unknownTokens, $totalTokens) = (0, 0); my $factorIndex = $self->{'factorIndices'}->{$factorName}; foreach my $sentence (@{$self->{'input'}}) { $totalTokens += scalar(@$sentence); foreach my $word (@$sentence) { if(!defined($self->{'phraseTables'}->{$factorName}->{$word->[$factorIndex]})) { $unknownTokens++; } } } $self->{'unknownCount'}->{$factorName} = $unknownTokens; $self->{'tokenCount'}->{'input'} = $totalTokens; return ($unknownTokens, $totalTokens);}#arguments: system name#return: (WER, PWER) for nouns and adjectives in given system wrt truth#throw if given system or truth is not set or if index of 'surf' or 'pos' hasn't been specifiedsub calcNounAdjWER_PWERDiff{ my ($self, $sysname) = @_; #check in-memory cache first if(exists $self->{'nnAdjWERPWER'}->{$sysname}) { return @{$self->{'nnAdjWERPWER'}->{$sysname}}; } warn "calcing NN/JJ PWER/WER\n"; $self->ensureFilenameDefined('truth'); $self->ensureFilenameDefined($sysname); $self->ensureFactorPosDefined('surf'); $self->ensureFactorPosDefined('pos'); $self->loadSentences('truth', $self->{'truthFilename'}); $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); #find nouns and adjectives and score them my ($werScore, $pwerScore) = (0, 0); my $nnNadjTags = $self->getPOSTagList('nounAndAdj'); for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++) { my @nnAdjEWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags); my @nnAdjSWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags); my ($sentWer, $tmp) = $self->sentenceWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'}); $werScore += $sentWer; ($sentWer, $tmp) = $self->sentencePWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'}); $pwerScore += $sentWer; } #unhog memory $self->releaseSentences('truth'); $self->releaseSentences($sysname); $self->{'nnAdjWERPWER'}->{$sysname} = [$werScore / $self->{'tokenCount'}->{'truth'}, $pwerScore / $self->{'tokenCount'}->{'truth'}]; return @{$self->{'nnAdjWERPWER'}->{$sysname}};}#calculate detailed WER statistics and put them into $self#arguments: system name, factor name to consider (default 'surf', surface form)#return: overall surface WER for given system (w/o filtering)#throw if given system or truth is not set or if index of factor name hasn't been specifiedsub calcOverallWER{ my ($self, $sysname, $factorName) = (shift, shift, 'surf'); if(scalar(@_) > 0) {$factorName = shift;} #check in-memory cache first if(exists $self->{'sysoutWER'}->{$sysname}->{$factorName}) { return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0]; } warn "calcing WER\n"; $self->ensureFilenameDefined('truth'); $self->ensureFilenameDefined($sysname); $self->ensureFactorPosDefined($factorName); $self->loadSentences('truth', $self->{'truthFilename'}); $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); my ($wer, $swers, $indices) = $self->corpusWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName}); $self->{'sysoutWER'}->{$sysname}->{$factorName} = [$wer, $swers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence #unhog memory $self->releaseSentences('truth'); $self->releaseSentences($sysname); return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};}#calculate detailed PWER statistics and put them into $self#arguments: system name, factor name to consider (default 'surf')#return: overall surface PWER for given system (w/o filtering)#throw if given system or truth is not set or if index of factor name hasn't been specifiedsub calcOverallPWER{ my ($self, $sysname, $factorName) = (shift, shift, 'surf'); if(scalar(@_) > 0) {$factorName = shift;} #check in-memory cache first if(exists $self->{'sysoutPWER'}->{$sysname}->{$factorName}) { return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0]; } warn "calcing PWER\n"; $self->ensureFilenameDefined('truth'); $self->ensureFilenameDefined($sysname); $self->ensureFactorPosDefined($factorName); $self->loadSentences('truth', $self->{'truthFilename'}); $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); my ($pwer, $spwers, $indices) = $self->corpusPWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName}); $self->{'sysoutPWER'}->{$sysname}->{$factorName} = [$pwer, $spwers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence #unhog memory $self->releaseSentences('truth'); $self->releaseSentences($sysname); return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};}#arguments: system name, factor name to consider (default 'surf')#return: array of (BLEU score, n-gram precisions, brevity penalty)sub calcBLEU{ my ($self, $sysname, $factorName) = (shift, shift, 'surf'); if(scalar(@_) > 0) {$factorName = shift;} #check in-memory cache first if(exists $self->{'bleuScores'}->{$sysname} && exists $self->{'bleuScores'}->{$sysname}->{$factorName}) { return $self->{'bleuScores'}->{$sysname}->{$factorName}; } warn "calcing BLEU\n"; $self->ensureFilenameDefined('truth'); $self->ensureFilenameDefined($sysname); $self->ensureFactorPosDefined($factorName); $self->loadSentences('truth', $self->{'truthFilename'}); $self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname}); #score structure: various total scores, arrayref of by-sentence score arrays if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};} if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];} my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $totCLength, $totRLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0); my $factorIndex = $self->{'factorIndices'}->{$factorName}; for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++) { my ($truthSentence, $sysoutSentence) = ($self->{'truth'}->[$i], $self->{$sysname}->[$i]); my ($unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength) = $self->sentenceBLEU($truthSentence, $sysoutSentence, $factorIndex, 0); #last argument is whether to debug-print push @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}, [$unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength]; $good1 += $unigood; $tot1 += $unicount; $good2 += $bigood; $tot2 += $bicount; $good3 += $trigood; $tot3 += $tricount; $good4 += $quadrugood; $tot4 += $quadrucount; $totCLength += $cLength; $totRLength += $rLength; } my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength); my ($pct1, $pct2, $pct3, $pct4) = ($tot1 == 0 ? -1 : $good1 / $tot1, $tot2 == 0 ? -1 : $good2 / $tot2, $tot3 == 0 ? -1 : $good3 / $tot3, $tot4 == 0 ? -1 : $good4 / $tot4); my ($logsum, $logcount) = (0, 0); if($tot1 > 0) {$logsum += my_log($pct1); $logcount++;} if($tot2 > 0) {$logsum += my_log($pct2); $logcount++;} if($tot3 > 0) {$logsum += my_log($pct3); $logcount++;} if($tot4 > 0) {$logsum += my_log($pct4); $logcount++;} my $bleu = $brevity * exp($logsum / $logcount); $self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, 100 * $pct1, 100 * $pct2, 100 * $pct3, 100 * $pct4, $brevity]; #unhog memory $self->releaseSentences('truth'); $self->releaseSentences($sysname); return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]};}#do t-tests on the whole-corpus n-gram precisions vs. the average precisions over a set number of disjoint subsets#arguments: system name, factor name BLEU was run on (default 'surf')#return: arrayref of [arrayref of p-values for overall precision vs. subset average, arrayrefs of [(lower, upper) 95% credible intervals for true overall n-gram precisions]]##written to try to save memorysub statisticallyTestBLEUResults{ my ($self, $sysname, $factorName) = (shift, shift, 'surf'); if(scalar(@_) > 0) {$factorName = shift;} #check in-memory cache first if(exists $self->{'bleuConfidence'}->{$sysname} && exists $self->{'bleuConfidence'}->{$sysname}->{$factorName}) { return $self->{'bleuConfidence'}->{$sysname}->{$factorName}; } warn "performing consistency tests\n"; my $k = 30; #HARDCODED NUMBER OF SUBSETS (WE DO k-FOLD CROSS-VALIDATION); IF YOU CHANGE THIS YOU MUST ALSO CHANGE getApproxPValue() and $criticalTStat my $criticalTStat = 2.045; #hardcoded value given alpha (.025 here) and degrees of freedom (= $k - 1) ######################################## $self->ensureFilenameDefined('truth'); $self->ensureFilenameDefined($sysname); $self->ensureFactorPosDefined($factorName); #ensure we have full-corpus BLEU results if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) { $self->calcBLEU($sysname, $factorName); } if(!exists $self->{'subsetBLEUstats'}->{$sysname}) {$self->{'subsetBLEUstats'}->{$sysname} = {};} if(!exists $self->{'subsetBLEUstats'}->{$sysname}->{$factorName}) {$self->{'subsetBLEUstats'}->{$sysname}->{$factorName} = [];} #calculate n-gram precisions for each small subset my @sentenceStats = @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}; for(my $i = 0; $i < $k; $i++) { my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0); for(my $j = $i; $j < scalar(@sentenceStats); $j += $k) #subset #K consists of every Kth sentence { $good1 += $sentenceStats[$j]->[0]; $tot1 += $sentenceStats[$j]->[1]; $good2 += $sentenceStats[$j]->[2]; $tot2 += $sentenceStats[$j]->[3]; $good3 += $sentenceStats[$j]->[4]; $tot3 += $sentenceStats[$j]->[5]; $good4 += $sentenceStats[$j]->[6]; $tot4 += $sentenceStats[$j]->[7]; $sysoutLength += $sentenceStats[$j]->[8]; $truthLength += $sentenceStats[$j]->[9]; } push @{$self->{'subsetBLEUstats'}->{$sysname}->{$factorName}}, [$good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength]; } my $subsetStats = $self->{'subsetBLEUstats'}->{$sysname}->{$factorName};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -