📄 corpus.pm.svn-base
字号:
} elsif($mode eq 'unk') { local ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line); next if !$self->cacheIsCurrentForFile('f') || !$self->cacheIsCurrentForFile("pt_$factorName"); if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename) { $self->{'unknownCount'}->{$factorName} = $unknownCount; $self->{'totalTokens'} = $totalCount; } } elsif($mode eq 'wer') { local ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5); #werType is 'sysoutWER' or 'sysoutPWER' next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e'); $details =~ /^([^;]*);(.*)/; my @sentenceWERs = split(/\s+/, $1); if(!exists $self->{$werType}->{$sysname}) {$self->{$werType}->{$sysname} = {};} $self->{$werType}->{$sysname}->{$factorName} = [$totalWER, \@sentenceWERs, []]; my @indexLists = split(/;/, $2); for(my $i = 0; $i < scalar(@sentenceWERs); $i++) { my @indices = grep(/\S/, split(/\s+/, $indexLists[$i])); #find all nonempty tokens $self->{$werType}->{$sysname}->{$factorName}->[2] = \@indices; } } elsif($mode eq 'ppl') { local ($sysname, $factorName, $perplexity) = split(/\s+/, $line); next if !$self->cacheIsCurrentForFile($sysname); if(!exists $self->{'perplexity'}->{$sysname}) {$self->{'perplexity'}->{$sysname} = {};} $self->{'perplexity'}->{$sysname}->{$factorName} = $perplexity; } elsif($mode eq 'nawp') { local ($sysname, @scores) = split(/\s+/, $line); next if !$self->cacheIsCurrentForFile($sysname); $self->{'nnAdjWERPWER'}->{$sysname} = \@scores; } } close(CACHEFILE);}#arguments: cache type ('bleu' | ...), system name, factor name#return: nonesub flushCache{ my ($self, $cacheType, $sysname, $factorName) = @_; if($cacheType eq 'bleu') { if(defined($self->{'bleuScores'}->{$sysname}) && defined($self->{'bleuScores'}->{$sysname}->{$factorName})) { delete $self->{'bleuScores'}->{$sysname}->{$factorName}; } }}#arguments: file extension#return: whether (0/1) our cache for the given file is at least as recent as the filesub cacheIsCurrentForFile{ my ($self, $ext) = @_; return 0 if !exists $self->{'fileCtimes'}->{$ext} ; my @liveStats = stat($self->{'corpusName'} . ".$ext"); return ($liveStats[9] <= $self->{'fileCtimes'}->{$ext}) ? 1 : 0;}##### utils ######arguments: a, b (scalars)sub min{ my ($a, $b) = @_; return ($a < $b) ? $a : $b;}#arguments: a, b (scalars)sub max{ my ($a, $b) = @_; return ($a > $b) ? $a : $b;}#arguments: xsub my_log{ return -9999999999 unless $_[0]; return log($_[0]);}#arguments: xsub round{ my $x = shift; if($x - int($x) < .5) {return int($x);} return int($x) + 1;}#return an approximation of the p-value for a given t FOR A HARDCODED NUMBER OF DEGREES OF FREEDOM# (IF YOU CHANGE THIS HARDCODED NUMBER YOU MUST ALSO CHANGE statisticallyTestBLEUResults() and getLowerBoundPValue() )#arguments: the t statistic, $t#return: a lower bound on the probability mass outside (beyond) +/-$t in the t distribution##for a wonderful t-distribution calculator, see <http://math.uc.edu/~brycw/classes/148/tables.htm#t>. UC.edu is Cincinnati.sub getLowerBoundPValue{ my $t = abs(shift); #encode various known p-values for ###### DOF = 29 ###### my %t2p = #since we're comparing (hopefully) very similar values, this chart is weighted toward the low end of the t-stat ( 0.0063 => .995, 0.0126 => .99, 0.0253 => .98, 0.0380 => .97, 0.0506 => .96, 0.0633 => .95, 0.0950 => .925, 0.127 => .9, 0.191 => .85, 0.256 => .8, 0.389 => .7, 0.530 => .6, 0.683 => .5, 0.854 => .4, 1.055 => .3, 1.311 => .2, 1.699 => .1 ); foreach my $tCmp (sort keys %t2p) {return $t2p{$tCmp} if $t <= $tCmp;} return 0; #loosest bound ever! groovy, man}#arguments: the t statistic, $t#return: an upper bound on the probability mass outside (beyond) +/-$t in the t distributionsub getUpperBoundPValue{ my $t = abs(shift); #encode various known p-values for ###### DOF = 29 ###### my %t2p = ( 4.506 => .0001, 4.254 => .0002, 3.918 => .0005, 3.659 => .001, 3.396 => .002, 3.038 => .005, 2.756 => .01, 2.462 => .02, 2.045 => .05, 1.699 => .1, 1.311 => .2, 0.683 => .5 ); foreach my $tCmp (reverse sort keys %t2p) {return $t2p{$tCmp} if $t >= $tCmp;} return 1; #loosest bound ever!}#arguments: n, r#return: binomial coefficient for p = .5 (ie nCr * (1/2)^n)sub binCoeff{ my ($n, $r) = @_; my $coeff = 1; for(my $i = $r + 1; $i <= $n; $i++) {$coeff *= $i; $coeff /= ($i - $r);} return $coeff * (.5 ** $n);}#throw if the given factor doesn't have an index defined#arguments: factor name#return: nonesub ensureFactorPosDefined{ my ($self, $factorName) = @_; if(!defined($self->{'factorIndices'}->{$factorName})) { throw Error::Simple(-text => "Corpus: no index known for factor '$factorName'\n"); }}#throw if the filename field corresponding to the argument hasn't been defined#arguments: 'truth' | 'input' | a system name#return: nonesub ensureFilenameDefined{ my ($self, $sysname) = @_; if($sysname eq 'truth' || $sysname eq 'input') { if(!defined($self->{"${sysname}Filename"})) { throw Error::Simple(-text => "Corpus: no $sysname corpus defined\n"); } } else { if(!defined($self->{'sysoutFilenames'}->{$sysname})) { throw Error::Simple(-text => "Corpus: no system $sysname defined\n"); } }}#throw if there isn't a defined phrase-table filename for the given factor#arguments: factor name#return: nonesub ensurePhraseTableDefined{ my ($self, $factorName) = @_; if(!defined($self->{'phraseTableFilenames'}->{$factorName})) { throw Error::Simple(-text => "Corpus: no phrase table defined for factor '$factorName'\n"); }}#search current directory for files with our corpus name as basename and set filename fields of $self#arguments: hashref of filenames to descriptions#return: nonesub locateFiles{ my ($self, $refDescs) = @_; open(DIR, "ls -x1 . |") or die "Corpus::locateFiles(): couldn't list current directory\n"; my $corpusName = $self->{'corpusName'}; while(my $filename = <DIR>) { chop $filename; #remove \n if($filename =~ /^$corpusName\.(.*)$/) { my $ext = $1; if($ext eq 'e') {$self->{'truthFilename'} = $filename;} elsif($ext eq 'f') {$self->{'inputFilename'} = $filename;} elsif($ext =~ /pt_(.*)/) {$self->{'phraseTableFilenames'}->{$1} = $filename;} else {$self->{'sysoutFilenames'}->{$ext} = $filename;} if(defined($refDescs->{$filename})) { $self->{'fileDescriptions'}->{$filename} = $refDescs->{$filename}; } } } close(DIR);}#arguments: type ('truth' | 'input' | a string to represent a system output), filename#pre: filename exists#return: nonesub loadSentences{ my ($self, $sysname, $filename) = @_; #if the sentences are already loaded, leave them be if(exists $self->{$sysname} && scalar(@{$self->{$sysname}}) > 0) {return;} $self->{$sysname} = []; $self->{'tokenCount'}->{$sysname} = 0; open(INFILE, "<$filename") or die "Corpus::load(): couldn't open '$filename' for read\n"; while(my $line = <INFILE>) { my @words = split(/\s+/, $line); $self->{'tokenCount'}->{$sysname} += scalar(@words); my $refFactors = []; foreach my $word (@words) { my @factors = split(/\|/, $word); push @$refFactors, \@factors; } push @{$self->{$sysname}}, $refFactors; } close(INFILE); }#free the memory used for the given corpus (but NOT any associated calculations, eg WER)#arguments: type ('truth' | 'input' | a string to represent a system output)#return: nonesub releaseSentences{# my ($self, $sysname) = @_;# $self->{$sysname} = [];}#arguments: factor name#return: none#throw if we don't have a filename for the given phrase tablesub loadPhraseTable{ my ($self, $factorName) = @_; $self->ensurePhraseTableDefined($factorName); my $filename = $self->{'phraseTableFilenames'}->{$factorName}; open(PTABLE, "<$filename") or die "couldn't open '$filename' for read\n"; $self->{'phraseTables'}->{$factorName} = {}; #create ref to phrase table (hash of strings, for source phrases, to anything whatsoever) #assume the table is sorted so that duplicate source phrases will be consecutive while(my $line = <PTABLE>) { my @phrases = split(/\s*\|\|\|\s*/, $line, 2); $self->{'phraseTables'}->{$factorName}->{$phrases[0]} = 0; #just so that it's set to something } close(PTABLE);}#arguments: factor name#return: nonesub releasePhraseTable{ my ($self, $factorName) = @_; $self->{'phraseTables'}->{$factorName} = {};}#arguments: name of list ('nounAndAdj' | ...)#return: arrayref of strings (postags)sub getPOSTagList{ my ($self, $listname) = @_; ##### assume PTB tagset ##### if($listname eq 'nounAndAdj') {return ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'];}# if($listname eq '') {return [];}}#arguments: list to be filtered (arrayref of arrayrefs of factor strings), desired factor index, arrayref of allowable values#return: filtered list as array of arrayrefs of factor stringssub filterFactors{ my ($self, $refFullList, $index, $refFactorValues) = @_; my $valuesRegex = join("|", @$refFactorValues); my @filteredList = (); foreach my $factors (@$refFullList) { if($factors->[$index] =~ m/$valuesRegex/) { push @filteredList, $factors; } } return @filteredList;}#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful wordssub corpusWER{ my ($self, $refSysOutput, $refTruth, $index) = @_; my ($totWER, $sentenceWER, $errIndices) = (0, [], []); for(my $i = 0; $i < scalar(@$refSysOutput); $i++) { my ($sentWER, $indices) = $self->sentenceWER($refSysOutput->[$i], $refTruth->[$i], $index); $totWER += $sentWER;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -