📄 corpus.pm
字号:
}
elsif($mode eq 'unk')
{
local ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line);
next if !$self->cacheIsCurrentForFile('f') || !$self->cacheIsCurrentForFile("pt_$factorName");
if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename)
{
$self->{'unknownCount'}->{$factorName} = $unknownCount;
$self->{'totalTokens'} = $totalCount;
}
}
elsif($mode eq 'wer')
{
local ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5); #werType is 'sysoutWER' or 'sysoutPWER'
next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
$details =~ /^([^;]*);(.*)/;
my @sentenceWERs = split(/\s+/, $1);
if(!exists $self->{$werType}->{$sysname}) {$self->{$werType}->{$sysname} = {};}
$self->{$werType}->{$sysname}->{$factorName} = [$totalWER, \@sentenceWERs, []];
my @indexLists = split(/;/, $2);
for(my $i = 0; $i < scalar(@sentenceWERs); $i++)
{
my @indices = grep(/\S/, split(/\s+/, $indexLists[$i])); #find all nonempty tokens
$self->{$werType}->{$sysname}->{$factorName}->[2] = \@indices;
}
}
elsif($mode eq 'ppl')
{
local ($sysname, $factorName, $perplexity) = split(/\s+/, $line);
next if !$self->cacheIsCurrentForFile($sysname);
if(!exists $self->{'perplexity'}->{$sysname}) {$self->{'perplexity'}->{$sysname} = {};}
$self->{'perplexity'}->{$sysname}->{$factorName} = $perplexity;
}
elsif($mode eq 'nawp')
{
local ($sysname, @scores) = split(/\s+/, $line);
next if !$self->cacheIsCurrentForFile($sysname);
$self->{'nnAdjWERPWER'}->{$sysname} = \@scores;
}
}
close(CACHEFILE);
}
#arguments: cache type ('bleu' | ...), system name, factor name
#return: none
sub flushCache
{
my ($self, $cacheType, $sysname, $factorName) = @_;
if($cacheType eq 'bleu')
{
if(defined($self->{'bleuScores'}->{$sysname}) && defined($self->{'bleuScores'}->{$sysname}->{$factorName}))
{
delete $self->{'bleuScores'}->{$sysname}->{$factorName};
}
}
}
#arguments: file extension
#return: whether (0/1) our cache for the given file is at least as recent as the file
sub cacheIsCurrentForFile
{
my ($self, $ext) = @_;
return 0 if !exists $self->{'fileCtimes'}->{$ext} ;
my @liveStats = stat($self->{'corpusName'} . ".$ext");
return ($liveStats[9] <= $self->{'fileCtimes'}->{$ext}) ? 1 : 0;
}
##### utils #####
#arguments: a, b (scalars)
sub min
{
my ($a, $b) = @_;
return ($a < $b) ? $a : $b;
}
#arguments: a, b (scalars)
sub max
{
my ($a, $b) = @_;
return ($a > $b) ? $a : $b;
}
#arguments: x
sub my_log
{
return -9999999999 unless $_[0];
return log($_[0]);
}
#arguments: x
sub round
{
my $x = shift;
if($x - int($x) < .5) {return int($x);}
return int($x) + 1;
}
#return an approximation of the p-value for a given t FOR A HARDCODED NUMBER OF DEGREES OF FREEDOM
# (IF YOU CHANGE THIS HARDCODED NUMBER YOU MUST ALSO CHANGE statisticallyTestBLEUResults() and getLowerBoundPValue() )
#arguments: the t statistic, $t
#return: a lower bound on the probability mass outside (beyond) +/-$t in the t distribution
#
#for a wonderful t-distribution calculator, see <http://math.uc.edu/~brycw/classes/148/tables.htm#t>. UC.edu is Cincinnati.
sub getLowerBoundPValue
{
my $t = abs(shift);
#encode various known p-values for ###### DOF = 29 ######
my %t2p = #since we're comparing (hopefully) very similar values, this chart is weighted toward the low end of the t-stat
(
0.0063 => .995,
0.0126 => .99,
0.0253 => .98,
0.0380 => .97,
0.0506 => .96,
0.0633 => .95,
0.0950 => .925,
0.127 => .9,
0.191 => .85,
0.256 => .8,
0.389 => .7,
0.530 => .6,
0.683 => .5,
0.854 => .4,
1.055 => .3,
1.311 => .2,
1.699 => .1
);
foreach my $tCmp (sort keys %t2p) {return $t2p{$tCmp} if $t <= $tCmp;}
return 0; #loosest bound ever! groovy, man
}
#arguments: the t statistic, $t
#return: an upper bound on the probability mass outside (beyond) +/-$t in the t distribution
sub getUpperBoundPValue
{
my $t = abs(shift);
#encode various known p-values for ###### DOF = 29 ######
my %t2p =
(
4.506 => .0001,
4.254 => .0002,
3.918 => .0005,
3.659 => .001,
3.396 => .002,
3.038 => .005,
2.756 => .01,
2.462 => .02,
2.045 => .05,
1.699 => .1,
1.311 => .2,
0.683 => .5
);
foreach my $tCmp (reverse sort keys %t2p) {return $t2p{$tCmp} if $t >= $tCmp;}
return 1; #loosest bound ever!
}
#arguments: n, r
#return: binomial coefficient for p = .5 (ie nCr * (1/2)^n)
sub binCoeff
{
my ($n, $r) = @_;
my $coeff = 1;
for(my $i = $r + 1; $i <= $n; $i++) {$coeff *= $i; $coeff /= ($i - $r);}
return $coeff * (.5 ** $n);
}
#throw if the given factor doesn't have an index defined
#arguments: factor name
#return: none
sub ensureFactorPosDefined
{
my ($self, $factorName) = @_;
if(!defined($self->{'factorIndices'}->{$factorName}))
{
throw Error::Simple(-text => "Corpus: no index known for factor '$factorName'\n");
}
}
#throw if the filename field corresponding to the argument hasn't been defined
#arguments: 'truth' | 'input' | a system name
#return: none
sub ensureFilenameDefined
{
my ($self, $sysname) = @_;
if($sysname eq 'truth' || $sysname eq 'input')
{
if(!defined($self->{"${sysname}Filename"}))
{
throw Error::Simple(-text => "Corpus: no $sysname corpus defined\n");
}
}
else
{
if(!defined($self->{'sysoutFilenames'}->{$sysname}))
{
throw Error::Simple(-text => "Corpus: no system $sysname defined\n");
}
}
}
#throw if there isn't a defined phrase-table filename for the given factor
#arguments: factor name
#return: none
sub ensurePhraseTableDefined
{
my ($self, $factorName) = @_;
if(!defined($self->{'phraseTableFilenames'}->{$factorName}))
{
throw Error::Simple(-text => "Corpus: no phrase table defined for factor '$factorName'\n");
}
}
#search current directory for files with our corpus name as basename and set filename fields of $self
#arguments: hashref of filenames to descriptions
#return: none
sub locateFiles
{
my ($self, $refDescs) = @_;
open(DIR, "ls -x1 . |") or die "Corpus::locateFiles(): couldn't list current directory\n";
my $corpusName = $self->{'corpusName'};
while(my $filename = <DIR>)
{
chop $filename; #remove \n
if($filename =~ /^$corpusName\.(.*)$/)
{
my $ext = $1;
if($ext eq 'e') {$self->{'truthFilename'} = $filename;}
elsif($ext eq 'f') {$self->{'inputFilename'} = $filename;}
elsif($ext =~ /pt_(.*)/) {$self->{'phraseTableFilenames'}->{$1} = $filename;}
else {$self->{'sysoutFilenames'}->{$ext} = $filename;}
if(defined($refDescs->{$filename}))
{
$self->{'fileDescriptions'}->{$filename} = $refDescs->{$filename};
}
}
}
close(DIR);
}
#arguments: type ('truth' | 'input' | a string to represent a system output), filename
#pre: filename exists
#return: none
sub loadSentences
{
my ($self, $sysname, $filename) = @_;
#if the sentences are already loaded, leave them be
if(exists $self->{$sysname} && scalar(@{$self->{$sysname}}) > 0) {return;}
$self->{$sysname} = [];
$self->{'tokenCount'}->{$sysname} = 0;
open(INFILE, "<$filename") or die "Corpus::load(): couldn't open '$filename' for read\n";
while(my $line = <INFILE>)
{
my @words = split(/\s+/, $line);
$self->{'tokenCount'}->{$sysname} += scalar(@words);
my $refFactors = [];
foreach my $word (@words)
{
my @factors = split(/\|/, $word);
push @$refFactors, \@factors;
}
push @{$self->{$sysname}}, $refFactors;
}
close(INFILE);
}
#free the memory used for the given corpus (but NOT any associated calculations, eg WER)
#arguments: type ('truth' | 'input' | a string to represent a system output)
#return: none
sub releaseSentences
{
# my ($self, $sysname) = @_;
# $self->{$sysname} = [];
}
#arguments: factor name
#return: none
#throw if we don't have a filename for the given phrase table
sub loadPhraseTable
{
my ($self, $factorName) = @_;
$self->ensurePhraseTableDefined($factorName);
my $filename = $self->{'phraseTableFilenames'}->{$factorName};
open(PTABLE, "<$filename") or die "couldn't open '$filename' for read\n";
$self->{'phraseTables'}->{$factorName} = {}; #create ref to phrase table (hash of strings, for source phrases, to anything whatsoever)
#assume the table is sorted so that duplicate source phrases will be consecutive
while(my $line = <PTABLE>)
{
my @phrases = split(/\s*\|\|\|\s*/, $line, 2);
$self->{'phraseTables'}->{$factorName}->{$phrases[0]} = 0; #just so that it's set to something
}
close(PTABLE);
}
#arguments: factor name
#return: none
sub releasePhraseTable
{
my ($self, $factorName) = @_;
$self->{'phraseTables'}->{$factorName} = {};
}
#arguments: name of list ('nounAndAdj' | ...)
#return: arrayref of strings (postags)
sub getPOSTagList
{
my ($self, $listname) = @_;
##### assume PTB tagset #####
if($listname eq 'nounAndAdj') {return ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'];}
# if($listname eq '') {return [];}
}
#arguments: list to be filtered (arrayref of arrayrefs of factor strings), desired factor index, arrayref of allowable values
#return: filtered list as array of arrayrefs of factor strings
sub filterFactors
{
my ($self, $refFullList, $index, $refFactorValues) = @_;
my $valuesRegex = join("|", @$refFactorValues);
my @filteredList = ();
foreach my $factors (@$refFullList)
{
if($factors->[$index] =~ m/$valuesRegex/)
{
push @filteredList, $factors;
}
}
return @filteredList;
}
#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use
#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words
sub corpusWER
{
my ($self, $refSysOutput, $refTruth, $index) = @_;
my ($totWER, $sentenceWER, $errIndices) = (0, [], []);
for(my $i = 0; $i < scalar(@$refSysOutput); $i++)
{
my ($sentWER, $indices) = $self->sentenceWER($refSysOutput->[$i], $refTruth->[$i], $index);
$totWER += $sentWER;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -