📄 corpus.pm

📁 moses开源的机器翻译系统
💻 PM
📖 第 1 页 / 共 4 页
字号:
		}
		elsif($mode eq 'unk')
		{
			local ($factorName, $phraseTableFilename, $unknownCount, $totalCount) = split(' ', $line);
			next if !$self->cacheIsCurrentForFile('f') || !$self->cacheIsCurrentForFile("pt_$factorName");
			if(defined($self->{'phraseTableFilenames'}->{$factorName}) && $self->{'phraseTableFilenames'}->{$factorName} eq $phraseTableFilename)
			{
				$self->{'unknownCount'}->{$factorName} = $unknownCount;
				$self->{'totalTokens'} = $totalCount;
			}
		}
		elsif($mode eq 'wer')
		{
			local ($werType, $sysname, $factorName, $totalWER, $details) = split(/\s+/, $line, 5); #werType is 'sysoutWER' or 'sysoutPWER'
			next if !$self->cacheIsCurrentForFile($sysname) || !$self->cacheIsCurrentForFile('e');
			$details =~ /^([^;]*);(.*)/;
			my @sentenceWERs = split(/\s+/, $1);
			if(!exists $self->{$werType}->{$sysname}) {$self->{$werType}->{$sysname} = {};}
			$self->{$werType}->{$sysname}->{$factorName} = [$totalWER, \@sentenceWERs, []];
			my @indexLists = split(/;/, $2);
			for(my $i = 0; $i < scalar(@sentenceWERs); $i++)
			{
				my @indices = grep(/\S/, split(/\s+/, $indexLists[$i])); #find all nonempty tokens
				$self->{$werType}->{$sysname}->{$factorName}->[2] = \@indices;
			}
		}
		elsif($mode eq 'ppl')
		{
			local ($sysname, $factorName, $perplexity) = split(/\s+/, $line);
			next if !$self->cacheIsCurrentForFile($sysname);
			if(!exists $self->{'perplexity'}->{$sysname}) {$self->{'perplexity'}->{$sysname} = {};}
			$self->{'perplexity'}->{$sysname}->{$factorName} = $perplexity;
		}
		elsif($mode eq 'nawp')
		{
			local ($sysname, @scores) = split(/\s+/, $line);
			next if !$self->cacheIsCurrentForFile($sysname);
			$self->{'nnAdjWERPWER'}->{$sysname} = \@scores;
		}
	}
	close(CACHEFILE);
}

#arguments: cache type ('bleu' | ...), system name, factor name
#return: none
sub flushCache
{
	my ($self, $cacheType, $sysname, $factorName) = @_;
	if($cacheType eq 'bleu')
	{
		if(defined($self->{'bleuScores'}->{$sysname}) && defined($self->{'bleuScores'}->{$sysname}->{$factorName}))
		{
			delete $self->{'bleuScores'}->{$sysname}->{$factorName};
		}
	}
}

#arguments: file extension
#return: whether (0/1) our cache for the given file is at least as recent as the file
sub cacheIsCurrentForFile
{
	my ($self, $ext) = @_;
	return 0 if !exists $self->{'fileCtimes'}->{$ext} ;
	my @liveStats = stat($self->{'corpusName'} . ".$ext");
	return ($liveStats[9] <= $self->{'fileCtimes'}->{$ext}) ? 1 : 0;
}

##### utils #####
#arguments: a, b (scalars)
sub min
{
	my ($a, $b) = @_;
	return ($a < $b) ? $a : $b;
}
#arguments: a, b (scalars)
sub max
{
	my ($a, $b) = @_;
	return ($a > $b) ? $a : $b;
}
#arguments: x
sub my_log
{
  return -9999999999 unless $_[0];
  return log($_[0]);
}
#arguments: x
sub round
{
	my $x = shift;
	if($x - int($x) < .5) {return int($x);}
	return int($x) + 1;
}

#return an approximation of the p-value for a given t FOR A HARDCODED NUMBER OF DEGREES OF FREEDOM
# (IF YOU CHANGE THIS HARDCODED NUMBER YOU MUST ALSO CHANGE statisticallyTestBLEUResults() and getLowerBoundPValue() )
#arguments: the t statistic, $t
#return: a lower bound on the probability mass outside (beyond) +/-$t in the t distribution
#
#for a wonderful t-distribution calculator, see <http://math.uc.edu/~brycw/classes/148/tables.htm#t>. UC.edu is Cincinnati.
sub getLowerBoundPValue
{
	my $t = abs(shift);
	#encode various known p-values for ###### DOF = 29 ######
	my %t2p = #since we're comparing (hopefully) very similar values, this chart is weighted toward the low end of the t-stat
	(
		0.0063 => .995,
		0.0126 => .99,
		0.0253 => .98,
		0.0380 => .97,
		0.0506 => .96,
		0.0633 => .95,
		0.0950 => .925,
		0.127  => .9,
		0.191  => .85,
		0.256  => .8,
		0.389  => .7,
		0.530  => .6,
		0.683  => .5,
		0.854  => .4,
		1.055  => .3,
		1.311  => .2, 
		1.699  => .1
	);
	foreach my $tCmp (sort keys %t2p) {return $t2p{$tCmp} if $t <= $tCmp;}
	return 0; #loosest bound ever! groovy, man
}
#arguments: the t statistic, $t
#return: an upper bound on the probability mass outside (beyond) +/-$t in the t distribution
sub getUpperBoundPValue
{
	my $t = abs(shift);
	#encode various known p-values for ###### DOF = 29 ######
	my %t2p = 
	(
		4.506 => .0001,
		4.254 => .0002,
		3.918 => .0005,
		3.659 => .001,
		3.396 => .002,
		3.038 => .005,
		2.756 => .01,
		2.462 => .02,
		2.045 => .05,
		1.699 => .1,
		1.311 => .2,
		0.683 => .5
	);
	foreach my $tCmp (reverse sort keys %t2p) {return $t2p{$tCmp} if $t >= $tCmp;}
	return 1; #loosest bound ever!
}

#arguments: n, r
#return: binomial coefficient for p = .5 (ie nCr * (1/2)^n)
sub binCoeff
{
	my ($n, $r) = @_;
	my $coeff = 1;
	for(my $i = $r + 1; $i <= $n; $i++) {$coeff *= $i; $coeff /= ($i - $r);}
	return $coeff * (.5 ** $n);
}

#throw if the given factor doesn't have an index defined
#arguments: factor name
#return: none
sub ensureFactorPosDefined
{
	my ($self, $factorName) = @_;
	if(!defined($self->{'factorIndices'}->{$factorName}))
	{
		throw Error::Simple(-text => "Corpus: no index known for factor '$factorName'\n");
	}
}

#throw if the filename field corresponding to the argument hasn't been defined
#arguments: 'truth' | 'input' | a system name
#return: none
sub ensureFilenameDefined
{
	my ($self, $sysname) = @_;
	if($sysname eq 'truth' || $sysname eq 'input')
	{
		if(!defined($self->{"${sysname}Filename"}))
		{
			throw Error::Simple(-text => "Corpus: no $sysname corpus defined\n");
		}
	}
	else
	{
		if(!defined($self->{'sysoutFilenames'}->{$sysname}))
		{
			throw Error::Simple(-text => "Corpus: no system $sysname defined\n");
		}
	}
}

#throw if there isn't a defined phrase-table filename for the given factor
#arguments: factor name
#return: none
sub ensurePhraseTableDefined
{
	my ($self, $factorName) = @_;
	if(!defined($self->{'phraseTableFilenames'}->{$factorName}))
	{
		throw Error::Simple(-text => "Corpus: no phrase table defined for factor '$factorName'\n");
	}
}

#search current directory for files with our corpus name as basename and set filename fields of $self
#arguments: hashref of filenames to descriptions
#return: none
sub locateFiles
{
	my ($self, $refDescs) = @_;
	open(DIR, "ls -x1 . |") or die "Corpus::locateFiles(): couldn't list current directory\n";
	my $corpusName = $self->{'corpusName'};
	while(my $filename = <DIR>)
	{
		chop $filename; #remove \n
		if($filename =~ /^$corpusName\.(.*)$/)
		{
			my $ext = $1;
			if($ext eq 'e') {$self->{'truthFilename'} = $filename;}
			elsif($ext eq 'f') {$self->{'inputFilename'} = $filename;}
			elsif($ext =~ /pt_(.*)/) {$self->{'phraseTableFilenames'}->{$1} = $filename;}
			else {$self->{'sysoutFilenames'}->{$ext} = $filename;}
			if(defined($refDescs->{$filename}))
			{
				$self->{'fileDescriptions'}->{$filename} = $refDescs->{$filename};
			}
		}
	}
	close(DIR);
}

#arguments: type ('truth' | 'input' | a string to represent a system output), filename
#pre: filename exists
#return: none
sub loadSentences
{
	my ($self, $sysname, $filename) = @_;
	#if the sentences are already loaded, leave them be
	if(exists $self->{$sysname} && scalar(@{$self->{$sysname}}) > 0) {return;}
	
	$self->{$sysname} = [];
	$self->{'tokenCount'}->{$sysname} = 0;
	open(INFILE, "<$filename") or die "Corpus::load(): couldn't open '$filename' for read\n";
	while(my $line = <INFILE>)
	{
		my @words = split(/\s+/, $line);
		$self->{'tokenCount'}->{$sysname} += scalar(@words);
		my $refFactors = [];
		foreach my $word (@words)
		{
			my @factors = split(/\|/, $word);
			push @$refFactors, \@factors;
		}
		push @{$self->{$sysname}}, $refFactors;
	}
	close(INFILE);	
}

#free the memory used for the given corpus (but NOT any associated calculations, eg WER)
#arguments: type ('truth' | 'input' | a string to represent a system output)
#return: none
sub releaseSentences
{
#	my ($self, $sysname) = @_;
#	$self->{$sysname} = [];
}

#arguments: factor name
#return: none
#throw if we don't have a filename for the given phrase table
sub loadPhraseTable
{
	my ($self, $factorName) = @_;
	$self->ensurePhraseTableDefined($factorName);
	
	my $filename = $self->{'phraseTableFilenames'}->{$factorName};
	open(PTABLE, "<$filename") or die "couldn't open '$filename' for read\n";
	$self->{'phraseTables'}->{$factorName} = {}; #create ref to phrase table (hash of strings, for source phrases, to anything whatsoever)
	#assume the table is sorted so that duplicate source phrases will be consecutive
	while(my $line = <PTABLE>)
	{
		my @phrases = split(/\s*\|\|\|\s*/, $line, 2);
		$self->{'phraseTables'}->{$factorName}->{$phrases[0]} = 0; #just so that it's set to something
	}
	close(PTABLE);
}

#arguments: factor name
#return: none
sub releasePhraseTable
{
	my ($self, $factorName) = @_;
	$self->{'phraseTables'}->{$factorName} = {};
}

#arguments: name of list ('nounAndAdj' | ...)
#return: arrayref of strings (postags)
sub getPOSTagList
{
	my ($self, $listname) = @_;
	##### assume PTB tagset #####
	if($listname eq 'nounAndAdj') {return ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'];}
#	if($listname eq '') {return [];}
}

#arguments: list to be filtered (arrayref of arrayrefs of factor strings), desired factor index, arrayref of allowable values
#return: filtered list as array of arrayrefs of factor strings
sub filterFactors
{
	my ($self, $refFullList, $index, $refFactorValues) = @_;
	my $valuesRegex = join("|", @$refFactorValues);
	my @filteredList = ();
	foreach my $factors (@$refFullList)
	{
		if($factors->[$index] =~ m/$valuesRegex/)
		{
			push @filteredList, $factors;
		}
	}
	return @filteredList;
}

#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use
#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words
sub corpusWER
{
	my ($self, $refSysOutput, $refTruth, $index) = @_;
	my ($totWER, $sentenceWER, $errIndices) = (0, [], []);
	for(my $i = 0; $i < scalar(@$refSysOutput); $i++)
	{
		my ($sentWER, $indices) = $self->sentenceWER($refSysOutput->[$i], $refTruth->[$i], $index);
		$totWER += $sentWER;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -