📄 corpus.pm

📁 moses开源的机器翻译系统
💻 PM
📖 第 1 页 / 共 4 页
字号:
12 3 4 下一页
#package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole
#Evan Herbst, 7 / 25 / 06

package Corpus;
BEGIN
{
	push @INC, "../perllib"; #for Error.pm
}
use Error;

return 1;

###########################################################################################################################

##### 'our' variables are available outside the package #####
#all factor names used should be in this list, just in case
our @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph');

#constructor
#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), formatted string with various config info (-info_line)
sub new
{
	my $class = shift;
	my %args = @_; #turn the remainder of @_ into a hash
	my ($corpusName, $refFileDescs, $infoLine) = ($args{'-name'}, $args{'-descriptions'}, $args{'-info_line'});
	my ($factorList, $inputLingmodels, $outputLingmodels) = split(/\s*:\s*/, $infoLine);
	my $self = {};
	$self->{'corpusName'} = $corpusName;
	$self->{'truth'} = []; #arrayref of arrayrefs of factors
	$self->{'input'} = []; #same; also same for any system outputs that get loaded
	$self->{'tokenCount'} = {}; #sysname => number of tokens in file
	$self->{'truthFilename'} = "";
	$self->{'inputFilename'} = "";
	$self->{'sysoutFilenames'} = {}; #hashref of (string => string) for (system name, filename)
	$self->{'phraseTableFilenames'} = {}; #factor name => filename
	$self->{'fileCtimes'} = {}; #file ID of some kind => changetime in seconds
	$self->{'factorIndices'} = {}; #factor name => index
	my @factors = split(/\s+/, $factorList);
	for(my $i = 0; $i < scalar(@factors); $i++)
	{
		$self->{'factorIndices'}->{$factors[$i]} = $i;
	}
	$self->{'inputLMs'} = {}; #factor name => lingmodel filename
	$self->{'outputLMs'} = {};
	foreach my $lmInfo (split(/\s*,\s*/, $inputLingmodels))
	{
		my @tokens = split(/\s+/, $lmInfo);
		$self->{'inputLMs'}->{$tokens[0]} = $tokens[1];
	}
	foreach my $lmInfo (split(/\s*,\s*/, $outputLingmodels))
	{
		my @tokens = split(/\s+/, $lmInfo);
		$self->{'outputLMs'}->{$tokens[0]} = $tokens[1];
	}
	$self->{'phraseTables'} = {}; #factor name (from @FACTORNAMES) => hashref of source phrases to anything; used for unknown-word counting
	$self->{'unknownCount'} = {}; #factor name => count of unknown tokens in input
	$self->{'sysoutWER'} = {}; #system name => (factor name => arrayref with system output total WER and arrayref of WER scores for individual sysout sentences wrt truth)
	$self->{'sysoutPWER'} = {}; #similarly
	$self->{'nnAdjWERPWER'} = {}; #system name => arrayref of [normalized WER, normalized PWER]
	$self->{'perplexity'} = {}; #system name => (factor name => perplexity raw score)
	$self->{'fileDescriptions'} = {}; #filename associated with us => string description of file
	$self->{'bleuScores'} = {}; #system name => (factor name => arrayref of (overall score, arrayref of per-sentence scores) )
	$self->{'bleuConfidence'} = {}; #system name => (factor name => arrayrefs holding statistical test data on BLEU scores)
	$self->{'subsetBLEUstats'} = {}; #system name => (factor name => n-gram precisions and lengths for independent corpus subsets)
	$self->{'comparisonStats'} = {}; #system name 1 => (system name 2 => (factor name => p-values, and indices of better system, for all tests used))
	$self->{'cacheFilename'} = "cache/$corpusName.cache"; #all memory of various scores is stored here
	bless $self, $class;
	$self->locateFiles($refFileDescs); #find all relevant files in the current directory; set filenames and descriptions
	$self->loadCacheFile();
	print STDERR "on load:\n";
	$self->printDetails();
	return $self;
}

#arguments: filename
#return: description string
#throw if filename doesn't belong to this corpus
sub getFileDescription
{
	my ($self, $filename) = @_;
	if(!defined($self->{'fileDescriptions'}->{$filename}))
	{
		throw Error::Simple(-text => "Corpus::getFileDescription(): invalid filename '$filename'\n");
	}
	return $self->{'fileDescriptions'}->{$filename};
}

#arguments: none
#return: list of system names (NOT including 'input', 'truth' and other special cases)
sub getSystemNames
{
	my $self = shift;
	return keys %{$self->{'sysoutFilenames'}};
}

#calculate the number of unknown factor values for the given factor in the input file
#arguments: factor name
#return: unknown factor count, total factor count (note the total doesn't depend on the factor)
#throw if we don't have an input file or a phrase table for the given factor defined or if there's no index known for the given factor
sub calcUnknownTokens
{
	my ($self, $factorName) = @_;
	#check in-memory cache first
	if(exists $self->{'unknownCount'}->{$factorName} && exists $self->{'tokenCount'}->{'input'})
	{
		return ($self->{'unknownCount'}->{$factorName}, $self->{'tokenCount'}->{'input'});
	}
	warn "calcing unknown tokens\n";
	
	$self->ensureFilenameDefined('input');
	$self->ensurePhraseTableDefined($factorName);
	$self->ensureFactorPosDefined($factorName);
	$self->loadSentences('input', $self->{'inputFilename'});
	$self->loadPhraseTable($factorName);

	#count unknown and total words
	my ($unknownTokens, $totalTokens) = (0, 0);
	my $factorIndex = $self->{'factorIndices'}->{$factorName};
	foreach my $sentence (@{$self->{'input'}})
	{
		$totalTokens += scalar(@$sentence);
		foreach my $word (@$sentence)
		{
			if(!defined($self->{'phraseTables'}->{$factorName}->{$word->[$factorIndex]}))
			{
				$unknownTokens++;
			}
		}
	}
	$self->{'unknownCount'}->{$factorName} = $unknownTokens;
	$self->{'tokenCount'}->{'input'} = $totalTokens;
	
	return ($unknownTokens, $totalTokens);
}

#arguments: system name
#return: (WER, PWER) for nouns and adjectives in given system wrt truth
#throw if given system or truth is not set or if index of 'surf' or 'pos' hasn't been specified
sub calcNounAdjWER_PWERDiff
{
	my ($self, $sysname) = @_;
	#check in-memory cache first
	if(exists $self->{'nnAdjWERPWER'}->{$sysname})
	{
		return @{$self->{'nnAdjWERPWER'}->{$sysname}};
	}
	warn "calcing NN/JJ PWER/WER\n";
	
	$self->ensureFilenameDefined('truth');
	$self->ensureFilenameDefined($sysname);
	$self->ensureFactorPosDefined('surf');
	$self->ensureFactorPosDefined('pos');
	$self->loadSentences('truth', $self->{'truthFilename'});
	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
	#find nouns and adjectives and score them
	my ($werScore, $pwerScore) = (0, 0);
	my $nnNadjTags = $self->getPOSTagList('nounAndAdj');
	for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)
	{
		my @nnAdjEWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
		my @nnAdjSWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);
		my ($sentWer, $tmp) = $self->sentenceWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});
		$werScore += $sentWer;
		($sentWer, $tmp) = $self->sentencePWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});
		$pwerScore += $sentWer;
	}
	
	#unhog memory
	$self->releaseSentences('truth');
	$self->releaseSentences($sysname);
	$self->{'nnAdjWERPWER'}->{$sysname} = [$werScore / $self->{'tokenCount'}->{'truth'}, $pwerScore / $self->{'tokenCount'}->{'truth'}];
	return @{$self->{'nnAdjWERPWER'}->{$sysname}};
}

#calculate detailed WER statistics and put them into $self
#arguments: system name, factor name to consider (default 'surf', surface form)
#return: overall surface WER for given system (w/o filtering)
#throw if given system or truth is not set or if index of factor name hasn't been specified
sub calcOverallWER
{
	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
	if(scalar(@_) > 0) {$factorName = shift;}
	#check in-memory cache first
	if(exists $self->{'sysoutWER'}->{$sysname}->{$factorName})
	{
		return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0];
	}
	warn "calcing WER\n";
	
	$self->ensureFilenameDefined('truth');
	$self->ensureFilenameDefined($sysname);
	$self->ensureFactorPosDefined($factorName);
	$self->loadSentences('truth', $self->{'truthFilename'});
	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
	
	my ($wer, $swers, $indices) = $self->corpusWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});
	$self->{'sysoutWER'}->{$sysname}->{$factorName} = [$wer, $swers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence
	
	#unhog memory
	$self->releaseSentences('truth');
	$self->releaseSentences($sysname);
	return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};
}

#calculate detailed PWER statistics and put them into $self
#arguments: system name, factor name to consider (default 'surf')
#return: overall surface PWER for given system (w/o filtering)
#throw if given system or truth is not set or if index of factor name hasn't been specified
sub calcOverallPWER
{
	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
	if(scalar(@_) > 0) {$factorName = shift;}
	#check in-memory cache first
	if(exists $self->{'sysoutPWER'}->{$sysname}->{$factorName})
	{
		return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0];
	}
	warn "calcing PWER\n";
	
	$self->ensureFilenameDefined('truth');
	$self->ensureFilenameDefined($sysname);
	$self->ensureFactorPosDefined($factorName);
	$self->loadSentences('truth', $self->{'truthFilename'});
	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
	
	my ($pwer, $spwers, $indices) = $self->corpusPWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});
	$self->{'sysoutPWER'}->{$sysname}->{$factorName} = [$pwer, $spwers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence
	
	#unhog memory
	$self->releaseSentences('truth');
	$self->releaseSentences($sysname);
	return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};
}

#arguments: system name, factor name to consider (default 'surf')
#return: array of (BLEU score, n-gram precisions, brevity penalty)
sub calcBLEU
{
	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
	if(scalar(@_) > 0) {$factorName = shift;}
	#check in-memory cache first
	if(exists $self->{'bleuScores'}->{$sysname} && exists $self->{'bleuScores'}->{$sysname}->{$factorName})
	{
		return $self->{'bleuScores'}->{$sysname}->{$factorName};
	}
	warn "calcing BLEU\n";
	
	$self->ensureFilenameDefined('truth');
	$self->ensureFilenameDefined($sysname);
	$self->ensureFactorPosDefined($factorName);
	$self->loadSentences('truth', $self->{'truthFilename'});
	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});
	
	#score structure: various total scores, arrayref of by-sentence score arrays
	if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};}
	if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}
	
	my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $totCLength, $totRLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
	my $factorIndex = $self->{'factorIndices'}->{$factorName};
	for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)
	{
		my ($truthSentence, $sysoutSentence) = ($self->{'truth'}->[$i], $self->{$sysname}->[$i]);
		my ($unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength) = 
				$self->sentenceBLEU($truthSentence, $sysoutSentence, $factorIndex, 0); #last argument is whether to debug-print
		push @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}, [$unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength];
		$good1 += $unigood; $tot1 += $unicount;
		$good2 += $bigood; $tot2 += $bicount;
		$good3 += $trigood; $tot3 += $tricount;
		$good4 += $quadrugood; $tot4 += $quadrucount;
		$totCLength += $cLength;
		$totRLength += $rLength;
	}
	my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength);
	my ($pct1, $pct2, $pct3, $pct4) = ($tot1 == 0 ? -1 : $good1 / $tot1, $tot2 == 0 ? -1 : $good2 / $tot2, 
													$tot3 == 0 ? -1 : $good3 / $tot3, $tot4 == 0 ? -1 : $good4 / $tot4);
	my ($logsum, $logcount) = (0, 0);
	if($tot1 > 0) {$logsum += my_log($pct1); $logcount++;}
	if($tot2 > 0) {$logsum += my_log($pct2); $logcount++;}
	if($tot3 > 0) {$logsum += my_log($pct3); $logcount++;}
	if($tot4 > 0) {$logsum += my_log($pct4); $logcount++;}
	my $bleu = $brevity * exp($logsum / $logcount);
	$self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, 100 * $pct1, 100 * $pct2, 100 * $pct3, 100 * $pct4, $brevity];
  
	#unhog memory
	$self->releaseSentences('truth');
	$self->releaseSentences($sysname);
	return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]};
}

#do t-tests on the whole-corpus n-gram precisions vs. the average precisions over a set number of disjoint subsets
#arguments: system name, factor name BLEU was run on (default 'surf')
#return: arrayref of [arrayref of p-values for overall precision vs. subset average, arrayrefs of [(lower, upper) 95% credible intervals for true overall n-gram precisions]]
#
#written to try to save memory
sub statisticallyTestBLEUResults
{
	my ($self, $sysname, $factorName) = (shift, shift, 'surf');
	if(scalar(@_) > 0) {$factorName = shift;}
	#check in-memory cache first
	if(exists $self->{'bleuConfidence'}->{$sysname} && exists $self->{'bleuConfidence'}->{$sysname}->{$factorName})
	{
		return $self->{'bleuConfidence'}->{$sysname}->{$factorName};
	}
	warn "performing consistency tests\n";
	
	my $k = 30; #HARDCODED NUMBER OF SUBSETS (WE DO k-FOLD CROSS-VALIDATION); IF YOU CHANGE THIS YOU MUST ALSO CHANGE getApproxPValue() and $criticalTStat
	my $criticalTStat = 2.045; #hardcoded value given alpha (.025 here) and degrees of freedom (= $k - 1) ########################################
	$self->ensureFilenameDefined('truth');
	$self->ensureFilenameDefined($sysname);
	$self->ensureFactorPosDefined($factorName);
	
	#ensure we have full-corpus BLEU results
	if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName})
	{
		$self->calcBLEU($sysname, $factorName);
	}
	if(!exists $self->{'subsetBLEUstats'}->{$sysname}) {$self->{'subsetBLEUstats'}->{$sysname} = {};}
	if(!exists $self->{'subsetBLEUstats'}->{$sysname}->{$factorName}) {$self->{'subsetBLEUstats'}->{$sysname}->{$factorName} = [];}
	
	#calculate n-gram precisions for each small subset
	my @sentenceStats = @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]};
	for(my $i = 0; $i < $k; $i++)
	{
		my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
		for(my $j = $i; $j < scalar(@sentenceStats); $j += $k) #subset #K consists of every Kth sentence
		{
			$good1 += $sentenceStats[$j]->[0]; $tot1 += $sentenceStats[$j]->[1];
			$good2 += $sentenceStats[$j]->[2]; $tot2 += $sentenceStats[$j]->[3];
			$good3 += $sentenceStats[$j]->[4]; $tot3 += $sentenceStats[$j]->[5];
			$good4 += $sentenceStats[$j]->[6]; $tot4 += $sentenceStats[$j]->[7];
			$sysoutLength += $sentenceStats[$j]->[8];
			$truthLength += $sentenceStats[$j]->[9];
		}
		push @{$self->{'subsetBLEUstats'}->{$sysname}->{$factorName}}, [$good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength];
	}
	my $subsetStats = $self->{'subsetBLEUstats'}->{$sysname}->{$factorName};
12 3 4 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -