⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 corpus.pm.svn-base

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 SVN-BASE
📖 第 1 页 / 共 4 页
字号:
#package Corpus: hold a bunch of sentences in any language, with translation factors and stats about individual sentences and the corpus as a whole#Evan Herbst, 7 / 25 / 06package Corpus;BEGIN{	push @INC, "../perllib"; #for Error.pm}use Error;return 1;################################################################################################################################ 'our' variables are available outside the package ######all factor names used should be in this list, just in caseour @FACTORNAMES = ('surf', 'pos', 'lemma', 'stem', 'morph');#constructor#arguments: short corpus name (-name), hashref of filenames to descriptions (-descriptions), formatted string with various config info (-info_line)sub new{	my $class = shift;	my %args = @_; #turn the remainder of @_ into a hash	my ($corpusName, $refFileDescs, $infoLine) = ($args{'-name'}, $args{'-descriptions'}, $args{'-info_line'});	my ($factorList, $inputLingmodels, $outputLingmodels) = split(/\s*:\s*/, $infoLine);	my $self = {};	$self->{'corpusName'} = $corpusName;	$self->{'truth'} = []; #arrayref of arrayrefs of factors	$self->{'input'} = []; #same; also same for any system outputs that get loaded	$self->{'tokenCount'} = {}; #sysname => number of tokens in file	$self->{'truthFilename'} = "";	$self->{'inputFilename'} = "";	$self->{'sysoutFilenames'} = {}; #hashref of (string => string) for (system name, filename)	$self->{'phraseTableFilenames'} = {}; #factor name => filename	$self->{'fileCtimes'} = {}; #file ID of some kind => changetime in seconds	$self->{'factorIndices'} = {}; #factor name => index	my @factors = split(/\s+/, $factorList);	for(my $i = 0; $i < scalar(@factors); $i++)	{		$self->{'factorIndices'}->{$factors[$i]} = $i;	}	$self->{'inputLMs'} = {}; #factor name => lingmodel filename	$self->{'outputLMs'} = {};	foreach my $lmInfo (split(/\s*,\s*/, $inputLingmodels))	{		my @tokens = split(/\s+/, $lmInfo);		$self->{'inputLMs'}->{$tokens[0]} = $tokens[1];	}	foreach my $lmInfo (split(/\s*,\s*/, $outputLingmodels))	{		my @tokens = split(/\s+/, $lmInfo);		$self->{'outputLMs'}->{$tokens[0]} = $tokens[1];	}	$self->{'phraseTables'} = {}; #factor name (from @FACTORNAMES) => hashref of source phrases to anything; used for unknown-word counting	$self->{'unknownCount'} = {}; #factor name => count of unknown tokens in input	$self->{'sysoutWER'} = {}; #system name => (factor name => arrayref with system output total WER and arrayref of WER scores for individual sysout sentences wrt truth)	$self->{'sysoutPWER'} = {}; #similarly	$self->{'nnAdjWERPWER'} = {}; #system name => arrayref of [normalized WER, normalized PWER]	$self->{'perplexity'} = {}; #system name => (factor name => perplexity raw score)	$self->{'fileDescriptions'} = {}; #filename associated with us => string description of file	$self->{'bleuScores'} = {}; #system name => (factor name => arrayref of (overall score, arrayref of per-sentence scores) )	$self->{'bleuConfidence'} = {}; #system name => (factor name => arrayrefs holding statistical test data on BLEU scores)	$self->{'subsetBLEUstats'} = {}; #system name => (factor name => n-gram precisions and lengths for independent corpus subsets)	$self->{'comparisonStats'} = {}; #system name 1 => (system name 2 => (factor name => p-values, and indices of better system, for all tests used))	$self->{'cacheFilename'} = "cache/$corpusName.cache"; #all memory of various scores is stored here	bless $self, $class;	$self->locateFiles($refFileDescs); #find all relevant files in the current directory; set filenames and descriptions	$self->loadCacheFile();	print STDERR "on load:\n";	$self->printDetails();	return $self;}#arguments: filename#return: description string#throw if filename doesn't belong to this corpussub getFileDescription{	my ($self, $filename) = @_;	if(!defined($self->{'fileDescriptions'}->{$filename}))	{		throw Error::Simple(-text => "Corpus::getFileDescription(): invalid filename '$filename'\n");	}	return $self->{'fileDescriptions'}->{$filename};}#arguments: none#return: list of system names (NOT including 'input', 'truth' and other special cases)sub getSystemNames{	my $self = shift;	return keys %{$self->{'sysoutFilenames'}};}#calculate the number of unknown factor values for the given factor in the input file#arguments: factor name#return: unknown factor count, total factor count (note the total doesn't depend on the factor)#throw if we don't have an input file or a phrase table for the given factor defined or if there's no index known for the given factorsub calcUnknownTokens{	my ($self, $factorName) = @_;	#check in-memory cache first	if(exists $self->{'unknownCount'}->{$factorName} && exists $self->{'tokenCount'}->{'input'})	{		return ($self->{'unknownCount'}->{$factorName}, $self->{'tokenCount'}->{'input'});	}	warn "calcing unknown tokens\n";		$self->ensureFilenameDefined('input');	$self->ensurePhraseTableDefined($factorName);	$self->ensureFactorPosDefined($factorName);	$self->loadSentences('input', $self->{'inputFilename'});	$self->loadPhraseTable($factorName);	#count unknown and total words	my ($unknownTokens, $totalTokens) = (0, 0);	my $factorIndex = $self->{'factorIndices'}->{$factorName};	foreach my $sentence (@{$self->{'input'}})	{		$totalTokens += scalar(@$sentence);		foreach my $word (@$sentence)		{			if(!defined($self->{'phraseTables'}->{$factorName}->{$word->[$factorIndex]}))			{				$unknownTokens++;			}		}	}	$self->{'unknownCount'}->{$factorName} = $unknownTokens;	$self->{'tokenCount'}->{'input'} = $totalTokens;		return ($unknownTokens, $totalTokens);}#arguments: system name#return: (WER, PWER) for nouns and adjectives in given system wrt truth#throw if given system or truth is not set or if index of 'surf' or 'pos' hasn't been specifiedsub calcNounAdjWER_PWERDiff{	my ($self, $sysname) = @_;	#check in-memory cache first	if(exists $self->{'nnAdjWERPWER'}->{$sysname})	{		return @{$self->{'nnAdjWERPWER'}->{$sysname}};	}	warn "calcing NN/JJ PWER/WER\n";		$self->ensureFilenameDefined('truth');	$self->ensureFilenameDefined($sysname);	$self->ensureFactorPosDefined('surf');	$self->ensureFactorPosDefined('pos');	$self->loadSentences('truth', $self->{'truthFilename'});	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});	#find nouns and adjectives and score them	my ($werScore, $pwerScore) = (0, 0);	my $nnNadjTags = $self->getPOSTagList('nounAndAdj');	for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)	{		my @nnAdjEWords = $self->filterFactors($self->{'truth'}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);		my @nnAdjSWords = $self->filterFactors($self->{$sysname}->[$i], $self->{'factorIndices'}->{'pos'}, $nnNadjTags);		my ($sentWer, $tmp) = $self->sentenceWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});		$werScore += $sentWer;		($sentWer, $tmp) = $self->sentencePWER(\@nnAdjSWords, \@nnAdjEWords, $self->{'factorIndices'}->{'surf'});		$pwerScore += $sentWer;	}		#unhog memory	$self->releaseSentences('truth');	$self->releaseSentences($sysname);	$self->{'nnAdjWERPWER'}->{$sysname} = [$werScore / $self->{'tokenCount'}->{'truth'}, $pwerScore / $self->{'tokenCount'}->{'truth'}];	return @{$self->{'nnAdjWERPWER'}->{$sysname}};}#calculate detailed WER statistics and put them into $self#arguments: system name, factor name to consider (default 'surf', surface form)#return: overall surface WER for given system (w/o filtering)#throw if given system or truth is not set or if index of factor name hasn't been specifiedsub calcOverallWER{	my ($self, $sysname, $factorName) = (shift, shift, 'surf');	if(scalar(@_) > 0) {$factorName = shift;}	#check in-memory cache first	if(exists $self->{'sysoutWER'}->{$sysname}->{$factorName})	{		return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0];	}	warn "calcing WER\n";		$self->ensureFilenameDefined('truth');	$self->ensureFilenameDefined($sysname);	$self->ensureFactorPosDefined($factorName);	$self->loadSentences('truth', $self->{'truthFilename'});	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});		my ($wer, $swers, $indices) = $self->corpusWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});	$self->{'sysoutWER'}->{$sysname}->{$factorName} = [$wer, $swers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence		#unhog memory	$self->releaseSentences('truth');	$self->releaseSentences($sysname);	return $self->{'sysoutWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};}#calculate detailed PWER statistics and put them into $self#arguments: system name, factor name to consider (default 'surf')#return: overall surface PWER for given system (w/o filtering)#throw if given system or truth is not set or if index of factor name hasn't been specifiedsub calcOverallPWER{	my ($self, $sysname, $factorName) = (shift, shift, 'surf');	if(scalar(@_) > 0) {$factorName = shift;}	#check in-memory cache first	if(exists $self->{'sysoutPWER'}->{$sysname}->{$factorName})	{		return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0];	}	warn "calcing PWER\n";		$self->ensureFilenameDefined('truth');	$self->ensureFilenameDefined($sysname);	$self->ensureFactorPosDefined($factorName);	$self->loadSentences('truth', $self->{'truthFilename'});	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});		my ($pwer, $spwers, $indices) = $self->corpusPWER($self->{$sysname}, $self->{'truth'}, $self->{'factorIndices'}->{$factorName});	$self->{'sysoutPWER'}->{$sysname}->{$factorName} = [$pwer, $spwers, $indices]; #total; arrayref of scores for individual sentences; arrayref of arrayrefs of offending words in each sentence		#unhog memory	$self->releaseSentences('truth');	$self->releaseSentences($sysname);	return $self->{'sysoutPWER'}->{$sysname}->{$factorName}->[0] / $self->{'tokenCount'}->{'truth'};}#arguments: system name, factor name to consider (default 'surf')#return: array of (BLEU score, n-gram precisions, brevity penalty)sub calcBLEU{	my ($self, $sysname, $factorName) = (shift, shift, 'surf');	if(scalar(@_) > 0) {$factorName = shift;}	#check in-memory cache first	if(exists $self->{'bleuScores'}->{$sysname} && exists $self->{'bleuScores'}->{$sysname}->{$factorName})	{		return $self->{'bleuScores'}->{$sysname}->{$factorName};	}	warn "calcing BLEU\n";		$self->ensureFilenameDefined('truth');	$self->ensureFilenameDefined($sysname);	$self->ensureFactorPosDefined($factorName);	$self->loadSentences('truth', $self->{'truthFilename'});	$self->loadSentences($sysname, $self->{'sysoutFilenames'}->{$sysname});		#score structure: various total scores, arrayref of by-sentence score arrays	if(!exists $self->{'bleuScores'}->{$sysname}) {$self->{'bleuScores'}->{$sysname} = {};}	if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName}) {$self->{'bleuScores'}->{$sysname}->{$factorName} = [[], []];}		my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $totCLength, $totRLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);	my $factorIndex = $self->{'factorIndices'}->{$factorName};	for(my $i = 0; $i < scalar(@{$self->{'truth'}}); $i++)	{		my ($truthSentence, $sysoutSentence) = ($self->{'truth'}->[$i], $self->{$sysname}->[$i]);		my ($unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength) = 				$self->sentenceBLEU($truthSentence, $sysoutSentence, $factorIndex, 0); #last argument is whether to debug-print		push @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]}, [$unigood, $unicount, $bigood, $bicount, $trigood, $tricount, $quadrugood, $quadrucount, $cLength, $rLength];		$good1 += $unigood; $tot1 += $unicount;		$good2 += $bigood; $tot2 += $bicount;		$good3 += $trigood; $tot3 += $tricount;		$good4 += $quadrugood; $tot4 += $quadrucount;		$totCLength += $cLength;		$totRLength += $rLength;	}	my $brevity = ($totCLength > $totRLength || $totCLength == 0) ? 1 : exp(1 - $totRLength / $totCLength);	my ($pct1, $pct2, $pct3, $pct4) = ($tot1 == 0 ? -1 : $good1 / $tot1, $tot2 == 0 ? -1 : $good2 / $tot2, 													$tot3 == 0 ? -1 : $good3 / $tot3, $tot4 == 0 ? -1 : $good4 / $tot4);	my ($logsum, $logcount) = (0, 0);	if($tot1 > 0) {$logsum += my_log($pct1); $logcount++;}	if($tot2 > 0) {$logsum += my_log($pct2); $logcount++;}	if($tot3 > 0) {$logsum += my_log($pct3); $logcount++;}	if($tot4 > 0) {$logsum += my_log($pct4); $logcount++;}	my $bleu = $brevity * exp($logsum / $logcount);	$self->{'bleuScores'}->{$sysname}->{$factorName}->[0] = [$bleu, 100 * $pct1, 100 * $pct2, 100 * $pct3, 100 * $pct4, $brevity];  	#unhog memory	$self->releaseSentences('truth');	$self->releaseSentences($sysname);	return @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[0]};}#do t-tests on the whole-corpus n-gram precisions vs. the average precisions over a set number of disjoint subsets#arguments: system name, factor name BLEU was run on (default 'surf')#return: arrayref of [arrayref of p-values for overall precision vs. subset average, arrayrefs of [(lower, upper) 95% credible intervals for true overall n-gram precisions]]##written to try to save memorysub statisticallyTestBLEUResults{	my ($self, $sysname, $factorName) = (shift, shift, 'surf');	if(scalar(@_) > 0) {$factorName = shift;}	#check in-memory cache first	if(exists $self->{'bleuConfidence'}->{$sysname} && exists $self->{'bleuConfidence'}->{$sysname}->{$factorName})	{		return $self->{'bleuConfidence'}->{$sysname}->{$factorName};	}	warn "performing consistency tests\n";		my $k = 30; #HARDCODED NUMBER OF SUBSETS (WE DO k-FOLD CROSS-VALIDATION); IF YOU CHANGE THIS YOU MUST ALSO CHANGE getApproxPValue() and $criticalTStat	my $criticalTStat = 2.045; #hardcoded value given alpha (.025 here) and degrees of freedom (= $k - 1) ########################################	$self->ensureFilenameDefined('truth');	$self->ensureFilenameDefined($sysname);	$self->ensureFactorPosDefined($factorName);		#ensure we have full-corpus BLEU results	if(!exists $self->{'bleuScores'}->{$sysname}->{$factorName})	{		$self->calcBLEU($sysname, $factorName);	}	if(!exists $self->{'subsetBLEUstats'}->{$sysname}) {$self->{'subsetBLEUstats'}->{$sysname} = {};}	if(!exists $self->{'subsetBLEUstats'}->{$sysname}->{$factorName}) {$self->{'subsetBLEUstats'}->{$sysname}->{$factorName} = [];}		#calculate n-gram precisions for each small subset	my @sentenceStats = @{$self->{'bleuScores'}->{$sysname}->{$factorName}->[1]};	for(my $i = 0; $i < $k; $i++)	{		my ($good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength) = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0);		for(my $j = $i; $j < scalar(@sentenceStats); $j += $k) #subset #K consists of every Kth sentence		{			$good1 += $sentenceStats[$j]->[0]; $tot1 += $sentenceStats[$j]->[1];			$good2 += $sentenceStats[$j]->[2]; $tot2 += $sentenceStats[$j]->[3];			$good3 += $sentenceStats[$j]->[4]; $tot3 += $sentenceStats[$j]->[5];			$good4 += $sentenceStats[$j]->[6]; $tot4 += $sentenceStats[$j]->[7];			$sysoutLength += $sentenceStats[$j]->[8];			$truthLength += $sentenceStats[$j]->[9];		}		push @{$self->{'subsetBLEUstats'}->{$sysname}->{$factorName}}, [$good1, $tot1, $good2, $tot2, $good3, $tot3, $good4, $tot4, $sysoutLength, $truthLength];	}	my $subsetStats = $self->{'subsetBLEUstats'}->{$sysname}->{$factorName};

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -