corpus.pm.svn-base

来自「解码器是基于短语的统计机器翻译系统的核心模块」· SVN-BASE 代码 · 共 1,343 行 · 第 1/4 页
SVN-BASE
1,343 行
		push @$sentenceWER, $sentWER;		push @$errIndices, $indices;	}	return ($totWER, $sentenceWER, $errIndices);}#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use#return: wer score, arrayref of arrayrefs of indices of errorful wordssub sentenceWER{	#constants: direction we came through the table	my ($DIR_NONE, $DIR_SKIPTRUTH, $DIR_SKIPOUT, $DIR_SKIPBOTH) = (-1, 0, 1, 2); #values don't matter but must be unique	my ($self, $refSysOutput, $refTruth, $index) = @_;	my ($totWER, $indices) = (0, []);	my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth));	if($sLength == 0 || $eLength == 0) {return ($totWER, $indices);} #special case		my @refWordsMatchIndices = (-1) x $eLength; #at what sysout-word index this truth word is first matched	my @sysoutWordsMatchIndices = (-1) x $sLength; #at what truth-word index this sysout word is first matched	my $table = []; #index by sysout word index, then truth word index; a cell holds max count of matching words and direction we came to get it	#dynamic-programming time: find the path through the table with the maximum number of matching words	for(my $i = 0; $i < $sLength; $i++)	{		push @$table, [];		for(my $j = 0; $j < $eLength; $j++)		{			my ($maxPrev, $prevDir) = (0, $DIR_NONE);			if($i > 0 && $table->[$i - 1]->[$j]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j]->[0]; $prevDir = $DIR_SKIPOUT;}			if($j > 0 && $table->[$i]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i]->[$j - 1]->[0]; $prevDir = $DIR_SKIPTRUTH;}			if($i > 0 && $j > 0 && $table->[$i - 1]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j - 1]->[0]; $prevDir = $DIR_SKIPBOTH;}			my $match = ($refSysOutput->[$i]->[$index] eq $refTruth->[$j]->[$index] && $refWordsMatchIndices[$j] == -1 && $sysoutWordsMatchIndices[$i] == -1) ? 1 : 0;			if($match == 1) {$refWordsMatchIndices[$j] = $i; $sysoutWordsMatchIndices[$i] = $j;}			push @{$table->[$i]}, [($match ? $maxPrev + 1 : $maxPrev), $prevDir];		}	}		#look back along the path and get indices of non-matching words	my @unusedSysout = (0) x $sLength; #whether each sysout word was matched--used for outputting html table	my ($i, $j) = ($sLength - 1, $eLength - 1);	while($i > 0) #work our way back to the first sysout word	{		push @{$table->[$i]->[$j]}, 0; #length is flag to highlight cell		if($table->[$i]->[$j]->[1] == $DIR_SKIPTRUTH)		{			$j--;		}		elsif($table->[$i]->[$j]->[1] == $DIR_SKIPOUT)		{			if($table->[$i - 1]->[$j]->[0] == $table->[$i]->[$j]->[0]) {unshift @$indices, $i; $unusedSysout[$i] = 1;}			$i--;		}		elsif($table->[$i]->[$j]->[1] == $DIR_SKIPBOTH)		{			if($table->[$i - 1]->[$j - 1]->[0] == $table->[$i]->[$j]->[0]) {unshift @$indices, $i; $unusedSysout[$i] = 1;}			$i--; $j--;		}	}	#we're at the first sysout word; finish up checking for matches	while($j > 0 && $refWordsMatchIndices[$j] != 0) {push @{$table->[0]->[$j]}, 0; $j--;}	if($j == 0 && $refWordsMatchIndices[0] != 0) {unshift @$indices, 0; $unusedSysout[0] = 1;} #no truth word was matched to the first sysout word		#print some HTML to debug the WER algorithm#	print "<table border=1><tr><td></td><td>" . join("</td><td>", map {() . $_->[$index]} @$refTruth) . "</td></tr>";#	for(my $i = 0; $i < $sLength; $i++)#	{#		print "<tr><td" . (($unusedSysout[$i] == 1) ? " style=\"background-color: #ffdd88\">" : ">") . $refSysOutput->[$i]->[$index] . "</td>";#		for(my $j = 0; $j < $eLength; $j++)#		{#			print "<td";#			if(scalar(@{$table->[$i]->[$j]}) > 2) {print " style=\"color: yellow; background-color: #000080\"";}#			my $arrow;#			if($table->[$i]->[$j]->[1] == $DIR_NONE) {$arrow = "&times;";}#			elsif($table->[$i]->[$j]->[1] == $DIR_SKIPTRUTH) {$arrow = "&larr;";}#			elsif($table->[$i]->[$j]->[1] == $DIR_SKIPOUT) {$arrow = "&uarr;";}#			elsif($table->[$i]->[$j]->[1] == $DIR_SKIPBOTH) {$arrow = "&loz;";}#			print ">" . $table->[$i]->[$j]->[0] . "  " . $arrow . "</td>";#		}#		print "</tr>";#	}#	print "</table>";		my $matchCount = 0;	if($sLength > 0) {$matchCount = $table->[$sLength - 1]->[$eLength - 1]->[0];}	return ($sLength - $matchCount, $indices);}#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful wordssub corpusPWER{	my ($self, $refSysOutput, $refTruth, $index) = @_;	my ($totWER, $sentenceWER, $errIndices) = (0, [], []);	for(my $i = 0; $i < scalar(@$refSysOutput); $i++)	{		my ($sentWER, $indices) = $self->sentencePWER($refSysOutput->[$i], $refTruth->[$i], $index);		$totWER += $sentWER;		push @$sentenceWER, $sentWER;		push @$errIndices, $indices;	}	return ($totWER, $sentenceWER, $errIndices);}#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use#return: wer score, arrayref of arrayrefs of indices of errorful wordssub sentencePWER{	my ($self, $refSysOutput, $refTruth, $index) = @_;	my ($totWER, $indices) = (0, []);	my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth));	my @truthWordUsed = (0) x $eLength; #array of 0/1; can only match a given truth word once	for(my $j = 0; $j < $sLength; $j++)	{		my $found = 0;		for(my $k = 0; $k < $eLength; $k++) #check output word against entire truth sentence		{			if(lc $refSysOutput->[$j]->[$index] eq lc $refTruth->[$k]->[$index] && $truthWordUsed[$k] == 0)			{				$truthWordUsed[$k] = 1;				$found = 1;				last;			}		}		if($found == 0)		{			$totWER++;			push @$indices, $j;		}	}	return ($totWER, $indices);}#BLEU calculation for a single sentence#arguments: truth sentence (arrayref of arrayrefs of factor strings), sysout sentence (same), factor index to use#return: 1- through 4-gram matching and total counts (1-g match, 1-g tot, 2-g match...), candidate length, reference lengthsub sentenceBLEU{	my ($self, $refTruth, $refSysOutput, $factorIndex, $debug) = @_;	my ($length_reference, $length_translation) = (scalar(@$refTruth), scalar(@$refSysOutput));	my ($correct1, $correct2, $correct3, $correct4, $total1, $total2, $total3, $total4) = (0, 0, 0, 0, 0, 0, 0, 0);	my %REF_GRAM = ();	my ($i, $gram);	for($i = 0; $i < $length_reference; $i++)	{		$gram = $refTruth->[$i]->[$factorIndex];		$REF_GRAM{$gram}++;		next if $i<1;		$gram = $refTruth->[$i - 1]->[$factorIndex] ." ".$gram;		$REF_GRAM{$gram}++;      next if $i<2;      $gram = $refTruth->[$i - 2]->[$factorIndex] ." ".$gram;      $REF_GRAM{$gram}++;      next if $i<3;      $gram = $refTruth->[$i - 3]->[$factorIndex] ." ".$gram;      $REF_GRAM{$gram}++;	}	for($i = 0; $i < $length_translation; $i++)	{      $gram = $refSysOutput->[$i]->[$factorIndex];      if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {			$REF_GRAM{$gram}--;			$correct1++;      }      next if $i<1;      $gram = $refSysOutput->[$i - 1]->[$factorIndex] ." ".$gram;      if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {			$REF_GRAM{$gram}--;			$correct2++;      }      next if $i<2;      $gram = $refSysOutput->[$i - 2]->[$factorIndex] ." ".$gram;      if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {			$REF_GRAM{$gram}--;			$correct3++;      }      next if $i<3;      $gram = $refSysOutput->[$i - 3]->[$factorIndex] ." ".$gram;      if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {			$REF_GRAM{$gram}--;			$correct4++;      }	}	my $total = $length_translation;	$total1 = max(1, $total);	$total2 = max(1, $total - 1);	$total3 = max(1, $total - 2);	$total4 = max(1, $total - 3);		return ($correct1, $total1, $correct2, $total2, $correct3, $total3, $correct4, $total4, $length_translation, $length_reference);}##### filesystem ######open as many given files as possible; only warn about the rest#arguments: list of filename extensions to open (assume corpus name is file title)#return: hash from type string to filehandleref, giving all files that were successfully openedsub openFiles{	my ($self, @extensions) = @_;	my %openedFiles = ();	foreach my $ext (@extensions)	{		if(!open(FILE, "<" . $self->{'corpusName'} . $ext))		{			warn "Corpus::openFiles(): couldn't open '" . $self->{'corpusName'} . $ext . "' for read\n";		}		else #success		{			$openedFiles{$ext} = \*FILE;		}	}	return %openedFiles;}#read one line from each given file#arguments: hash from type string to filehandleref#return: hash from type string to sentence (stored as arrayref of arrayrefs of factors) read from corresponding filesub readLineFromFiles{	my ($self, %openedFiles) = @_;	my %lines;	foreach my $type (keys %openedFiles)	{		$lines{$type} = [];		my $sentence = <$openedFiles{$type}>;		my @words = split(/\s+/, $sentence);		foreach my $word (@words)		{			my @factors = split(/\|/, $word);			push @{$lines{$type}}, \@factors;		}	}	return %lines;}#close all given files#arguments: hash from type string to filehandleref#return: nonesub closeFiles{	my ($self, %openedFiles) = @_;	foreach my $type (keys %openedFiles)	{		close($openedFiles{$type});	}}##### write HTML ######print HTML for comparing various versions of a sentence, with special processing for each version as appropriate#arguments: filehandleref to which to write, sentence ID string, hashref of version string to sentence (stored as arrayref of arrayref of factor strings)#return: nonesub printSingleSentenceComparison{	my ($self, $fh, $sentID, $sentences) = @_;	my $curFH = select;	select $fh;	#javascript to reorder rows to look nice afterward	print "<script type=\"text/javascript\">	function reorder_$sentID()	{/*		var table = document.getElementById('div_$sentID').firstChild;		var refTransRow = table.getElementById('row_e');		var inputRow = table.getElementById('row_f');		table.removeRow(refTransRow);		table.removeRow(inputRow);		var newRow1 = table.insertRow(0);		var newRow2 = table.insertRow(1);		newRow1.childNodes = inputRow.childNodes;		newRow2.childNodes = refTransRow.childNodes;*/	}	</script>";	#html for sentences	print "<div id=\"div_$sentID\" style=\"padding: 3px; margin: 5px\">";	print "<table border=\"1\">";#	my $rowCount = 0;#	my @bgColors = ("#ffefbf", "#ffdf7f");	#process all rows in order	foreach my $sentType (keys %$sentences)	{		my $bgcolor = $bgColors[$rowCount % 2];		print "<tr id=\"row_$sentType\"><td align=right>";		#description of sentence		if(defined($self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType}))		{			print "(" . $self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType} . ")";		}		else		{			print "($sentType)";		}		print "</td><td align=left>";		#sentence with markup		if($sentType eq 'f') #input		{#			$self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $inputColor);		}		elsif($sentType eq 'e') #reference translation		{#			$self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $reftransColor);		}		else #system output		{#			$self->writeHTMLTranslationHighlightedWithFactors($fh, $sentences->{$sentType}, $sentences->{'e'}, $highlightColors);		}		print "</td></tr>";#		$rowCount++;	}	print "</table>";	print "</div>\n";	select $curFH;}#print contents of all fields of this object, with useful formatting for arrayrefs and hashrefs#arguments: none#return: nonesub printDetails{	my $self = shift;	foreach my $key (keys %$self)	{		if(ref($self->{$key}) eq 'HASH')		{			print STDERR "obj: $key => {" . join(', ', map {"$_ => " . $self->{$key}->{$_}} (keys %{$self->{$key}})) . "}\n";		}		elsif(ref($self->{$key}) eq 'ARRAY')		{			print STDERR "obj: $key => (" . join(', ', @{$self->{$key}}) . ")\n";		}		elsif(ref($self->{$key}) eq '') #not a reference		{			print STDERR "obj: $key => " . $self->{$key} . "\n";		}	}}
corpus.pm.svn-base - 源码说明

本页面展示了「解码器是基于短语的统计机器翻译系统的核心模块」中的 corpus.pm.svn-base 源码文件，采用 SVN-BASE 编程语言编写，共 1,343 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与解码器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?