📄 sentence-by-sentence.pl.svn-base

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 SVN-BASE
📖 第 1 页 / 共 2 页
字号:
上一页 12
{	my $x = shift;	return ($x - int($x) < .5) ? int($x) : int($x) + 1;}#escape HTML metacharacters for display purposes and to allow for consistent string comparison#arguments: string to be formatted in place#return: nonesub escapeMetachars{	my $str = shift;	$str =~ s/&\s+/&amp; /;	$str =~ s/<\s+/&lt; /;	$str =~ s/>\s+/&gt; /;}################################################################################################################################################################read one line from each of any number of filehandles#arguments: arrayref of filehandles, (empty) arrayref to be filled with read lines#return: 1 on success, 0 on failure (on failure the lines arrayref's value isn't defined)sub readLines{	my ($refFilehandles, $refLines) = @_;	foreach my $fh (@$refFilehandles)	{		my $line = <$fh>;		return 0 unless defined($line);		push @$refLines, $line;	}	return 1;}#arguments: line read from corpus file#return: sentence (arrayref of arrayrefs of factor strings) taken from linesub extractFactorArrays{	my $line = shift;	die "" if !defined $line;	chomp $line;	$line =~ s/^\s*|\s*$//g; #added by Ondrej to handle moses-mert-parallel output	my @words = split(/\s+/, $line);	my @factors = map {my @f = split(/\|/, $_); \@f;} @words;	return \@factors;}#can handle multiple reference translations; assume at least one#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use#return: arrayref of [arrayref of [overall BLEU score, n-gram precisions], arrayref of matching n-gram [start index, length, arrayref of indices of matching references]]sub getBLEUSentenceDetails{	my $maxNgramOrder = 4;	my ($refSysOutput, $refTruths, $factorIndex) = @_;	my $length_translation = scalar(@$refSysOutput); #length of sysout sentence	my @length_references = map {scalar(@$_)} @$refTruths;	my $closestTruthLength = (sort(map {abs($_ - $length_translation)} @length_references))[0];	my @correct = (0) x $maxNgramOrder; #n-gram counts	my @total = (0) x $maxNgramOrder; #n-gram counts	my $returnData = [[], []];	my %REF_GRAM; #hash from n-gram to arrayref with # of times found in each reference	my $ngramMatches = []; #arrayref of n-gram [start index, length]	for(my $j = 0; $j < scalar(@$refTruths); $j++)	{		for(my $i = 0; $i < $length_references[$j]; $i++)		{			my $gram = '';			for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders			{				$gram = $refTruths->[$j]->[$i - $k]->[$factorIndex] . " " . $gram;				#increment the count for the given n-gram and given reference number				if(!exists $REF_GRAM{$gram})				{					my @tmp = (0) x scalar @$refTruths;					$tmp[$j] = 1;					$REF_GRAM{$gram} = \@tmp;				}				else				{					$REF_GRAM{$gram}->[$j]++;				}			}		}	}	for(my $i = 0; $i < $length_translation; $i++)	{		my $gram = '';		for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders		{			$gram = $refSysOutput->[$i - $k]->[$factorIndex] . " " . $gram;			if(exists $REF_GRAM{$gram}) #this n-gram was found in at least one reference			{				$correct[$k]++;				my @indices = ();				my $notOvercounting = 0; #make sure we don't 'match' against truth n-grams whose instances have all been used already				for(my $m = 0; $m < scalar(@{$REF_GRAM{$gram}}); $m++)				{					if($REF_GRAM{$gram}->[$m] > 0)					{						push @indices, $m;						$REF_GRAM{$gram}->[$m]--;						$notOvercounting = 1;					}				}				if($notOvercounting == 1) {push @$ngramMatches, [$i - $k, $k + 1, \@indices];}			}		}	}	my $brevity = ($length_translation > $closestTruthLength || $length_translation == 0) ? 1 : exp(1 - $closestTruthLength / $length_translation);	my @pct;	my ($logsum, $logcount) = (0, 0);	for(my $i = 0; $i < $maxNgramOrder; $i++)	{		$total[$i] = max(1, $length_translation - $i);		push @pct, ($total[$i] == 0) ? -1 : $correct[$i] / $total[$i];		if($total[$i] > 0)		{			$logsum += my_log($pct[$i]);			$logcount++;		}	}	my $bleu = $brevity * exp($logsum / $logcount);	$returnData->[0] = [$bleu, @pct];	$returnData->[1] = $ngramMatches;	return $returnData;}#can handle multiple sentence translations; assume at least one#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use#return: hashref of sysout word index => whether word matchessub getPWERSentenceDetails{	my ($refSysOutput, $refTruths, $factorIndex) = @_;	my $matches = {};	my %truthWords; #hash from word to arrayref with number of times seen in each reference (but later holds only the max)	for(my $i = 0; $i < scalar(@$refTruths); $i++)	{		foreach my $eWord (@{$refTruths->[$i]})		{			my $factor = $eWord->[$factorIndex];			if(exists $truthWords{$factor}) {$truthWords{$factor}->[$i]++;}			else {my @tmp = (0) x scalar(@$refTruths); $tmp[$i] = 1; $truthWords{$factor} = \@tmp;}		}	}	%truthWords = map {$_ => maxN(@{$truthWords{$_}})} (keys %truthWords); #save only the max times each word is seen in a reference	for(my $j = 0; $j < scalar(@$refSysOutput); $j++)	{		if(exists $truthWords{$refSysOutput->[$j]->[$factorIndex]} && $truthWords{$refSysOutput->[$j]->[$factorIndex]} > 0)		{			$truthWords{$refSysOutput->[$j]->[$factorIndex]}--;			$matches->{$j} = 1;		}		else		{			$matches->{$j} = 0;		}	}	return $matches;}#assign ranks to sentences by BLEU score#arguments: arrayref of arrayrefs of [sentence index, arrayref of [bleu score, n-gram precisions], rank to be assigned]#return: nonesub rankSentencesByBLEU{	my $bleuData = shift;	my $i = 0;	#sort first on score, then on 1-gram accuracy, then on sentence index	foreach my $sentenceData (reverse sort {my $c = $a->[1]->[0] <=> $b->[1]->[0]; if($c == 0) {my $d = $a->[1]->[1] <=> $b->[1]->[1]; if($d == 0) {$a->[0] cmp $b->[0];} else {$d;}} else {$c;}} @$bleuData) {$sentenceData->[2] = $i++;}}################################################################################################################################################################write HTML for a sentence containing factors (display words in a row)#arguments: sentence  (arrayref of arrayrefs of factor strings), PWER results (hashref from word indices to 0/1 whether matched a truth word)#return: HTML stringsub getFactoredSentenceHTML{	my $sentence = shift;	my $pwer = 0; if(scalar(@_) > 0) {$pwer = shift;}	my $html = "<table class=\"sentence_table\"><tr>";	for(my $i = 0; $i < scalar(@$sentence); $i++) #loop over words	{		my $style = ''; #default		if($pwer ne '0' && $pwer->{$i} == 0) {$style = 'color: #cc0000; font-weight: bold';}		$html .= "<td align=center style=\"$style\">" . join("<br>", @{$sentence->[$i]}) . "</td>";	}	return $html . "</tr></table>";}#arguments: arrayref of [sentence index, arrayref of [bleu score, n-gram precisions], rank], number of sentences#return: HTML color stringsub getSentenceBGColorHTML{	my ($scoreData, $numSentences) = @_;	my $tier = int($scoreData->[2] / ($numSentences / scalar(@htmlColors))); #0..n-1	return $htmlColors[$tier];}#display all matching n-grams in the given sentence, with all 1-grams on one line, then arranged by picking, for each, the first line on which it fits,# where a given word position can only be filled by one n-gram per line, so that all n-grams can be shown#arguments: sentence (arrayref of arrayrefs of factor strings), arrayref of arrayrefs of matching n-gram [start, length, arrayref of matching reference indices], # number of reference translations#return: HTML stringsub getAllNgramsHTML{	my ($sentence, $ngrams, $numTruths) = @_;	my $factorIndex = 0;	my @table = (); #array or arrayrefs each of which represents a line; each position has the index of the occupying n-gram, or -1 if none	my $n = 0; #n-gram index	foreach my $ngram (sort {$a->[0] <=> $b->[0]} @$ngrams)	{		#check for an open slot in an existing row		my $foundRow = 0;		my $r = 0;		foreach my $row (@table)		{			if(rowIsClear($row, $ngram) == 1)			{				@{$row}[$ngram->[0] .. ($ngram->[0] + $ngram->[1] - 1)] = ($n) x $ngram->[1];				push @$ngram, $r; #add row index				$foundRow = 1;				last;			}			$r++;		}		#add row if necessary		if($foundRow == 0)		{			my @row = (-1) x scalar(@$sentence);			@row[$ngram->[0] .. ($ngram->[0] + $ngram->[1] - 1)] = ($n) x $ngram->[1];			push @$ngram, scalar(@table); #add row index			push @table, \@row;		}		$n++;	}		my $html = "<table class=\"ngram_table\"><tr><td align=center>" . join("</td><td align=center>", map {$_->[$factorIndex]} @$sentence) . "</td></tr>";		my $numWords = scalar(@$sentence);	my ($curRow, $curCol) = (0, 0); #address in table	$html .= "<tr>";	foreach my $ngram (sort {my $c = $a->[3] <=> $b->[3]; if($c == 0) {$a->[0] <=> $b->[0]} else {$c}} @$ngrams) #sort by row, then word num	{		while($ngram->[0] > $curCol || $ngram->[3] > $curRow) {$html .= "<td></td>"; $curCol = ($curCol + 1) % $numWords; if($curCol == 0) {$html .= "</tr><tr>"; $curRow++;}}		$html .= "<td colspan=" . $ngram->[1] . " align=center class=\"ngram_cell\" style=\"background: " . getNgramColorHTML(scalar(@{$ngram->[2]}), $numTruths) . "\">" . join(' ', map {$_->[$factorIndex]} @{$sentence}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) . "</td>";		$curCol = ($curCol + $ngram->[1]) % $numWords; if($curCol == 0) {$html .= "</tr><tr>"; $curRow++;}	}	$html .= "</tr>";	return $html . "</table>\n";}#auxiliary to getAllNgramsHTML(): check a table row for an empty piece at the right place for the given n-gram#arguments: row (arrayref of ints), n-gram (arrayref of [start index, length])#return: whether (0/1) row is clearsub rowIsClear{	my ($row, $ngram) = @_;	return (maxN(@{$row}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) == -1) ? 1 : 0;}#auxiliary to getAllNgramsHTML()#arguments: number of reference translations matching the n-gram, total number of references#return: HTML color stringsub getNgramColorHTML{	my ($matches, $total) = @_;	if($total == 1) {return $ngramSingleRefColor;}	return $ngramMultirefColors[round($matches / $total * (scalar(@ngramMultirefColors) - 1))];}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -