⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 sentence-by-sentence.pl.svn-base

📁 解码器是基于短语的统计机器翻译系统的核心模块
💻 SVN-BASE
📖 第 1 页 / 共 2 页
字号:
#!/usr/bin/perl -w

# sentence-by-sentence: take in a system output, with any number of factors,
# and a reference translation, also maybe with factors, and show each sentence
# and its errors
# usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html
#
# The script reads the system outputs and references in lockstep, builds one
# HTML fragment per sentence, then prints a single HTML page (stylesheet,
# sorting/colouring JavaScript, legends, sentence boxes) on STDOUT.
#
# Helpers called below but defined later in the file (outside this section):
# readLines, escapeMetachars, extractFactorArrays, getFactoredSentenceHTML,
# getBLEUSentenceDetails, getPWERSentenceDetails, getAllNgramsHTML,
# getNgramColorHTML, rankSentencesByBLEU.

use strict;
use warnings;
use Getopt::Long;

my $sourcefile = undef;
my @truthfiles;
GetOptions(
    "source|s=s"    => \$sourcefile,
    "reference|r=s" => \@truthfiles
) or exit(1);

# Everything left on the command line is a system-output file.
my @sysoutfiles = @ARGV;
if (scalar(@sysoutfiles) == 0 || scalar(@truthfiles) == 0) {
    print STDERR "usage: $0 system_output(s) > sentence-by-sentence.html
Options:
  --source,-s STRING      foreign input (can be used multiple times)
  --reference,-r STRING   English truth (can be used multiple times)

N-grams are colored by the number of supporting references: red for fewest, green for most, mediate shades otherwise.\n";
    exit(1);
}

####################################################################################################################

# One read handle per reference file. Three-argument open keeps a file name
# that starts with '<', '>' or '|' from being interpreted as an open mode.
my @TRUTHS;
for my $i (0 .. $#truthfiles) {
    open($TRUTHS[$i], '<', $truthfiles[$i])
        or die "couldn't open '$truthfiles[$i]' for read: $!\n";
    binmode($TRUTHS[$i], ":utf8");
}

# One read handle per system-output file.
my @SYSOUTS;
for my $i (0 .. $#sysoutfiles) {
    open($SYSOUTS[$i], '<', $sysoutfiles[$i])
        or die "couldn't open '$sysoutfiles[$i]' for read: $!\n";
    binmode($SYSOUTS[$i], ":utf8");
}
binmode(STDOUT, ":utf8");

# The source (foreign) file is optional.
if (defined $sourcefile) {
    open(SOURCE, '<', $sourcefile)
        or die "couldn't open '$sourcefile' for read: $!\n";
    binmode(SOURCE, ":utf8");
}

# $bleuScores[system] is a list of [sentence index, BLEU data, rank] triples.
my @bleuScores;
for my $i (0 .. $#sysoutfiles) {
    push @bleuScores, [];
}
my @htmlSentences;
my @javascripts;
my @htmlColors = ('#99ff99', '#aaaaff', '#ffff99', '#ff9933', '#ff9999'); #color sentences by rank (split in n tiers)
my $ngramSingleRefColor = '#aaffaa';
my @ngramMultirefColors = ('#ff9999', '#ff9933', '#ffff99', '#a0a0ff', '#99ff99'); #arbitrary-length list; first entry is used for worst n-grams
my $numSentences = 0;

# Main loop: one iteration per sentence, for as long as both the system
# outputs and the references still yield lines.
# NOTE(review): readLines is defined outside this view; it presumably fills
# the given array with one line per handle and returns false at EOF - confirm.
my (@sLines, @eLines);
while (readLines(\@SYSOUTS, \@sLines) && readLines(\@TRUTHS, \@eLines)) {
    #create array of lines of HTML
    my @html = ("<div class=\"sentence_%%%%\" id=\"sentence$numSentences\">"); #%%%% is a flag to be replaced
    my (@sFactors, @eFactors, $sourceFactors);

    #process source
    if (defined $sourcefile) {
        my $sourceLine = <SOURCE>;
        # A source file shorter than the outputs yields undef here; treat the
        # missing line as empty instead of handing undef to the helpers.
        $sourceLine = '' unless defined $sourceLine;
        escapeMetachars($sourceLine); #remove inconsistencies in encoding
        $sourceFactors = extractFactorArrays($sourceLine);
        push @html, "<tr><td class=\"sent_title\">Source</td><td class=\"source_sentence\" id=\"source$numSentences\">"
                    . getFactoredSentenceHTML($sourceFactors) . "</td></tr>\n";
    }

    #process truth
    for my $j (0 .. $#truthfiles) {
        escapeMetachars($eLines[$j]); #remove inconsistencies in encoding
        push @eFactors, extractFactorArrays($eLines[$j]);
        push @html, "<tr><td class=\"sent_title\">Ref $j</td><td class=\"truth_sentence\" id=\"truth${numSentences}_$j\">"
                    . getFactoredSentenceHTML($eFactors[$j]) . "</td></tr>\n";
    }

    #process sysouts
    # NOTE(review): the sysout/ngrams ids below do not include $j, so they
    # repeat when several systems are given; left unchanged because code
    # outside this view may rely on them.
    my @bleuData;
    for my $j (0 .. $#sysoutfiles) {
        escapeMetachars($sLines[$j]); #remove inconsistencies in encoding
        push @sFactors, extractFactorArrays($sLines[$j]);
        push @bleuData, getBLEUSentenceDetails($sFactors[$j], \@eFactors, 0);
        push @{$bleuScores[$j]}, [$numSentences, $bleuData[$j]->[0], 0]; #the last number will be the rank
        my $pwerData = getPWERSentenceDetails($sFactors[$j], \@eFactors, 0);
        push @html, "<tr><td class=\"sent_title\">Output $j</td><td class=\"sysout_sentence\" id=\"sysout$numSentences\">"
                    . getFactoredSentenceHTML($sFactors[$j], $pwerData) . "</td></tr>\n";
        push @html, "<tr><td class=\"sent_title\">N-grams</td><td class=\"sysout_ngrams\" id=\"ngrams$numSentences\">"
                    . getAllNgramsHTML($sFactors[$j], $bleuData[$j]->[1], scalar(@truthfiles)) . "</td></tr>\n";
    }

    # Insert the BLEU summary (and the opening <table>) right after the
    # opening <div>. For each system, element 0 of its BLEU array is printed
    # first, followed by elements 1..4 separated by slashes.
    splice(@html, 1, 0,
        "<div class=\"bleu_report\"><b>Sentence $numSentences)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;BLEU:</b> "
        . join("; ", map {sprintf("%.4lg", $_->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$_->[0]}[1 .. 4]) . ") "} @bleuData)
        . "</div><table>\n");
    push @html, "</table></div>\n";
    push @htmlSentences, join('', @html);
    $numSentences++;
    @sLines = (); @eLines = (); #clear writable arrays to be refilled
}

foreach my $sysoutfh (@SYSOUTS) {close($sysoutfh);}
foreach my $truthfh (@TRUTHS) {close($truthfh);}
close(SOURCE) if defined $sourcefile;

# Static part of the stylesheet; the per-tier rules are appended below.
my $stylesheet = "<style type=\"text/css\">
.legend {background: #fff; border: 1px solid #000; padding: 2px; margin-bottom: 10px; margin-right: 15px}
.legend_title {font-weight: bold; font-size: medium; text-decoration: underline}
div.bleu_report {margin-bottom: 5px}
td.sent_title {font-weight: bold; font-size: medium; margin-bottom: 12px}
.source_sentence {background: #ffcccc; border: 1px solid #bbb}
.truth_sentence {background: #ccffcc; border: 1px solid #bbb}
.sysout_sentence {background: #ccccff; border: 1px solid #bbb}
table.sentence_table {border: none}
.sysout_ngrams {background: #fff; border: 1px solid #bbb}
table.ngram_table {}
td.ngram_cell {padding: 1px}\n";
for my $i (0 .. $#htmlColors) {
    # CSS only knows /* ... */ comments; a '//' comment would make the
    # browser discard the rule that follows it.
    $stylesheet .= ".sentence_tier$i {background: $htmlColors[$i]; border: 1px solid #000088; padding: 0px 8px 0px 8px} /* entire composition for a given sentence */\n";
    $stylesheet .= "div.sentence_tier$i td {margin: 8px 0px 8px 0px}\n";
}
$stylesheet .= "</style>\n";

# http-equiv names the header and content carries its value.
print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n";
print "<title>[" . join(', ', @sysoutfiles) . "] vs. [" . join(', ', @truthfiles)
      . "]: Sentence-by-Sentence Comparison</title>$stylesheet</head><body>\n";

# Fill in the rank slot (index 2) of every per-sentence score triple.
foreach my $systemScores (@bleuScores) {rankSentencesByBLEU($systemScores);}

#javascript to sort by BLEU for any system, by order in corpus ...
# The line breaks inside this string matter: the JavaScript uses '//'
# comments, which run to the end of the line.
print "<script type=\"text/javascript\">
var selectedSysout = 0; //index of system currently being used to rank/sort
function selectSysout(index)
{
	//update the BLEU-range text shown in the legend
	var legend = document.getElementById('legendBLEU');
	var rows = legend.getElementsByTagName('tr');
	for(var i = 0; i < rows.length; i++)
	{
		var cell = rows[i].childNodes[1];
		var spans = cell.getElementsByTagName('span');
		cell.childNodes[0].nodeValue = spans[index].firstChild.nodeValue; //something like '0.1 - 0.3'
	}
	
	//update the background colors of the sentence divs
	var allSentences = document.getElementById('all_sentences');
	var sentences = allSentences.childNodes;
	for(var i = 0; i < sentences.length; i++)
	{
		if(typeof sentences[i].tagName != 'undefined' && sentences[i].tagName.toLowerCase() == 'div') //text nodes have undefined tagName
		{
			var tierSpans = sentences[i].firstChild.childNodes;
			sentences[i].childNodes[2].className = tierSpans[index].firstChild.nodeValue; //something like 'tier3'
		}
	}
	selectedSysout = index; //selectedSysout is a flag to the sort functions
}
function sortByBLEU()
{
	var body = document.getElementById('all_sentences'); var row;
	switch(selectedSysout)
	{\n";
# One 'case' per system: re-append the sentence containers in rank order.
for my $i (0 .. $#sysoutfiles) {
    print "case $i:\n\t{";
    my %rank2index = map { ($bleuScores[$i]->[$_]->[2] => $_) } (0 .. scalar(@htmlSentences) - 1);
    foreach my $rank (sort {$a <=> $b} keys %rank2index) {
        print "\trow = document.getElementById('everything" . $rank2index{$rank} . "');\n";
        print "\tbody.removeChild(row); body.appendChild(row);\n";
    }
    print "break;}\n";
}
print "}
}
function sortByCorpusOrder()
{
	var body = document.getElementById('all_sentences'); var row;\n";
for my $j (0 .. $#htmlSentences) {
    print "\trow = document.getElementById('everything$j');\n";
    print "\tbody.removeChild(row); body.appendChild(row);\n";
}
print "}
</script>\n";

#legends for background colors of sentences and n-grams
my (@minBLEU, @maxBLEU);
my @bleuTiers; #for each sentence, arrayref of tier indices for each system
for my $i (0 .. $#sysoutfiles) {
    # Per-tier minimum and maximum BLEU, seeded with sentinels so that the
    # first real score always replaces them.
    my @tierMin = (1e9) x scalar(@htmlColors);
    my @tierMax = (-1e9) x scalar(@htmlColors);
    for my $k (0 .. $#htmlSentences) {
        # Map the sentence's rank onto one of scalar(@htmlColors) tiers.
        my $tier = int($bleuScores[$i]->[$k]->[2] / (scalar(@htmlSentences) / scalar(@htmlColors)));
        # Guard against a rank equal to the sentence count, which would
        # otherwise index one past the last colour.
        $tier = $#htmlColors if $tier > $#htmlColors;
        push @{$bleuTiers[$k]}, $tier;
        my $bleu = $bleuScores[$i]->[$k]->[1]->[0];
        if ($bleu < $tierMin[$tier]) {$tierMin[$tier] = $bleu;}
        if ($bleu > $tierMax[$tier]) {$tierMax[$tier] = $bleu;}
    }
    push @minBLEU, \@tierMin;
    push @maxBLEU, \@tierMax;
}

# BLEU-range legend. The visible text shows system 0; the hidden <span>s hold
# the range for every system so selectSysout() can swap them in. No
# whitespace is emitted between the cells because the JavaScript addresses
# them by childNodes index.
print "<table border=0><tr><td><div id=\"legendBLEU\" class=\"legend\"><span class=\"legend_title\">Sentence Background Colors => BLEU Ranges</span><table border=0>";
for my $k (0 .. $#htmlColors) {
    print "<tr><td style=\"width: 15px; height: 15px; background: " . $htmlColors[$k] . "\"></td><td align=left style=\"padding-left: 12px\">"
          . sprintf("%.4lg", $minBLEU[0]->[$k]) . " - " . sprintf("%.4lg", $maxBLEU[0]->[$k]);
    for my $j (0 .. $#sysoutfiles) {
        print "<span style=\"display: none\">" . sprintf("%.4lg", $minBLEU[$j]->[$k]) . " - " . sprintf("%.4lg", $maxBLEU[$j]->[$k]) . "</span>";
    }
    print "</td></tr>";
}
print "</table></div></td>\n";

# N-gram colour legend: one row per possible count of matching references.
print "<td><div class=\"legend\"><span class=\"legend_title\">N-gram Colors => Number of Matching Reference Translations</span><table border=0>";
for my $k (1 .. scalar(@truthfiles)) {
    print "<tr><td style=\"width: 15px; height: 15px; background: " . getNgramColorHTML($k, scalar(@truthfiles)) . "\"></td><td align=left style=\"padding-left: 12px\">$k</td></tr>";
}
print "</table></div></td></tr></table><div style=\"font-weight: bold; margin-bottom: 15px\">PWER errors are marked in red on output sentence displays.</div><div style=\"margin-bottom: 8px\">Color by system # "
      . join(' | ', map {"<a href=\"javascript:selectSysout($_);\">$_</a>" . (($_ == 0) ? " (default)" : "")} (0 .. $#sysoutfiles))
      . "</div><div style=\"margin-bottom: 8px\">Sort by <a href=\"javascript:sortByBLEU();\">BLEU score</a> | <a href=\"javascript:sortByCorpusOrder();\">corpus order</a> (default)</div>\n";

#sentence boxes
# Each container holds, in this exact order (the JavaScript indexes them):
# the hidden tier list, an <hr>, and the sentence <div> itself.
print "<div id=\"all_sentences\">";
for my $j (0 .. $#htmlSentences) {
    print "<div id=\"everything$j\" style=\"margin: 0px; padding: 0px\">";
    print "<div class=\"ranks_container\" style=\"display: none\">" . join('', map {"<span>sentence_tier$_</span>"} @{$bleuTiers[$j]}) . "</div>";
    print "<hr width=98%>";
#    my $bgcolor = getSentenceBGColorHTML($bleuScores[0]->[$j], $i); #i is now # of sentences
    # Initial colouring uses system 0's tier; this replaces the %%%% flag
    # placed in the opening <div> when the sentence HTML was built.
    my $tierNum = $bleuTiers[$j]->[0];
    $htmlSentences[$j] =~ s/%%%%/tier$tierNum/;
    print "$htmlSentences[$j]</div>\n";
}
print "</div></body></html>";

##################### utils #####################

#arguments: a, b (scalars)
#return the numerically smaller of the two
sub min
{
    # Locals are not named $a/$b so they cannot shadow sort's package variables.
    my ($x, $y) = @_;
    return ($x < $y) ? $x : $y;
}

#arguments: a, b (scalars)
#return the numerically larger of the two
sub max
{
    my ($x, $y) = @_;
    return ($x > $y) ? $x : $y;
}

#arguments: a list of elements
#return undef for an empty list, the max element otherwise
sub maxN
{
    if(scalar @_ == 0) {return undef;}
    my $val = shift @_;
    foreach my $e (@_) {if($e > $val) {$val = $e;}}
    return $val;
}

#arguments: x
#return log(x), or a large negative stand-in when x is zero/false/undef
sub my_log
{
    return -9999999999 unless $_[0];
    return log($_[0]);
}

#arguments: x
sub round

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -