📄 sentence-by-sentence.pl.svn-base
字号:
#!/usr/bin/perl -w

#sentence-by-sentence: take in a system output, with any number of factors, and a reference translation, also maybe with factors, and show each sentence and its errors
#usage: sentence-by-sentence SYSOUT [REFERENCE]+ > sentences.html
#
#Overview of what this part of the script does:
#  1. parse options (--source, --reference) and the system-output file names from @ARGV
#  2. read all system outputs and references in parallel, one sentence at a time,
#     building one HTML fragment per sentence (source / references / outputs / n-grams)
#  3. print an HTML page: stylesheet, JavaScript for re-colouring and re-sorting,
#     two legends, and finally the per-sentence fragments
#The helpers readLines, escapeMetachars, extractFactorArrays, getFactoredSentenceHTML,
#getBLEUSentenceDetails, getPWERSentenceDetails, getAllNgramsHTML, rankSentencesByBLEU
#and getNgramColorHTML are called here but defined elsewhere in the file.

use strict;
use Getopt::Long;

my $sourcefile = undef;
my @truthfiles;
GetOptions(
    "source|s=s"    => \$sourcefile,
    "reference|r=s" => \@truthfiles
) or exit(1);

my @sysoutfiles = @ARGV;
if (scalar(@sysoutfiles) == 0 || scalar(@truthfiles) == 0)
{
    print STDERR "usage: $0 system_output(s) > sentence-by-sentence.html\n"
        . "Options:\n"
        . "  --source,-s STRING      foreign input (can be used multiple times)\n"
        . "  --reference,-r STRING   English truth (can be used multiple times)\n"
        . "N-grams are colored by the number of supporting references:\n"
        . "  red for fewest, green for most, mediate shades otherwise.\n";
    exit(1);
}

####################################################################################################################

#open every reference and every system output; three-argument open keeps
#characters in a file name from being interpreted as an open mode
my @TRUTHS;
for my $i (0 .. $#truthfiles)
{
    open($TRUTHS[$i], '<', $truthfiles[$i]) or die "couldn't open '$truthfiles[$i]' for read: $!\n";
    binmode($TRUTHS[$i], ":utf8");
}
my @SYSOUTS;
for my $i (0 .. $#sysoutfiles)
{
    open($SYSOUTS[$i], '<', $sysoutfiles[$i]) or die "couldn't open '$sysoutfiles[$i]' for read: $!\n";
    binmode($SYSOUTS[$i], ":utf8");
}
binmode(STDOUT, ":utf8");
if (defined $sourcefile)
{
    open(SOURCE, '<', $sourcefile) or die "couldn't open '$sourcefile' for read: $!\n";
    binmode(SOURCE, ":utf8");
}

#for each system, a list of [sentence index, BLEU details, rank] (rank filled in later)
my @bleuScores;
for my $i (0 .. $#sysoutfiles) {push @bleuScores, [];}
my @htmlSentences;
my @javascripts;
my @htmlColors = ('#99ff99', '#aaaaff', '#ffff99', '#ff9933', '#ff9999'); #color sentences by rank (split in n tiers)
my $ngramSingleRefColor = '#aaffaa';
my @ngramMultirefColors = ('#ff9999', '#ff9933', '#ffff99', '#a0a0ff', '#99ff99'); #arbitrary-length list; first entry is used for worst n-grams
my $numSentences = 0;
my (@sLines, @eLines);
while (readLines(\@SYSOUTS, \@sLines) && readLines(\@TRUTHS, \@eLines))
{
    #create array of lines of HTML
    my @html = ("<div class=\"sentence_%%%%\" id=\"sentence$numSentences\">"); #%%%% is a flag to be replaced
    my (@sFactors, @eFactors, $sourceFactors);

    #process source
    if (defined $sourcefile)
    {
        my $sourceLine = <SOURCE>;
        $sourceLine = '' unless defined $sourceLine; #source file ran out before the system output did
        escapeMetachars($sourceLine); #remove inconsistencies in encoding
        $sourceFactors = extractFactorArrays($sourceLine);
        push @html, "<tr><td class=\"sent_title\">Source</td><td class=\"source_sentence\" id=\"source$numSentences\">"
            . getFactoredSentenceHTML($sourceFactors) . "</td></tr>\n";
    }

    #process truth
    for my $j (0 .. $#truthfiles)
    {
        escapeMetachars($eLines[$j]); #remove inconsistencies in encoding
        push @eFactors, extractFactorArrays($eLines[$j]);
        push @html, "<tr><td class=\"sent_title\">Ref $j</td><td class=\"truth_sentence\" id=\"truth${numSentences}_$j\">"
            . getFactoredSentenceHTML($eFactors[$j]) . "</td></tr>\n";
    }

    #process sysouts
    my @bleuData;
    for my $j (0 .. $#sysoutfiles)
    {
        escapeMetachars($sLines[$j]); #remove inconsistencies in encoding
        push @sFactors, extractFactorArrays($sLines[$j]);
        push @bleuData, getBLEUSentenceDetails($sFactors[$j], \@eFactors, 0);
        push @{$bleuScores[$j]}, [$numSentences, $bleuData[$j]->[0], 0]; #the last number will be the rank
        my $pwerData = getPWERSentenceDetails($sFactors[$j], \@eFactors, 0);
        push @html, "<tr><td class=\"sent_title\">Output $j</td><td class=\"sysout_sentence\" id=\"sysout$numSentences\">"
            . getFactoredSentenceHTML($sFactors[$j], $pwerData) . "</td></tr>\n";
        push @html, "<tr><td class=\"sent_title\">N-grams</td><td class=\"sysout_ngrams\" id=\"ngrams$numSentences\">"
            . getAllNgramsHTML($sFactors[$j], $bleuData[$j]->[1], scalar(@truthfiles)) . "</td></tr>\n";
    }

    #one "score (p1/p2/p3/p4)" entry per system, inserted right after the opening div
    my $bleuSummary = join("; ", map {
        sprintf("%.4lg", $_->[0]->[0]) . " (" . join('/', map {sprintf("%.4lg", $_)} @{$_->[0]}[1 .. 4]) . ") "
    } @bleuData);
    splice(@html, 1, 0, "<div class=\"bleu_report\"><b>Sentence $numSentences) BLEU:</b> " . $bleuSummary . "</div><table>\n");
    push @html, "</table></div>\n";
    push @htmlSentences, join('', @html);
    $numSentences++;
    @sLines = (); @eLines = (); #clear writable arrays to be refilled
}
foreach my $sysoutfh (@SYSOUTS) {close($sysoutfh);}
foreach my $truthfh (@TRUTHS) {close($truthfh);}
if (defined $sourcefile) {close(SOURCE);}

my $stylesheet = "<style type=\"text/css\">
.legend {background: #fff; border: 1px solid #000; padding: 2px; margin-bottom: 10px; margin-right: 15px}
.legend_title {font-weight: bold; font-size: medium; text-decoration: underline}
div.bleu_report {margin-bottom: 5px}
td.sent_title {font-weight: bold; font-size: medium; margin-bottom: 12px}
.source_sentence {background: #ffcccc; border: 1px solid #bbb}
.truth_sentence {background: #ccffcc; border: 1px solid #bbb}
.sysout_sentence {background: #ccccff; border: 1px solid #bbb}
table.sentence_table {border: none}
.sysout_ngrams {background: #fff; border: 1px solid #bbb}
table.ngram_table {}
td.ngram_cell {padding: 1px}\n";
for my $i (0 .. $#htmlColors)
{
    #CSS only has /* */ comments; a '//' comment here would be parsed as part of the next rule's selector
    $stylesheet .= ".sentence_tier${i} {background: $htmlColors[$i]; border: 1px solid #000088; padding: 0px 8px 0px 8px} /* entire composition for a given sentence */\n";
    $stylesheet .= "div.sentence_tier${i} td {margin: 8px 0px 8px 0px}\n";
}
$stylesheet .= "</style>\n";

#http-equiv names the header and content carries its value; both attributes are needed for the charset to be honoured
print "<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">\n";
print "<title>[" . join(', ', @sysoutfiles) . "] vs. [" . join(', ', @truthfiles) . "]: Sentence-by-Sentence Comparison</title>$stylesheet</head><body>\n";

foreach my $systemScores (@bleuScores) {rankSentencesByBLEU($systemScores);}

#javascript to sort by BLEU for any system, by order in corpus ...
print "<script type=\"text/javascript\">
var selectedSysout = 0; //index of system currently being used to rank/sort
function selectSysout(index)
{
	//update the BLEU-range text shown in the legend
	var legend = document.getElementById('legendBLEU');
	var rows = legend.getElementsByTagName('tr');
	for(var i = 0; i < rows.length; i++)
	{
		var cell = rows[i].childNodes[1];
		var spans = cell.getElementsByTagName('span');
		cell.childNodes[0].nodeValue = spans[index].firstChild.nodeValue; //something like '0.1 - 0.3'
	}
	//update the background colors of the sentence divs
	var allSentences = document.getElementById('all_sentences');
	var sentences = allSentences.childNodes;
	for(var i = 0; i < sentences.length; i++)
	{
		if(typeof sentences[i].tagName != 'undefined' && sentences[i].tagName.toLowerCase() == 'div') //text nodes have undefined tagName
		{
			var tierSpans = sentences[i].firstChild.childNodes;
			sentences[i].childNodes[2].className = tierSpans[index].firstChild.nodeValue; //something like 'tier3'
		}
	}
	selectedSysout = index; //selectedSysout is a flag to the sort functions
}
function sortByBLEU()
{
	var body = document.getElementById('all_sentences'); var row;
	switch(selectedSysout)
	{
\n";
for my $i (0 .. $#sysoutfiles)
{
    print "case $i: {";
    #map each rank (third field of a score entry) back to the sentence's position
    my %rank2index;
    for my $idx (0 .. $#htmlSentences)
    {
        $rank2index{$bleuScores[$i]->[$idx]->[2]} = $idx;
    }
    foreach my $rank (sort {$a <=> $b} keys %rank2index)
    {
        print "\trow = document.getElementById('everything" . $rank2index{$rank} . "');\n";
        print "\tbody.removeChild(row); body.appendChild(row);\n";
    }
    print "break;}\n";
}
print "}
}
function sortByCorpusOrder()
{
	var body = document.getElementById('all_sentences'); var row;
\n";
for my $j (0 .. $#htmlSentences)
{
    print "\trow = document.getElementById('everything$j');\n";
    print "\tbody.removeChild(row); body.appendChild(row);\n";
}
print "}
</script>\n";

#legends for background colors of sentences and n-grams
my (@minBLEU, @maxBLEU);
my @bleuTiers; #for each sentence, arrayref of tier indices for each system
for my $i (0 .. $#sysoutfiles)
{
    my @tierMin = (1e9) x scalar(@htmlColors);
    my @tierMax = (-1e9) x scalar(@htmlColors);
    for my $k (0 .. $#htmlSentences)
    {
        my $tier = int($bleuScores[$i]->[$k]->[2] / (scalar(@htmlSentences) / scalar(@htmlColors)));
        #keep the tier inside the colour list even if a rank equals the sentence count
        #NOTE(review): rankSentencesByBLEU is not visible here, so the rank range is unconfirmed
        $tier = $#htmlColors if $tier > $#htmlColors;
        push @{$bleuTiers[$k]}, $tier;
        my $score = $bleuScores[$i]->[$k]->[1]->[0];
        if ($score < $tierMin[$tier]) {$tierMin[$tier] = $score;}
        if ($score > $tierMax[$tier]) {$tierMax[$tier] = $score;}
    }
    push @minBLEU, \@tierMin;
    push @maxBLEU, \@tierMax;
}
print "<table border=0><tr><td><div id=\"legendBLEU\" class=\"legend\"><span class=\"legend_title\">Sentence Background Colors => BLEU Ranges</span><table border=0>";
for my $k (0 .. $#htmlColors)
{
    #visible text shows system 0; the hidden spans carry the range for every system (read by selectSysout)
    print "<tr><td style=\"width: 15px; height: 15px; background: " . $htmlColors[$k] . "\"></td><td align=left style=\"padding-left: 12px\">"
        . sprintf("%.4lg", $minBLEU[0]->[$k]) . " - " . sprintf("%.4lg", $maxBLEU[0]->[$k]);
    for my $j (0 .. $#sysoutfiles)
    {
        print "<span style=\"display: none\">" . sprintf("%.4lg", $minBLEU[$j]->[$k]) . " - " . sprintf("%.4lg", $maxBLEU[$j]->[$k]) . "</span>";
    }
    print "</td></tr>";
}
print "</table></div></td>\n";
print "<td><div class=\"legend\"><span class=\"legend_title\">N-gram Colors => Number of Matching Reference Translations</span><table border=0>";
for my $k (1 .. scalar(@truthfiles))
{
    print "<tr><td style=\"width: 15px; height: 15px; background: " . getNgramColorHTML($k, scalar(@truthfiles)) . "\"></td><td align=left style=\"padding-left: 12px\">$k</td></tr>";
}
print "</table></div></td></tr></table><div style=\"font-weight: bold; margin-bottom: 15px\">PWER errors are marked in red on output sentence displays.</div><div style=\"margin-bottom: 8px\">Color by system # "
    . join(' | ', map {"<a href=\"javascript:selectSysout($_);\">$_</a>" . (($_ == '0') ? " (default)" : "")} (0 .. scalar(@sysoutfiles) - 1))
    . "</div><div style=\"margin-bottom: 8px\">Sort by <a href=\"javascript:sortByBLEU();\">BLEU score</a> | <a href=\"javascript:sortByCorpusOrder();\">corpus order</a> (default)</div>\n";

#sentence boxes
print "<div id=\"all_sentences\">";
for my $j (0 .. $#htmlSentences)
{
    #child order matters to selectSysout: [0] hidden tier spans, [1] hr, [2] the sentence div
    print "<div id=\"everything$j\" style=\"margin: 0px; padding: 0px\">";
    print "<div class=\"ranks_container\" style=\"display: none\">" . join('', map {"<span>sentence_tier$_</span>"} @{$bleuTiers[$j]}) . "</div>";
    print "<hr width=98%>";
#   my $bgcolor = getSentenceBGColorHTML($bleuScores[0]->[$j], $i); #i is now # of sentences
    my $tierNum = $bleuTiers[$j]->[0];
    $htmlSentences[$j] =~ s/%%%%/tier$tierNum/;
    print "$htmlSentences[$j]</div>\n";
}
print "</div></body></html>";

##################### utils #####################

#arguments: a, b (scalars)
#return the numerically smaller of the two
sub min
{
    my ($x, $y) = @_; #not named $a/$b so the sort globals are not masked
    return ($x < $y) ? $x : $y;
}

#arguments: a, b (scalars)
#return the numerically larger of the two
sub max
{
    my ($x, $y) = @_;
    return ($x > $y) ? $x : $y;
}

#arguments: a list of elements
#return undef for an empty list, the max element otherwise
sub maxN
{
    if(scalar @_ == 0) {return undef;}
    my $val = shift @_;
    foreach my $e (@_) {if($e > $val) {$val = $e;}}
    return $val;
}

#arguments: x
#return log(x), or a large negative constant when x is false (0 or undefined)
sub my_log
{
    return -9999999999 unless $_[0];
    return log($_[0]);
}

#NOTE: the listing is truncated at this point; the body of round() is not visible
#arguments: x
sub round
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -