📄 newsmtgui.cgi.svn-base
字号:
#!/usr/bin/perl -w
#
# CGI front end for browsing translation-evaluation corpora.
#
# The ACTION request parameter selects the page:
#   (none)        list every corpus that is a key of %factorData
#   VIEW_CORPUS   per-file table for the corpus named by CORPUS
#   SCORE_FILE, RESCORE_FILE, COMPARE
#                 handled by score_file() / compare(), which are defined
#                 outside this section
# Running the script with the arguments `bleu <corpus>` is treated as a
# VIEW_CORPUS request for <corpus>.

use strict;
use CGI;
use Corpus; #Evan's code
use Error qw(:try);

# Files with extensions other than these are interpreted as system
# translations; see the file 'file-descriptions', if it exists, for the
# comments that go with them.
my %FILETYPE = (
    'e'       => 'Reference Translation',
    'f'       => 'Foreign Original',
    'ref.sgm' => 'Reference Translations',
    'e.sgm'   => 'Reference Translations',
    'src.sgm' => 'Foreign Originals',
    'f.sgm'   => 'Foreign Originals',
);

# Extensions that view_corpus does not score (sources and references).
my %DONTSCORE = (
    'f'       => 1,
    'f.sgm'   => 1,
    'src.sgm' => 1,
    'e'       => 1,
    'e.sgm'   => 1,
    'ref.sgm' => 1,
);

# NOTE(review): @SHOW and %SHOW_COLOR are not used in the visible part of
# the file; presumably the comparison view reads them -- verify.
my @SHOW = ('f', 'e', 'comm');
my %SHOW_COLOR = ('f' => "BLUE", 'e' => "GREEN");
my $FOREIGN = 'f';

#FILEDESC: textual descriptions associated with specific filenames; to be
#displayed on the single-corpus view
my %FILEDESC = ();
load_descriptions();
my %factorData = loadFactorData('file-factors');
my %MEMORY;
load_memory();

# Contents of the score memory files, one element per line (empty when the
# file does not exist).  Lines are chomp()ed rather than chop()ed so that a
# final line without a trailing newline does not lose its last character.
my @mBLEU = _read_chomped_lines('mbleu-memory.dat');
my @NIST  = _read_chomped_lines('nist-memory.dat');

my %in;
ReadParse(); #parse arguments

# Command-line shortcut: `<script> bleu <corpus>` behaves like a
# VIEW_CORPUS request.
if (scalar(@ARGV) > 0 && $ARGV[0] eq 'bleu') {
    $in{CORPUS} = $ARGV[1];
    $in{ACTION} = "VIEW_CORPUS";
}

# Segments of "<corpus>.ref.sgm", grouped by the sysid attribute of the most
# recently seen <DOC> tag:  sysid => [ segment text, ... ]
my %MULTI_REF;
if ($in{CORPUS} && -e "$in{CORPUS}.ref.sgm") {
    my $ref_path = "$in{CORPUS}.ref.sgm";
    # Three-argument open with a lexical handle: CORPUS comes from the
    # request, so it must never be able to act as an open mode or a pipe.
    if (open(my $ref_fh, '<', $ref_path)) {
        my $sysid;
        while (my $line = <$ref_fh>) {
            $sysid = $1 if $line =~ /<DOC.+sysid=\"([^\"]+)\"/;
            if ($line =~ /<seg[^>]*> *(\S.+\S) *<\/seg>/) {
                push @{$MULTI_REF{$sysid}}, $1;
            }
        }
        close($ref_fh);
    }
    else {
        # Best effort, as before: the page is still rendered, just without
        # the multi-reference data.
        warn "couldn't open '$ref_path' for read: $!";
    }
}

# Dispatch on the requested action.  A missing ACTION is treated as the
# empty string so the comparison does not warn under -w.
my $action = defined($in{ACTION}) ? $in{ACTION} : '';
if    ($action eq '')             { show_corpora(); }
elsif ($action eq 'VIEW_CORPUS')  { view_corpus(); }
elsif ($action eq 'SCORE_FILE')   { score_file(); }
elsif ($action eq 'RESCORE_FILE') { score_file(); }
elsif ($action eq 'COMPARE')      { compare(); }
else {
    # The action name is raw request input; escape it before it becomes
    # part of the page.
    htmlhead("Unknown Action " . _escape_html($action));
}
print "</BODY></HTML>\n";

###### SHOW CORPORA IN EVALUATION DIRECTORY
# Prints the "All Corpora" page: one link per key of %factorData (the
# factor-index file, already read in), sorted by name.  Each link requests
# the VIEW_CORPUS page for that corpus.
sub show_corpora {
    htmlhead("All Corpora");
    print "<UL>\n";
    foreach my $corpus_name (sort keys %factorData) {
        # URL-escape the name inside the link, HTML-escape it in the label.
        print "<LI><A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($corpus_name)
            . "\">Corpus " . _escape_html($corpus_name) . "</A>\n";
    }
    print "</UL>\n";
}

# Returns the lines of $path with their trailing newline removed.  Returns
# an empty list when the file does not exist or cannot be opened (the
# latter is reported with warn).
sub _read_chomped_lines {
    my ($path) = @_;
    return () unless -e $path;
    my $fh;
    unless (open($fh, '<', $path)) {
        warn "couldn't open '$path' for read: $!";
        return ();
    }
    my @lines = <$fh>;
    close($fh);
    chomp(@lines);
    return @lines;
}

# Returns $text with the HTML metacharacters & < > " replaced by entities;
# undef becomes the empty string.
sub _escape_html {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

###### SHOW INFORMATION FOR ONE CORPUS
# NOTE(review): $in{CORPUS} is interpolated into shell commands (backticks)
# and into HTML below without validation or escaping.
sub view_corpus {
    my @TABLE;
    &htmlhead("View Corpus $in{CORPUS}");

    # find corpora in evaluation directory
    my $corpus = new Corpus('-name' => "$in{CORPUS}",
                            '-descriptions' => \%FILEDESC,
                            '-info_line' => $factorData{$in{CORPUS}});
#   $corpus->printDetails(); #debugging info

    my ($sentence_count, $lineInfo);
    if(-e "$in{CORPUS}.f") {
        $lineInfo = `wc -l $in{CORPUS}.f`;
        $lineInfo =~ /^\s*(\d+)\s+/;
        $sentence_count = 0 + $1;
    }
    else {
        $lineInfo = `wc -l $in{CORPUS}.e`;
        $lineInfo =~ /^\s*(\d+)\s+/;
        $sentence_count = 0 + $1;
    }

    print "Corpus '$in{CORPUS}' consists of $sentence_count sentences\n";
    print "(<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&mBLEU=1>with mBLEU</A>)" if ((!defined($in{mBLEU})) && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f");
    print "<P>\n";
    print "<FORM ACTION=''>\n";
    print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=COMPARE>\n";
    print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
    print "<TABLE BORDER=1 CELLSPACING=0><TR><TD>File (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS}).">sort</A>)</TD><TD>Date (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=TIME>sort</A>)</TD>";
    if (-e "$in{CORPUS}.e") {
        print "<TD>IBM BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=IBM>sort</A>)</TD>";
    }
    if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm") {
        print "<TD>NIST (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=NIST>sort</A>)</TD>";
        if (! -e "$in{CORPUS}.e") {
            print "<TD>BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . 
CGI::escape($in{CORPUS})."&SORT=BLEU>sort</A>)</TD>"; } } if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f") { print "<TD>mBLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=mBLEU>sort</A>)</TD>"; } print "<TD>Unknown Words</TD>"; #can't sort on; only applies to the input print "<TD>Perplexity</TD>"; #applies to truth and system outputs print "<TD>WER (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=WER>sort</A>)</TD>"; print "<TD>Noun & adj WER-PWER</TD>"; #can't sort on; only applies to sysoutputs print "<TD>Surface vs. lemma PWER</TD>"; #can't sort on; only applies to sysoutputs print "<TD>Statistical Measures</TD>"; opendir(DIR, ".") or die "couldn't open '.' for read"; my @filenames = readdir(DIR); #includes . and .. closedir(DIR); foreach $_ (@filenames) { next if -d $_; #if is a directory my $sgm = 0; if (/.sgm$/) { `grep '<seg' $_ | wc -l` =~ /^\s*(\d+)\s+/; next unless $1 == $sentence_count; $sgm = 1; } else { `wc -l $_` =~ /^\s*(\d+)\s+/; next unless $1 == $sentence_count; } next unless /^$in{CORPUS}\.([^\/]+)$/; my $file = $1; my $sort = ""; # checkbox for compare my $row = "<TR><TD style=\"font-size: small\"><INPUT TYPE=CHECKBOX NAME=FILE_$file VALUE=1>"; # README if (-e "$in{CORPUS}.$file.README") { my $readme = `cat $in{CORPUS}.$file.README`; $readme =~ s/([\"\'])/\\\"/g; $readme =~ s/[\n\r]/\\n/g; $readme =~ s/\t/\\t/g; $row .= "<A HREF='javascript:FieldInfo(\"$in{CORPUS}.$file\",\"$readme\")'>"; } # filename $row .= "$file</A>"; # description (hard-coded) my @TRANSLATION_SENTENCE = `cat $in{CORPUS}.$file`; chop(@TRANSLATION_SENTENCE); #count sentences that contain null words my $null_count = 0; foreach (@TRANSLATION_SENTENCE) { $null_count++ if /^NULL$/ || /^NONE$/; } if ($null_count > 0) { $row .= "$null_count NULL "; } $row .= " (".$FILETYPE{$file}.")" if defined($FILETYPE{$file}); $row .= " (".$FILEDESC{$in{CORPUS}.".".$file}.")" if 
defined($FILEDESC{$in{CORPUS}.".".$file}); $row .= " (".$FILEDESC{$file}.")" if defined($FILEDESC{$file}); # filedate my @STAT = stat("$in{CORPUS}.$file"); my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($STAT[8]); #STAT[8] should be last modify time my $time = sprintf("%04d-%02d-%02d %02d:%02d:%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec); $row .= "</TD>\n<TD>".$time."</TD>\n"; if (defined($in{SORT}) && $in{SORT} eq 'TIME') { $sort = $time; } # IBM BLEU score my $no_bleu =0; if (!$sgm && -e "$in{CORPUS}.e") { $row .= "<TD>"; if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e" && $file !~ /^pt/) { my ($score,$p1,$p2,$p3,$p4,$bp) = $corpus->calcBLEU($file, 'surf'); print STDERR "193: `$score `$p1 `$p2 `$p3 `$p4 `$bp\n"; $row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f", $score, $p1, $p2, $p3, $p4, $bp); if (defined($in{SORT}) && $in{SORT} eq 'IBM') { $sort = $score; } } $row .= "</TD>\n"; } else { $no_bleu=1; } # NIST score if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm" && !$DONTSCORE{$file}) { $row .= "<TD>"; print "$DONTSCORE{$file}+"; my ($nist,$nist_bleu); if ($file =~ /sgm$/) { ($nist,$nist_bleu) = get_nist_score("$in{CORPUS}.ref.sgm","$in{CORPUS}.src.sgm","$in{CORPUS}.$file"); $row .= sprintf("<B>%.04f</B>",$nist); if ($in{SORT} eq 'NIST') { $sort = $nist; } } $row .= "</TD>\n"; if ($no_bleu) { $row .= "<TD>"; if ($file =~ /sgm$/) { $row .= sprintf("<B>%.04f</B>",$nist_bleu); if ($in{SORT} eq 'BLEU') { $sort = $nist_bleu; } } $row .= "</TD>\n"; } } # multi-bleu if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e") { $row .= "<TD>"; if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e") { my ($score,$p1,$p2,$p3,$p4,$bp) = get_multi_bleu_score("$in{CORPUS}.f","$in{CORPUS}.e","$in{CORPUS}.$file"); $row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f",$score,$p1,$p2,$p3,$p4,$bp); if ($in{SORT} eq 'mBLEU') { $sort = $score; } } $row .= "</TD>\n"; } my $isSystemOutput = ($file ne 
'e' && $file ne 'f' && $file !~ /^pt/); # misc stats (note the unknown words should come first so the total word count is available for WER) $row .= "<TD align=\"center\">"; if($file eq 'f') #input { try { my ($unknownCount, $totalCount) = calc_unknown_words($corpus, 'surf'); $row .= sprintf("%.4lf (%d / %d)", $unknownCount / $totalCount, $unknownCount, $totalCount); } catch Error::Simple with {$row .= "[system error]";}; } $row .= "</TD>\n<TD align=\"center\">"; if($file eq 'e' || $file eq 'f' || $isSystemOutput) { try { my $perplexity = $corpus->calcPerplexity(($file eq 'e') ? 'truth' : (($file eq 'f') ? 'input' : $file), 'surf'); $row .= sprintf("%.2lf", $perplexity); } catch Error::Simple with {$row .= "[system error]";} } $row .= "</TD>\n<TD align=\"center\">"; if($isSystemOutput) { try { my $surfaceWER = $corpus->calcOverallWER($file); $row .= sprintf("%.4lf", $surfaceWER); } catch Error::Simple with {$row .= "[system error]";}; } $row .= "</TD>\n<TD align=\"center\">"; my ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER); if($isSystemOutput) { try { ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER) = calc_misc_stats($corpus, $file); $row .= sprintf("WER = %.4lg<br>PWER = %.4lg<br><b>ratio = %.3lf</b>", $nnAdjWER, $nnAdjPWER, $nnAdjPWER / $nnAdjWER); } catch Error::Simple with {$row .= "[system error]";}; } $row .= "</TD>\n<TD align=\"center\">"; if($isSystemOutput) { if($surfPWER == -1) { $row .= "[system error]"; } else { my ($lemmaBLEU, $p1, $p2, $p3, $p4, $brevity) = $corpus->calcBLEU($file, 'lemma'); $row .= sprintf("surface = %.3lf<br>lemma = %.3lf<br><b>lemma BLEU = %.04f</b> %.01f/%.01f/%.01f/%.01f *%.03f", $surfPWER, $lemmaPWER, $lemmaBLEU, $p1, $p2, $p3, $p4, $brevity); } } $row .= "</TD>\n<TD align=\"center\">"; if($isSystemOutput) { try { my $testInfo = $corpus->statisticallyTestBLEUResults($file, 'surf'); my @tTestPValues = @{$testInfo->[0]}; my @confidenceIntervals = @{$testInfo->[1]}; $row .= "n-gram precision p-values (high p <=> consistent 
score):<br>t test " . join("/", map {sprintf("%.4lf", $_)} @tTestPValues); $row .= "<p>n-gram precision 95% intervals:<br>" . join(",<br>", map {sprintf("[%.4lf - %.4lf]", $_->[0], $_->[1])} @confidenceIntervals); my @bleuInterval = (approxBLEUFromNgramScores(map {$_->[0]} @confidenceIntervals), approxBLEUFromNgramScores(map {$_->[1]} @confidenceIntervals)); $row .= sprintf("<br><b>(BLEU: ~[%.4lf - %.4lf])</b>", $bleuInterval[0], $bleuInterval[1]); } catch Error::Simple with {$row .= "[system error]";} } $row .= "</TD>\n"; # correct sentence score my($correct,$wrong,$unknown); $row .= "<TD>"; if (!defined($DONTSCORE{$file}) && (scalar keys %MEMORY)) { my ($correct,$just_syn,$just_sem,$wrong,$unknown) = get_score_from_memory("$in{CORPUS}.$FOREIGN", "$in{CORPUS}.$file"); $row .= "<B><FONT COLOR=GREEN>$correct</FONT></B>"; $row .= "/<FONT COLOR=ORANGE>$just_syn</FONT>"; $row .= "/<FONT COLOR=ORANGE>$just_sem</FONT>"; $row .= "/<FONT COLOR=RED>$wrong</FONT> ($unknown)</TD>\n"; if ($in{SORT} eq 'SCORE') { $sort = sprintf("%03d %04d",$correct,$just_syn+$just_sem); } } else { $row .= "</TD>\n"; } $row .= "</TR>\n"; push @TABLE, "<!-- $sort -->\n$row"; } close(DIR); foreach (reverse sort @TABLE) { print $_; } print "</TABLE>\n"; print "<INPUT TYPE=SUBMIT VALUE=\"Compare\">\n"; print "<INPUT TYPE=CHECKBOX NAME=SURFACE VALUE=1 CHECKED> Compare all different sentences (instead of just differently <I>evaluated</I> sentences) <INPUT TYPE=CHECKBOX NAME=WITH_EVAL VALUE=1 CHECKED> with evaluation</FORM><P>\n"; print "<P>The score is to be read as: <FONT COLOR=GREEN>correct</FONT>/<FONT COLOR=ORANGE>just-syn-correct</FONT>/<FONT COLOR=ORANGE>just-sem-correct</FONT>/<FONT COLOR=RED>wrong</FONT> (unscored)\n"; print "<BR>IBM BLEU is to be read as: <B>metric</B> unigram/bigram/trigram/quadgram *brevity-penalty<P>";
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -