📄 newsmtgui.cgi.svn-base
字号:
print "<DIV STYLE=\"border: 1px solid #006600\">"; print "<H2>Comparison of System Translations (p-values)</H2>"; my @sysnames = $corpus->getSystemNames(); for(my $i = 0; $i < scalar(@sysnames); $i++) { for(my $j = $i + 1; $j < scalar(@sysnames); $j++) { my $comparison = $corpus->statisticallyCompareSystemResults($sysnames[$i], $sysnames[$j], 'surf'); print "<P><FONT COLOR=#00aa22>" . $sysnames[$i] . " vs. " . $sysnames[$j] . "</FONT>: [<I>t</I> test] "; for(my $k = 0; $k < scalar(@{$comparison->[0]}); $k++) { print sprintf(($k == 0) ? "%.4lg" : "; %.4lg ", $comparison->[0]->[$k]); if($comparison->[1]->[$k] == 0) {print "(←)";} else {print "(→)";} } print " --- [sign test] "; for(my $k = 0; $k < scalar(@{$comparison->[2]}); $k++) { print sprintf(($k == 0) ? "%.4lg " : "; %.4lg ", $comparison->[2]->[$k]); if($comparison->[3]->[$k] == 0) {print "(←)";} else {print "(→)";} } print "\n"; } } print "</DIV\n"; print "<P><A HREF=\"newsmtgui.cgi?action=\">All corpora</A>\n";}###### SCORE TRANSLATIONSsub score_file { if ($in{VIEW}) { &htmlhead("View Translations"); } else { &htmlhead("Score Translations"); } print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n"; print "<FORM ACTION=\"\" METHOD=POST>\n"; print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n"; print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n"; print "<INPUT TYPE=HIDDEN NAME=FILE VALUE=\"$in{FILE}\">\n"; # get sentences my @SENTENCES; if ($in{FILE} =~ /.sgm$/) { @SENTENCES = `grep '<seg' $in{CORPUS}.$in{FILE}`; for(my $i=0;$i<$#SENTENCES;$i++) { $SENTENCES[$i] =~ s/^<seg[^>]+> *(\S.+\S) *<\/seg> *$/$1/; } } else { @SENTENCES = `cat $in{CORPUS}.$in{FILE}`; chop(@SENTENCES); } my %REFERENCE; foreach (@SHOW) { if (-e "$in{CORPUS}.$_") { @{$REFERENCE{$_}} = `cat $in{CORPUS}.$_`; chop(@{$REFERENCE{$_}}); } } # update memory foreach (keys %in) { next unless /^SYN_SCORE_(\d+)$/; next unless $in{"SEM_SCORE_$1"}; &store_in_memory($REFERENCE{$FOREIGN}[$1], $SENTENCES[$1], "syn_".$in{"SYN_SCORE_$1"}." sem_".$in{"SEM_SCORE_$1"}); } # display sentences for(my $i=0;$i<=$#SENTENCES;$i++) { my $evaluation = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES[$i]); next if ($in{ACTION} eq 'SCORE_FILE' && ! $in{VIEW} && $evaluation ne '' && $evaluation ne 'wrong'); print "<P>Sentence ".($i+1).":<BR>\n"; # color coding &color_highlight_ngrams($i,&nist_normalize_text($SENTENCES[$i]),$REFERENCE{"e"}[$i]); if (%MULTI_REF) { foreach my $sysid (keys %MULTI_REF) { print "<FONT COLOR=GREEN>".$MULTI_REF{$sysid}[$i]."</FONT> (Reference $sysid)<BR>\n"; } } # all sentences print "$SENTENCES[$i] (System output)<BR>\n"; foreach my $ref (@SHOW) { if (-e "$in{CORPUS}.$ref") { print "<FONT COLOR=$SHOW_COLOR{$ref}>".$REFERENCE{$ref}[$i]."</FONT> (".$FILETYPE{$ref}.")<BR>\n" if $REFERENCE{$ref}[$i]; } } if (! $in{VIEW}) { print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=correct"; print " CHECKED" if ($evaluation =~ /syn_correct/); print "> perfect English\n"; print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=wrong"; print " CHECKED" if ($evaluation =~ /syn_wrong/); print "> imperfect English<BR>\n"; print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=correct"; print " CHECKED" if ($evaluation =~ /sem_correct/); print "> correct meaning\n"; print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=wrong"; print " CHECKED" if ($evaluation =~ /sem_wrong/); print "> incorrect meaning\n"; } } if (! $in{VIEW}) { print "<P><INPUT TYPE=SUBMIT VALUE=\"Add evaluation\">\n"; print "</FORM>\n"; }}sub color_highlight_ngrams { my($i,$sentence,$single_reference) = @_; my @REF = (); my %NGRAM = (); if (%MULTI_REF) { foreach my $sysid (keys %MULTI_REF) { push @REF,&nist_normalize_text($MULTI_REF{$sysid}[$i]); } } elsif ($single_reference) { @REF = ($single_reference); } if (@REF) { foreach my $ref (@REF) { my @WORD = split(/\s+/,$ref); for(my $n=1;$n<=4;$n++) { for(my $w=0;$w<=$#WORD-($n-1);$w++) { my $ngram = "$n: "; for(my $j=0;$j<$n;$j++) { $ngram .= $WORD[$w+$j]." "; } $NGRAM{$ngram}++; } } } $sentence =~ s/^\s+//; $sentence =~ s/\s+/ /; $sentence =~ s/\s+$//; my @WORD = split(/\s+/,$sentence); my @CORRECT; for(my $w=0;$w<=$#WORD;$w++) { $CORRECT[$w] = 0; } for(my $n=1;$n<=4;$n++) { for(my $w=0;$w<=$#WORD-($n-1);$w++) { my $ngram = "$n: "; for(my $j=0;$j<$n;$j++) { $ngram .= $WORD[$w+$j]." "; } next unless defined($NGRAM{$ngram}) && $NGRAM{$ngram}>0; $NGRAM{$ngram}--; for(my $j=0;$j<$n;$j++) { $CORRECT[$w+$j] = $n; } } } my @COLOR; $COLOR[0] = "#FF0000"; $COLOR[1] = "#C000C0"; $COLOR[2] = "#0000FF"; $COLOR[3] = "#00C0C0"; $COLOR[4] = "#00C000"; for(my $w=0;$w<=$#WORD;$w++) { print "<B><FONT COLOR=".$COLOR[$CORRECT[$w]].">$WORD[$w]<SUB>".$CORRECT[$w]."</SUB></FONT></B> "; } print "\n<BR>"; }}###### OTHER STATS#print (in some unspecified way) the offending exception of type Error::Simple#arguments: the error object, a context string#return: nonesub printError{ my ($err, $context) = @_; warn "$context: " . $err->{'-text'} . " @ " . $err->{'-file'} . " (" .$err->{'-line'} . ")\n";}#compute number and percentage of unknown tokens for a given factor in foreign corpus#arguments: corpus object ref, factor name#return (unkwordCount, totalWordCount), or (-1, -1) if an error occurssub calc_unknown_words{ my ($corpus, $factorName) = @_; try { my ($unknownCount, $totalCount) = $corpus->calcUnknownTokens($factorName); return ($unknownCount, $totalCount); } catch Error::Simple with { my $err = shift; printError($err, 'calc_unknown_words()'); return (-1, -1); };}#compute (if we have the necessary factors) info for:#- diff btwn wer and pwer for NNs & ADJs -- if large, many reordering errors#- diff btwn pwer for surface forms and pwer for lemmas -- if large, morphology errors#arguments: corpus object, system name#return (NN/ADJ (wer, pwer), surf pwer, lemma pwer), or (-1, -1, -1, -1) if an error occurssub calc_misc_stats{ my ($corpus, $sysname) = @_; try { my ($nnAdjWER, $nnAdjPWER) = $corpus->calcNounAdjWER_PWERDiff($sysname); my ($surfPWER, $lemmaPWER) = ($corpus->calcOverallPWER($sysname, 'surf'), $corpus->calcOverallPWER($sysname, 'lemma')); return ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER); } catch Error::Simple with { my $err = shift; printError($err, 'calc_misc_stats()'); return (-1, -1, -1, -1); };}#approximate BLEU score from n-gram precisions (currently assume no length penalty)#arguments: n-gram precisions as an array#return: BLEU scoresub approxBLEUFromNgramScores{ my $logsum = 0; foreach my $p (@_) {$logsum += log($p);} return exp($logsum / scalar(@_));}###### NIST SCOREsub get_nist_score { my($reference_file,$source_file,$translation_file) = @_; my @STAT = stat($translation_file); my $current_timestamp = $STAT[9]; foreach (@NIST) { my ($file,$time,$nist,$bleu) = split; return ($nist,$bleu) if ($file eq $translation_file && $current_timestamp == $time); } my $nist_eval = `/home/pkoehn/statmt/bin/mteval-v10.pl -c -r $reference_file -s $source_file -t $translation_file`; return (0,0) unless ($nist_eval =~ /NIST score = (\d+\.\d+) BLEU score = (\d+\.\d+)/i); open(NIST,">>nist-memory.dat"); printf NIST "$translation_file $current_timestamp %f %f\n",$1,$2; close(NIST); return ($1,$2);}sub nist_normalize_text { my ($norm_text) = @_;# language-independent part: $norm_text =~ s/<skipped>//g; # strip "skipped" tags $norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines $norm_text =~ s/\n/ /g; # join lines $norm_text =~ s/(\d)\s+(\d)/$1$2/g; #join digits $norm_text =~ s/"/"/g; # convert SGML tag for quote to " $norm_text =~ s/&/&/g; # convert SGML tag for ampersand to & $norm_text =~ s/</</g; # convert SGML tag for less-than to > $norm_text =~ s/>/>/g; # convert SGML tag for greater-than to <# language-dependent part (assuming Western languages): $norm_text = " $norm_text ";# $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case; $norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation $norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit $norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit $norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit $norm_text =~ s/\s+/ /g; # one space only between words $norm_text =~ s/^\s+//; # no leading space $norm_text =~ s/\s+$//; # no trailing space return $norm_text;}###### BLEU SCOREsub get_multi_bleu_score { my($foreign_file,$reference_file,$translation_file) = @_; my @STAT = stat($translation_file); my $current_timestamp = $STAT[9]; foreach (@mBLEU) { my ($file,$time,$score,$g1,$g2,$g3,$g4,$bp) = split; if ($file eq $translation_file && $current_timestamp == $time) { return ($score,$g1*100,$g2*100,$g3*100,$g4*100,$bp); } } # load reference translation from reference file my @REFERENCE_SENTENCE = `cat $reference_file`; chop(@REFERENCE_SENTENCE); my @TRANSLATION_SENTENCE = `cat $translation_file`; chop(@TRANSLATION_SENTENCE); my %REF; my @FOREIGN_SENTENCE = `cat $foreign_file`; chop(@FOREIGN_SENTENCE); for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) { push @{$REF{$FOREIGN_SENTENCE[$i]}},$REFERENCE_SENTENCE[$i]; } # load reference translation from translation memory foreach my $memory (keys %MEMORY) { next if $MEMORY{$memory} ne 'syn_correct sem_correct'; my ($foreign,$english) = split(/ .o0O0o. /,$memory); next unless defined($REF{$foreign}); push @{$REF{$foreign}},$english; } my(@CORRECT,@TOTAL,$length_translation,$length_reference); # compute bleu for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) { my %REF_NGRAM = (); my @WORD = split(/ /,$TRANSLATION_SENTENCE[$i]); my $length_translation_this_sentence = scalar(@WORD); my ($closest_diff,$closest_length) = (9999,9999); foreach my $reference (@{$REF{$FOREIGN_SENTENCE[$i]}}) { my @WORD = split(/ /,$reference); my $length = scalar(@WORD); if (abs($length_translation_this_sentence-$length) < $closest_diff) { $closest_diff = abs($length_translation_this_sentence-$length); $closest_length = $length; } for(my $n=1;$n<=4;$n++) { my %REF_NGRAM_N = (); for(my $start=0;$start<=$#WORD-($n-1);$start++) { my $ngram = "$n"; for(my $w=0;$w<$n;$w++) { $ngram .= " ".$WORD[$start+$w]; } $REF_NGRAM_N{$ngram}++; } foreach my $ngram (keys %REF_NGRAM_N) { if (!defined($REF_NGRAM{$ngram}) ||
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -