📄 newsmtgui.cgi
字号:
print "<BR>IBM BLEU is to be read as: <B>metric</B> unigram/bigram/trigram/quadgram *brevity-penalty<P>";
print "<DIV STYLE=\"border: 1px solid #006600\">";
print "<H2>Comparison of System Translations (p-values)</H2>";
my @sysnames = $corpus->getSystemNames();
for(my $i = 0; $i < scalar(@sysnames); $i++)
{
for(my $j = $i + 1; $j < scalar(@sysnames); $j++)
{
my $comparison = $corpus->statisticallyCompareSystemResults($sysnames[$i], $sysnames[$j], 'surf');
print "<P><FONT COLOR=#00aa22>" . $sysnames[$i] . " vs. " . $sysnames[$j] . "</FONT>: [<I>t</I> test] ";
for(my $k = 0; $k < scalar(@{$comparison->[0]}); $k++)
{
print sprintf(($k == 0) ? "%.4lg" : "; %.4lg ", $comparison->[0]->[$k]);
if($comparison->[1]->[$k] == 0) {print "(←)";} else {print "(→)";}
}
print " --- [sign test] ";
for(my $k = 0; $k < scalar(@{$comparison->[2]}); $k++)
{
print sprintf(($k == 0) ? "%.4lg " : "; %.4lg ", $comparison->[2]->[$k]);
if($comparison->[3]->[$k] == 0) {print "(←)";} else {print "(→)";}
}
print "\n";
}
}
print "</DIV\n";
print "<P><A HREF=\"newsmtgui.cgi?action=\">All corpora</A>\n";
}
###### SCORE TRANSLATIONS
sub score_file {
if ($in{VIEW}) {
&htmlhead("View Translations");
}
else {
&htmlhead("Score Translations");
}
print "<A HREF=\"?ACTION=VIEW_CORPUS&CORPUS=".CGI::escape($in{CORPUS})."\">View Corpus $in{CORPUS}</A><P>\n";
print "<FORM ACTION=\"\" METHOD=POST>\n";
print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=$in{ACTION}>\n";
print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
print "<INPUT TYPE=HIDDEN NAME=FILE VALUE=\"$in{FILE}\">\n";
# get sentences
my @SENTENCES;
if ($in{FILE} =~ /.sgm$/) {
@SENTENCES = `grep '<seg' $in{CORPUS}.$in{FILE}`;
for(my $i=0;$i<$#SENTENCES;$i++) {
$SENTENCES[$i] =~ s/^<seg[^>]+> *(\S.+\S) *<\/seg> *$/$1/;
}
}
else {
@SENTENCES = `cat $in{CORPUS}.$in{FILE}`; chop(@SENTENCES);
}
my %REFERENCE;
foreach (@SHOW) {
if (-e "$in{CORPUS}.$_") {
@{$REFERENCE{$_}} = `cat $in{CORPUS}.$_`; chop(@{$REFERENCE{$_}});
}
}
# update memory
foreach (keys %in) {
next unless /^SYN_SCORE_(\d+)$/;
next unless $in{"SEM_SCORE_$1"};
&store_in_memory($REFERENCE{$FOREIGN}[$1],
$SENTENCES[$1],
"syn_".$in{"SYN_SCORE_$1"}." sem_".$in{"SEM_SCORE_$1"});
}
# display sentences
for(my $i=0;$i<=$#SENTENCES;$i++) {
my $evaluation = &get_from_memory($REFERENCE{$FOREIGN}[$i],$SENTENCES[$i]);
next if ($in{ACTION} eq 'SCORE_FILE' &&
! $in{VIEW} &&
$evaluation ne '' && $evaluation ne 'wrong');
print "<P>Sentence ".($i+1).":<BR>\n";
# color coding
&color_highlight_ngrams($i,&nist_normalize_text($SENTENCES[$i]),$REFERENCE{"e"}[$i]);
if (%MULTI_REF) {
foreach my $sysid (keys %MULTI_REF) {
print "<FONT COLOR=GREEN>".$MULTI_REF{$sysid}[$i]."</FONT> (Reference $sysid)<BR>\n";
}
}
# all sentences
print "$SENTENCES[$i] (System output)<BR>\n";
foreach my $ref (@SHOW) {
if (-e "$in{CORPUS}.$ref") {
print "<FONT COLOR=$SHOW_COLOR{$ref}>".$REFERENCE{$ref}[$i]."</FONT> (".$FILETYPE{$ref}.")<BR>\n" if $REFERENCE{$ref}[$i];
}
}
if (! $in{VIEW}) {
print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=correct";
print " CHECKED" if ($evaluation =~ /syn_correct/);
print "> perfect English\n";
print "<INPUT TYPE=RADIO NAME=SYN_SCORE_$i VALUE=wrong";
print " CHECKED" if ($evaluation =~ /syn_wrong/);
print "> imperfect English<BR>\n";
print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=correct";
print " CHECKED" if ($evaluation =~ /sem_correct/);
print "> correct meaning\n";
print "<INPUT TYPE=RADIO NAME=SEM_SCORE_$i VALUE=wrong";
print " CHECKED" if ($evaluation =~ /sem_wrong/);
print "> incorrect meaning\n";
}
}
if (! $in{VIEW}) {
print "<P><INPUT TYPE=SUBMIT VALUE=\"Add evaluation\">\n";
print "</FORM>\n";
}
}
sub color_highlight_ngrams {
my($i,$sentence,$single_reference) = @_;
my @REF = ();
my %NGRAM = ();
if (%MULTI_REF) {
foreach my $sysid (keys %MULTI_REF) {
push @REF,&nist_normalize_text($MULTI_REF{$sysid}[$i]);
}
}
elsif ($single_reference) {
@REF = ($single_reference);
}
if (@REF) {
foreach my $ref (@REF) {
my @WORD = split(/\s+/,$ref);
for(my $n=1;$n<=4;$n++) {
for(my $w=0;$w<=$#WORD-($n-1);$w++) {
my $ngram = "$n: ";
for(my $j=0;$j<$n;$j++) {
$ngram .= $WORD[$w+$j]." ";
}
$NGRAM{$ngram}++;
}
}
}
$sentence =~ s/^\s+//;
$sentence =~ s/\s+/ /;
$sentence =~ s/\s+$//;
my @WORD = split(/\s+/,$sentence);
my @CORRECT;
for(my $w=0;$w<=$#WORD;$w++) {
$CORRECT[$w] = 0;
}
for(my $n=1;$n<=4;$n++) {
for(my $w=0;$w<=$#WORD-($n-1);$w++) {
my $ngram = "$n: ";
for(my $j=0;$j<$n;$j++) {
$ngram .= $WORD[$w+$j]." ";
}
next unless defined($NGRAM{$ngram}) && $NGRAM{$ngram}>0;
$NGRAM{$ngram}--;
for(my $j=0;$j<$n;$j++) {
$CORRECT[$w+$j] = $n;
}
}
}
my @COLOR;
$COLOR[0] = "#FF0000";
$COLOR[1] = "#C000C0";
$COLOR[2] = "#0000FF";
$COLOR[3] = "#00C0C0";
$COLOR[4] = "#00C000";
for(my $w=0;$w<=$#WORD;$w++) {
print "<B><FONT COLOR=".$COLOR[$CORRECT[$w]].">$WORD[$w]<SUB>".$CORRECT[$w]."</SUB></FONT></B> ";
}
print "\n<BR>";
}
}
###### OTHER STATS
#print (in some unspecified way) the offending exception of type Error::Simple
#arguments: the error object, a context string
#return: none
sub printError
{
my ($err, $context) = @_;
warn "$context: " . $err->{'-text'} . " @ " . $err->{'-file'} . " (" .$err->{'-line'} . ")\n";
}
#compute number and percentage of unknown tokens for a given factor in foreign corpus
#arguments: corpus object ref, factor name
#return (unkwordCount, totalWordCount), or (-1, -1) if an error occurs
sub calc_unknown_words
{
my ($corpus, $factorName) = @_;
try
{
my ($unknownCount, $totalCount) = $corpus->calcUnknownTokens($factorName);
return ($unknownCount, $totalCount);
}
catch Error::Simple with
{
my $err = shift;
printError($err, 'calc_unknown_words()');
return (-1, -1);
};
}
#compute (if we have the necessary factors) info for:
#- diff btwn wer and pwer for NNs & ADJs -- if large, many reordering errors
#- diff btwn pwer for surface forms and pwer for lemmas -- if large, morphology errors
#arguments: corpus object, system name
#return (NN/ADJ (wer, pwer), surf pwer, lemma pwer), or (-1, -1, -1, -1) if an error occurs
sub calc_misc_stats
{
my ($corpus, $sysname) = @_;
try
{
my ($nnAdjWER, $nnAdjPWER) = $corpus->calcNounAdjWER_PWERDiff($sysname);
my ($surfPWER, $lemmaPWER) = ($corpus->calcOverallPWER($sysname, 'surf'), $corpus->calcOverallPWER($sysname, 'lemma'));
return ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER);
}
catch Error::Simple with
{
my $err = shift;
printError($err, 'calc_misc_stats()');
return (-1, -1, -1, -1);
};
}
#approximate BLEU score from n-gram precisions (currently assume no length penalty)
#arguments: n-gram precisions as an array
#return: BLEU score
sub approxBLEUFromNgramScores
{
my $logsum = 0;
foreach my $p (@_) {$logsum += log($p);}
return exp($logsum / scalar(@_));
}
###### NIST SCORE
sub get_nist_score {
my($reference_file,$source_file,$translation_file) = @_;
my @STAT = stat($translation_file);
my $current_timestamp = $STAT[9];
foreach (@NIST) {
my ($file,$time,$nist,$bleu) = split;
return ($nist,$bleu)
if ($file eq $translation_file && $current_timestamp == $time);
}
my $nist_eval = `/home/pkoehn/statmt/bin/mteval-v10.pl -c -r $reference_file -s $source_file -t $translation_file`;
return (0,0) unless ($nist_eval =~ /NIST score = (\d+\.\d+) BLEU score = (\d+\.\d+)/i);
open(NIST,">>nist-memory.dat");
printf NIST "$translation_file $current_timestamp %f %f\n",$1,$2;
close(NIST);
return ($1,$2);
}
sub nist_normalize_text {
my ($norm_text) = @_;
# language-independent part:
$norm_text =~ s/<skipped>//g; # strip "skipped" tags
$norm_text =~ s/-\n//g; # strip end-of-line hyphenation and join lines
$norm_text =~ s/\n/ /g; # join lines
$norm_text =~ s/(\d)\s+(\d)/$1$2/g; #join digits
$norm_text =~ s/"/"/g; # convert SGML tag for quote to "
$norm_text =~ s/&/&/g; # convert SGML tag for ampersand to &
$norm_text =~ s/</</g; # convert SGML tag for less-than to >
$norm_text =~ s/>/>/g; # convert SGML tag for greater-than to <
# language-dependent part (assuming Western languages):
$norm_text = " $norm_text ";
# $norm_text =~ tr/[A-Z]/[a-z]/ unless $preserve_case;
$norm_text =~ s/([\{-\~\[-\` -\&\(-\+\:-\@\/])/ $1 /g; # tokenize punctuation
$norm_text =~ s/([^0-9])([\.,])/$1 $2 /g; # tokenize period and comma unless preceded by a digit
$norm_text =~ s/([\.,])([^0-9])/ $1 $2/g; # tokenize period and comma unless followed by a digit
$norm_text =~ s/([0-9])(-)/$1 $2 /g; # tokenize dash when preceded by a digit
$norm_text =~ s/\s+/ /g; # one space only between words
$norm_text =~ s/^\s+//; # no leading space
$norm_text =~ s/\s+$//; # no trailing space
return $norm_text;
}
###### BLEU SCORE
sub get_multi_bleu_score {
my($foreign_file,$reference_file,$translation_file) = @_;
my @STAT = stat($translation_file);
my $current_timestamp = $STAT[9];
foreach (@mBLEU) {
my ($file,$time,$score,$g1,$g2,$g3,$g4,$bp) = split;
if ($file eq $translation_file && $current_timestamp == $time) {
return ($score,$g1*100,$g2*100,$g3*100,$g4*100,$bp);
}
}
# load reference translation from reference file
my @REFERENCE_SENTENCE = `cat $reference_file`; chop(@REFERENCE_SENTENCE);
my @TRANSLATION_SENTENCE = `cat $translation_file`; chop(@TRANSLATION_SENTENCE);
my %REF;
my @FOREIGN_SENTENCE = `cat $foreign_file`; chop(@FOREIGN_SENTENCE);
for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) {
push @{$REF{$FOREIGN_SENTENCE[$i]}},$REFERENCE_SENTENCE[$i];
}
# load reference translation from translation memory
foreach my $memory (keys %MEMORY) {
next if $MEMORY{$memory} ne 'syn_correct sem_correct';
my ($foreign,$english) = split(/ .o0O0o. /,$memory);
next unless defined($REF{$foreign});
push @{$REF{$foreign}},$english;
}
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
# compute bleu
for(my $i=0;$i<=$#TRANSLATION_SENTENCE;$i++) {
my %REF_NGRAM = ();
my @WORD = split(/ /,$TRANSLATION_SENTENCE[$i]);
my $length_translation_this_sentence = scalar(@WORD);
my ($closest_diff,$closest_length) = (9999,9999);
foreach my $reference (@{$REF{$FOREIGN_SENTENCE[$i]}}) {
my @WORD = split(/ /,$reference);
my $length = scalar(@WORD);
if (abs($length_translation_this_sentence-$length) < $closest_diff) {
$closest_diff = abs($length_translation_this_sentence-$length);
$closest_length = $length;
}
for(my $n=1;$n<=4;$n++) {
my %REF_NGRAM_N = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$REF_NGRAM_N{$ngram}++;
}
foreach my $ngram (keys %REF_NGRAM_N) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -