⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 newsmtgui.cgi

📁 moses开源的机器翻译系统
💻 CGI
📖 第 1 页 / 共 3 页
字号:
#!/usr/bin/perl -w

# $Id: newsmtgui.cgi 1307 2007-03-14 22:22:36Z hieuhoang1972 $
use strict;

use CGI;
use Corpus; #Evan's code
use Error qw(:try);

# Extension tables. A file whose extension is NOT a key of %FILETYPE is
# interpreted as a system translation; the comments that go with such files
# come from the file 'file-descriptions', if it exists.
my %FILETYPE = (
  'e'       => 'Reference Translation',
  'f'       => 'Foreign Original',
  'ref.sgm' => 'Reference Translations',
  'e.sgm'   => 'Reference Translations',
  'src.sgm' => 'Foreign Originals',
  'f.sgm'   => 'Foreign Originals',
);

# Extensions that are never scored (the source and reference files themselves).
my %DONTSCORE = map { ($_ => 1) } qw(f f.sgm src.sgm e e.sgm ref.sgm);

# NOTE(review): @SHOW and %SHOW_COLOR are not used in the visible part of the
# file; presumably they select and colour the columns of a sentence view.
my @SHOW       = qw(f e comm);
my %SHOW_COLOR = (
  'f' => "BLUE",
  'e' => "GREEN",
);

# Extension of the foreign (source-language) file of a corpus.
my $FOREIGN = 'f';

#FILEDESC: textual descriptions associated with specific filenames; to be displayed on the single-corpus view
# (presumably populated by load_descriptions(), which is defined elsewhere in this file)
my %FILEDESC = (); &load_descriptions();
# per-corpus factor information, keyed by corpus name (see show_corpora / view_corpus)
my %factorData = loadFactorData('file-factors'); 
# NOTE(review): %MEMORY is presumably filled by load_memory() with stored sentence judgements -- confirm
my %MEMORY;        &load_memory();

# Cached score lines, one array element per line of the cache file. A missing
# cache file simply leaves the corresponding array empty.
my (@mBLEU,@NIST);
foreach my $cache (["mbleu-memory.dat", \@mBLEU], ["nist-memory.dat", \@NIST]) {
  my ($cacheFile, $lines) = @$cache;
  next unless -e $cacheFile;
  # read the file directly instead of shelling out to `cat`
  if (open(my $cacheFh, '<', $cacheFile)) {
    @$lines = <$cacheFh>;
    close($cacheFh);
    # chomp, not chop: chop would eat the last character of a final line
    # that has no trailing newline
    chomp(@$lines);
  }
  else {
    warn "couldn't open '$cacheFile' for read: $!\n";
  }
}

my %in;            &ReadParse(); #parse arguments

# Command-line shortcut: when the first argument is 'bleu', act as if a
# VIEW_CORPUS request had been made for the corpus named by the second argument.
if (@ARGV && $ARGV[0] eq 'bleu') {
  @in{'CORPUS', 'ACTION'} = ($ARGV[1], "VIEW_CORPUS");
}

# %MULTI_REF: for each reference set (keyed by the sysid attribute of its <DOC>
# tag in <corpus>.ref.sgm), the list of segment texts in file order.
# Left empty when no corpus is selected or the corpus has no .ref.sgm file.
my %MULTI_REF;
if ($in{CORPUS} && -e "$in{CORPUS}.ref.sgm") {
  my $refFile = "$in{CORPUS}.ref.sgm";
  # segments seen before any <DOC sysid=...> tag are filed under the empty key
  my $sysid = '';
  # three-argument open with a lexical handle: the corpus name comes from the
  # request, so it must never be interpreted as an open mode or a pipe
  if (open(my $refFh, '<', $refFile)) {
    while (my $line = <$refFh>) {
      $sysid = $1 if $line =~ /<DOC.+sysid=\"([^\"]+)\"/;
      # capture the segment text without surrounding blanks; one- and
      # two-character segments must be kept too, otherwise every following
      # reference sentence would be shifted against the other files
      if ($line =~ /<seg[^>]*> *(\S(?:.*\S)?) *<\/seg>/) {
        push @{$MULTI_REF{$sysid}}, $1;
      }
    }
    close($refFh);
  }
  else {
    warn "couldn't open '$refFile' for read: $!\n";
  }
}

# Dispatch on the requested ACTION, then close the HTML document.
# A request without an ACTION parameter shows the list of corpora.
{
  # treat a missing ACTION like an empty one (avoids an 'uninitialized value'
  # warning under -w on the plain front-page request)
  my $action = defined($in{ACTION}) ? $in{ACTION} : '';
  my %handlerFor = (
    ''             => \&show_corpora,
    'VIEW_CORPUS'  => \&view_corpus,
    'SCORE_FILE'   => \&score_file,
    'RESCORE_FILE' => \&score_file,   # rescoring goes through the same handler
    'COMPARE'      => \&compare,
  );
  if (exists $handlerFor{$action}) {
    $handlerFor{$action}->();
  }
  else {
    # NOTE(review): $action is request-supplied; confirm that htmlhead()
    # HTML-escapes its argument before printing it.
    &htmlhead("Unknown Action $action");
  }
}
print "</BODY></HTML>\n";

###### SHOW CORPORA IN EVALUATION DIRECTORY

sub show_corpora {
  # The corpora in the evaluation directory are exactly the keys of the
  # factor-index data, which was already read into %factorData.
  my @corpusNames = sort keys %factorData;

  # Print the page header followed by one link per corpus, in sorted order;
  # each link leads to that corpus's VIEW_CORPUS page.
  &htmlhead("All Corpora");
  print "<UL>\n";
  for my $corpusName (@corpusNames) {
    my $href = "?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($corpusName);
    print "<LI><A HREF=\"$href\">Corpus $corpusName</A>\n";
  }
  print "</UL>\n";
}

###### SHOW INFORMATION FOR ONE CORPUS

sub view_corpus {
  my @TABLE;
  &htmlhead("View Corpus $in{CORPUS}");
  
  # find corpora in evaluation directory
  my $corpus = new Corpus('-name' => "$in{CORPUS}", '-descriptions' => \%FILEDESC, '-info_line' => $factorData{$in{CORPUS}});
#  $corpus->printDetails(); #debugging info
  
  my ($sentence_count, $lineInfo);
  if(-e "$in{CORPUS}.f")
  {
  	$lineInfo = `wc -l $in{CORPUS}.f`;
  	$lineInfo =~ /^\s*(\d+)\s+/;
  	$sentence_count = 0 + $1;
	}
	else
	{
	  $lineInfo = `wc -l $in{CORPUS}.e`;
	  $lineInfo =~ /^\s*(\d+)\s+/;
	  $sentence_count = 0 + $1;
	}
  
  print "Corpus '$in{CORPUS}' consists of $sentence_count sentences\n";
  print "(<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&mBLEU=1>with mBLEU</A>)" if ((!defined($in{mBLEU})) && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f");
  print "<P>\n";
  print "<FORM ACTION=''>\n";
  print "<INPUT TYPE=HIDDEN NAME=ACTION VALUE=COMPARE>\n";
  print "<INPUT TYPE=HIDDEN NAME=CORPUS VALUE=\"$in{CORPUS}\">\n";
  print "<TABLE BORDER=1 CELLSPACING=0><TR>
<TD>File (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS}).">sort</A>)</TD>
<TD>Date (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=TIME>sort</A>)</TD>";
  if (-e "$in{CORPUS}.e") {
    print "<TD>IBM BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=IBM>sort</A>)</TD>";
  }
  if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm") {
    print "<TD>NIST (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=NIST>sort</A>)</TD>";
    if (! -e "$in{CORPUS}.e") {
      print "<TD>BLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=BLEU>sort</A>)</TD>";
    }
  }
  if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e" && -e "$in{CORPUS}.f") {
    print "<TD>mBLEU (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=mBLEU>sort</A>)</TD>";
  }
  print "<TD>Unknown Words</TD>"; #can't sort on; only applies to the input
  print "<TD>Perplexity</TD>"; #applies to truth and system outputs
  print "<TD>WER (<A HREF=?ACTION=VIEW_CORPUS&CORPUS=" . CGI::escape($in{CORPUS})."&SORT=WER>sort</A>)</TD>";
  print "<TD>Noun & adj WER-PWER</TD>"; #can't sort on; only applies to sysoutputs
  print "<TD>Surface vs. lemma PWER</TD>"; #can't sort on; only applies to sysoutputs
	print "<TD>Statistical Measures</TD>";

  opendir(DIR, ".") or die "couldn't open '.' for read";
  my @filenames = readdir(DIR); #includes . and ..
  closedir(DIR);
  foreach $_ (@filenames)
  {
  	next if -d $_; #if is a directory
    my $sgm = 0;
    if (/.sgm$/)
	 {
	 	`grep '<seg' $_ | wc -l` =~ /^\s*(\d+)\s+/;
		next unless $1 == $sentence_count;
		$sgm = 1;
    }
    else
	 {
	 	`wc -l $_` =~ /^\s*(\d+)\s+/;
		next unless $1 == $sentence_count;
    }
	 next unless /^$in{CORPUS}\.([^\/]+)$/;
    my $file = $1;
	 my $sort = "";
    # checkbox for compare
    my $row = "<TR><TD style=\"font-size: small\"><INPUT TYPE=CHECKBOX NAME=FILE_$file VALUE=1>";
    # README
    if (-e "$in{CORPUS}.$file.README") {
      my $readme = `cat $in{CORPUS}.$file.README`;
      $readme =~ s/([\"\'])/\\\"/g;
      $readme =~ s/[\n\r]/\\n/g;
      $readme =~ s/\t/\\t/g;
      $row .= "<A HREF='javascript:FieldInfo(\"$in{CORPUS}.$file\",\"$readme\")'>";
    }
    # filename
    $row .= "$file</A>";
    # description (hard-coded)
    my @TRANSLATION_SENTENCE = `cat $in{CORPUS}.$file`; 
    chop(@TRANSLATION_SENTENCE);
    
	 #count sentences that contain null words
	 my $null_count = 0;
    foreach (@TRANSLATION_SENTENCE)
	 {
      $null_count++ if /^NULL$/ || /^NONE$/;
    }
    if ($null_count > 0) { 
      $row .= "$null_count NULL ";
    }

    $row .= " (".$FILETYPE{$file}.")" if defined($FILETYPE{$file});
    $row .= " (".$FILEDESC{$in{CORPUS}.".".$file}.")" if defined($FILEDESC{$in{CORPUS}.".".$file});
    $row .= " (".$FILEDESC{$file}.")" if defined($FILEDESC{$file});
    # filedate
    my @STAT = stat("$in{CORPUS}.$file");
    my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($STAT[8]); #STAT[8] should be last modify time
    my $time = sprintf("%04d-%02d-%02d %02d:%02d:%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec);
    $row .= "</TD>\n<TD>".$time."</TD>\n";
    if (defined($in{SORT}) && $in{SORT} eq 'TIME') { $sort = $time; }
    # IBM BLEU score
    my $no_bleu =0;
    if (!$sgm && -e "$in{CORPUS}.e") {
      $row .= "<TD>";
      if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e" && $file !~ /^pt/) {
	my ($score,$p1,$p2,$p3,$p4,$bp) = $corpus->calcBLEU($file, 'surf');
	print STDERR "193: `$score `$p1 `$p2 `$p3 `$p4 `$bp\n";
	$row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f", $score, $p1, $p2, $p3, $p4, $bp);
	if (defined($in{SORT}) && $in{SORT} eq 'IBM') { $sort = $score; }
      }
      $row .= "</TD>\n";
    }
    else {
      $no_bleu=1;
    }
    # NIST score
    if (-e "$in{CORPUS}.ref.sgm" && -e "$in{CORPUS}.src.sgm" 
	&& !$DONTSCORE{$file}) {    
      $row .= "<TD>";
      print "$DONTSCORE{$file}+";
      my ($nist,$nist_bleu);
      if ($file =~ /sgm$/) {
	($nist,$nist_bleu) = get_nist_score("$in{CORPUS}.ref.sgm","$in{CORPUS}.src.sgm","$in{CORPUS}.$file");
	$row .= sprintf("<B>%.04f</B>",$nist);
	if ($in{SORT} eq 'NIST') { $sort = $nist; }
      }
      $row .= "</TD>\n";
      if ($no_bleu) {
	$row .= "<TD>";
	if ($file =~ /sgm$/) {
	  $row .= sprintf("<B>%.04f</B>",$nist_bleu);
	  if ($in{SORT} eq 'BLEU') { $sort = $nist_bleu; }
	}
	$row .= "</TD>\n";
      }
    }
    # multi-bleu
    if ($in{mBLEU} && (scalar keys %MEMORY) && -e "$in{CORPUS}.e") {
      $row .= "<TD>";
      if (!defined($DONTSCORE{$file}) && $file !~ /^f$/ && $file ne "e") {
	my ($score,$p1,$p2,$p3,$p4,$bp) = get_multi_bleu_score("$in{CORPUS}.f","$in{CORPUS}.e","$in{CORPUS}.$file");
	$row .= sprintf("<B>%.04f</B> %.01f/%.01f/%.01f/%.01f *%.03f",$score,$p1,$p2,$p3,$p4,$bp);
	if ($in{SORT} eq 'mBLEU') { $sort = $score; }
      }
      $row .= "</TD>\n";
    }
	 
	 my $isSystemOutput = ($file ne 'e' && $file ne 'f' && $file !~ /^pt/);
	 # misc stats (note the unknown words should come first so the total word count is available for WER)
	 $row .= "<TD align=\"center\">";
	 if($file eq 'f') #input
	 {
	 	try
		{
			my ($unknownCount, $totalCount) = calc_unknown_words($corpus, 'surf');
	 		$row .= sprintf("%.4lf (%d / %d)", $unknownCount / $totalCount, $unknownCount, $totalCount);
		}
		catch Error::Simple with {$row .= "[system error]";};
	 }
	 $row .= "</TD>\n<TD align=\"center\">";
	 if($file eq 'e' || $file eq 'f' || $isSystemOutput)
	 {
	 	try
		{
			my $perplexity = $corpus->calcPerplexity(($file eq 'e') ? 'truth' : (($file eq 'f') ? 'input' : $file), 'surf');
			$row .= sprintf("%.2lf", $perplexity);
		}
		catch Error::Simple with {$row .= "[system error]";}
	 }
	 $row .= "</TD>\n<TD align=\"center\">";
	 if($isSystemOutput)
	 {
	 	try
		{
			my $surfaceWER = $corpus->calcOverallWER($file);
			$row .= sprintf("%.4lf", $surfaceWER);
		}
		catch Error::Simple with {$row .= "[system error]";};
	 }
	 $row .= "</TD>\n<TD align=\"center\">";
	 my ($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER);
	 if($isSystemOutput)
	 {
		try
		{
			($nnAdjWER, $nnAdjPWER, $surfPWER, $lemmaPWER) = calc_misc_stats($corpus, $file);
			$row .= sprintf("WER = %.4lg<br>PWER = %.4lg<br><b>ratio = %.3lf</b>", $nnAdjWER, $nnAdjPWER, $nnAdjPWER / $nnAdjWER);
		}
		catch Error::Simple with {$row .= "[system error]";};
	}
	$row .= "</TD>\n<TD align=\"center\">";
	if($isSystemOutput)
	{
		if($surfPWER == -1)
		{
			$row .= "[system error]";
		}
		else
		{
			my ($lemmaBLEU, $p1, $p2, $p3, $p4, $brevity) = $corpus->calcBLEU($file, 'lemma');
			$row .= sprintf("surface = %.3lf<br>lemma = %.3lf<br><b>lemma BLEU = %.04f</b> %.01f/%.01f/%.01f/%.01f *%.03f", 
									$surfPWER, $lemmaPWER, $lemmaBLEU, $p1, $p2, $p3, $p4, $brevity);
		}
	}
	$row .= "</TD>\n<TD align=\"center\">";
	if($isSystemOutput)
	{
		try
		{
			my $testInfo = $corpus->statisticallyTestBLEUResults($file, 'surf');
			my @tTestPValues = @{$testInfo->[0]};
			my @confidenceIntervals = @{$testInfo->[1]};
			$row .= "n-gram precision p-values (high p <=> consistent score):<br>t test " . join("/", map {sprintf("%.4lf", $_)} @tTestPValues);
			$row .= "<p>n-gram precision 95% intervals:<br>" . join(",<br>", map {sprintf("[%.4lf - %.4lf]", $_->[0], $_->[1])} @confidenceIntervals);
			my @bleuInterval = (approxBLEUFromNgramScores(map {$_->[0]} @confidenceIntervals), approxBLEUFromNgramScores(map {$_->[1]} @confidenceIntervals));
			$row .= sprintf("<br><b>(BLEU: ~[%.4lf - %.4lf])</b>", $bleuInterval[0], $bleuInterval[1]);
		}
		catch Error::Simple with {$row .= "[system error]";}
	}
	$row .= "</TD>\n";

    # correct sentence score
    my($correct,$wrong,$unknown);
    $row .= "<TD>";
    if (!defined($DONTSCORE{$file}) && (scalar keys %MEMORY)) {
      my ($correct,$just_syn,$just_sem,$wrong,$unknown) = get_score_from_memory("$in{CORPUS}.$FOREIGN",
			       "$in{CORPUS}.$file");
      $row .= "<B><FONT COLOR=GREEN>$correct</FONT></B>";
      $row .= "/<FONT COLOR=ORANGE>$just_syn</FONT>";
      $row .= "/<FONT COLOR=ORANGE>$just_sem</FONT>";
      $row .= "/<FONT COLOR=RED>$wrong</FONT> ($unknown)</TD>\n";
      if ($in{SORT} eq 'SCORE') { 
	$sort = sprintf("%03d %04d",$correct,$just_syn+$just_sem);
      }
    }
	 else
	 {
	 	$row .= "</TD>\n";
	}

    $row .= "</TR>\n"; 
    push @TABLE, "<!-- $sort -->\n$row";
  }
  close(DIR);
  foreach (reverse sort @TABLE) { print $_; }
  print "</TABLE>\n";
  print "<INPUT TYPE=SUBMIT VALUE=\"Compare\">\n";
  print "<INPUT TYPE=CHECKBOX NAME=SURFACE VALUE=1 CHECKED> Compare all different sentences (instead of just differently <I>evaluated</I> sentences) <INPUT TYPE=CHECKBOX NAME=WITH_EVAL VALUE=1 CHECKED> with evaluation</FORM><P>\n";
  print "<P>The score is to be read as: <FONT COLOR=GREEN>correct</FONT>/<FONT COLOR=ORANGE>just-syn-correct</FONT>/<FONT COLOR=ORANGE>just-sem-correct</FONT>/<FONT COLOR=RED>wrong</FONT> (unscored)\n";

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -