📄 corpus.pm
字号:
push @$sentenceWER, $sentWER;
push @$errIndices, $indices;
}
return ($totWER, $sentenceWER, $errIndices);
}
#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use
#return: wer score, arrayref of arrayrefs of indices of errorful words
sub sentenceWER
{
#constants: direction we came through the table
my ($DIR_NONE, $DIR_SKIPTRUTH, $DIR_SKIPOUT, $DIR_SKIPBOTH) = (-1, 0, 1, 2); #values don't matter but must be unique
my ($self, $refSysOutput, $refTruth, $index) = @_;
my ($totWER, $indices) = (0, []);
my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth));
if($sLength == 0 || $eLength == 0) {return ($totWER, $indices);} #special case
my @refWordsMatchIndices = (-1) x $eLength; #at what sysout-word index this truth word is first matched
my @sysoutWordsMatchIndices = (-1) x $sLength; #at what truth-word index this sysout word is first matched
my $table = []; #index by sysout word index, then truth word index; a cell holds max count of matching words and direction we came to get it
#dynamic-programming time: find the path through the table with the maximum number of matching words
for(my $i = 0; $i < $sLength; $i++)
{
push @$table, [];
for(my $j = 0; $j < $eLength; $j++)
{
my ($maxPrev, $prevDir) = (0, $DIR_NONE);
if($i > 0 && $table->[$i - 1]->[$j]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j]->[0]; $prevDir = $DIR_SKIPOUT;}
if($j > 0 && $table->[$i]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i]->[$j - 1]->[0]; $prevDir = $DIR_SKIPTRUTH;}
if($i > 0 && $j > 0 && $table->[$i - 1]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j - 1]->[0]; $prevDir = $DIR_SKIPBOTH;}
my $match = ($refSysOutput->[$i]->[$index] eq $refTruth->[$j]->[$index] && $refWordsMatchIndices[$j] == -1 && $sysoutWordsMatchIndices[$i] == -1) ? 1 : 0;
if($match == 1) {$refWordsMatchIndices[$j] = $i; $sysoutWordsMatchIndices[$i] = $j;}
push @{$table->[$i]}, [($match ? $maxPrev + 1 : $maxPrev), $prevDir];
}
}
#look back along the path and get indices of non-matching words
my @unusedSysout = (0) x $sLength; #whether each sysout word was matched--used for outputting html table
my ($i, $j) = ($sLength - 1, $eLength - 1);
while($i > 0) #work our way back to the first sysout word
{
push @{$table->[$i]->[$j]}, 0; #length is flag to highlight cell
if($table->[$i]->[$j]->[1] == $DIR_SKIPTRUTH)
{
$j--;
}
elsif($table->[$i]->[$j]->[1] == $DIR_SKIPOUT)
{
if($table->[$i - 1]->[$j]->[0] == $table->[$i]->[$j]->[0]) {unshift @$indices, $i; $unusedSysout[$i] = 1;}
$i--;
}
elsif($table->[$i]->[$j]->[1] == $DIR_SKIPBOTH)
{
if($table->[$i - 1]->[$j - 1]->[0] == $table->[$i]->[$j]->[0]) {unshift @$indices, $i; $unusedSysout[$i] = 1;}
$i--; $j--;
}
}
#we're at the first sysout word; finish up checking for matches
while($j > 0 && $refWordsMatchIndices[$j] != 0) {push @{$table->[0]->[$j]}, 0; $j--;}
if($j == 0 && $refWordsMatchIndices[0] != 0) {unshift @$indices, 0; $unusedSysout[0] = 1;} #no truth word was matched to the first sysout word
#print some HTML to debug the WER algorithm
# print "<table border=1><tr><td></td><td>" . join("</td><td>", map {() . $_->[$index]} @$refTruth) . "</td></tr>";
# for(my $i = 0; $i < $sLength; $i++)
# {
# print "<tr><td" . (($unusedSysout[$i] == 1) ? " style=\"background-color: #ffdd88\">" : ">") . $refSysOutput->[$i]->[$index] . "</td>";
# for(my $j = 0; $j < $eLength; $j++)
# {
# print "<td";
# if(scalar(@{$table->[$i]->[$j]}) > 2) {print " style=\"color: yellow; background-color: #000080\"";}
# my $arrow;
# if($table->[$i]->[$j]->[1] == $DIR_NONE) {$arrow = "×";}
# elsif($table->[$i]->[$j]->[1] == $DIR_SKIPTRUTH) {$arrow = "←";}
# elsif($table->[$i]->[$j]->[1] == $DIR_SKIPOUT) {$arrow = "↑";}
# elsif($table->[$i]->[$j]->[1] == $DIR_SKIPBOTH) {$arrow = "◊";}
# print ">" . $table->[$i]->[$j]->[0] . " " . $arrow . "</td>";
# }
# print "</tr>";
# }
# print "</table>";
my $matchCount = 0;
if($sLength > 0) {$matchCount = $table->[$sLength - 1]->[$eLength - 1]->[0];}
return ($sLength - $matchCount, $indices);
}
#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use
#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful words
sub corpusPWER
{
my ($self, $refSysOutput, $refTruth, $index) = @_;
my ($totWER, $sentenceWER, $errIndices) = (0, [], []);
for(my $i = 0; $i < scalar(@$refSysOutput); $i++)
{
my ($sentWER, $indices) = $self->sentencePWER($refSysOutput->[$i], $refTruth->[$i], $index);
$totWER += $sentWER;
push @$sentenceWER, $sentWER;
push @$errIndices, $indices;
}
return ($totWER, $sentenceWER, $errIndices);
}
#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use
#return: wer score, arrayref of arrayrefs of indices of errorful words
sub sentencePWER
{
my ($self, $refSysOutput, $refTruth, $index) = @_;
my ($totWER, $indices) = (0, []);
my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth));
my @truthWordUsed = (0) x $eLength; #array of 0/1; can only match a given truth word once
for(my $j = 0; $j < $sLength; $j++)
{
my $found = 0;
for(my $k = 0; $k < $eLength; $k++) #check output word against entire truth sentence
{
if(lc $refSysOutput->[$j]->[$index] eq lc $refTruth->[$k]->[$index] && $truthWordUsed[$k] == 0)
{
$truthWordUsed[$k] = 1;
$found = 1;
last;
}
}
if($found == 0)
{
$totWER++;
push @$indices, $j;
}
}
return ($totWER, $indices);
}
#BLEU calculation for a single sentence
#arguments: truth sentence (arrayref of arrayrefs of factor strings), sysout sentence (same), factor index to use
#return: 1- through 4-gram matching and total counts (1-g match, 1-g tot, 2-g match...), candidate length, reference length
sub sentenceBLEU
{
my ($self, $refTruth, $refSysOutput, $factorIndex, $debug) = @_;
my ($length_reference, $length_translation) = (scalar(@$refTruth), scalar(@$refSysOutput));
my ($correct1, $correct2, $correct3, $correct4, $total1, $total2, $total3, $total4) = (0, 0, 0, 0, 0, 0, 0, 0);
my %REF_GRAM = ();
my ($i, $gram);
for($i = 0; $i < $length_reference; $i++)
{
$gram = $refTruth->[$i]->[$factorIndex];
$REF_GRAM{$gram}++;
next if $i<1;
$gram = $refTruth->[$i - 1]->[$factorIndex] ." ".$gram;
$REF_GRAM{$gram}++;
next if $i<2;
$gram = $refTruth->[$i - 2]->[$factorIndex] ." ".$gram;
$REF_GRAM{$gram}++;
next if $i<3;
$gram = $refTruth->[$i - 3]->[$factorIndex] ." ".$gram;
$REF_GRAM{$gram}++;
}
for($i = 0; $i < $length_translation; $i++)
{
$gram = $refSysOutput->[$i]->[$factorIndex];
if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {
$REF_GRAM{$gram}--;
$correct1++;
}
next if $i<1;
$gram = $refSysOutput->[$i - 1]->[$factorIndex] ." ".$gram;
if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {
$REF_GRAM{$gram}--;
$correct2++;
}
next if $i<2;
$gram = $refSysOutput->[$i - 2]->[$factorIndex] ." ".$gram;
if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {
$REF_GRAM{$gram}--;
$correct3++;
}
next if $i<3;
$gram = $refSysOutput->[$i - 3]->[$factorIndex] ." ".$gram;
if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) {
$REF_GRAM{$gram}--;
$correct4++;
}
}
my $total = $length_translation;
$total1 = max(1, $total);
$total2 = max(1, $total - 1);
$total3 = max(1, $total - 2);
$total4 = max(1, $total - 3);
return ($correct1, $total1, $correct2, $total2, $correct3, $total3, $correct4, $total4, $length_translation, $length_reference);
}
##### filesystem #####
#open as many given files as possible; only warn about the rest
#arguments: list of filename extensions to open (assume corpus name is file title)
#return: hash from type string to filehandleref, giving all files that were successfully opened
sub openFiles
{
my ($self, @extensions) = @_;
my %openedFiles = ();
foreach my $ext (@extensions)
{
if(!open(FILE, "<" . $self->{'corpusName'} . $ext))
{
warn "Corpus::openFiles(): couldn't open '" . $self->{'corpusName'} . $ext . "' for read\n";
}
else #success
{
$openedFiles{$ext} = \*FILE;
}
}
return %openedFiles;
}
#read one line from each given file
#arguments: hash from type string to filehandleref
#return: hash from type string to sentence (stored as arrayref of arrayrefs of factors) read from corresponding file
sub readLineFromFiles
{
my ($self, %openedFiles) = @_;
my %lines;
foreach my $type (keys %openedFiles)
{
$lines{$type} = [];
my $sentence = <$openedFiles{$type}>;
my @words = split(/\s+/, $sentence);
foreach my $word (@words)
{
my @factors = split(/\|/, $word);
push @{$lines{$type}}, \@factors;
}
}
return %lines;
}
#close all given files
#arguments: hash from type string to filehandleref
#return: none
sub closeFiles
{
my ($self, %openedFiles) = @_;
foreach my $type (keys %openedFiles)
{
close($openedFiles{$type});
}
}
##### write HTML #####
#print HTML for comparing various versions of a sentence, with special processing for each version as appropriate
#arguments: filehandleref to which to write, sentence ID string, hashref of version string to sentence (stored as arrayref of arrayref of factor strings)
#return: none
sub printSingleSentenceComparison
{
my ($self, $fh, $sentID, $sentences) = @_;
my $curFH = select;
select $fh;
#javascript to reorder rows to look nice afterward
print "<script type=\"text/javascript\">
function reorder_$sentID()
{/*
var table = document.getElementById('div_$sentID').firstChild;
var refTransRow = table.getElementById('row_e');
var inputRow = table.getElementById('row_f');
table.removeRow(refTransRow);
table.removeRow(inputRow);
var newRow1 = table.insertRow(0);
var newRow2 = table.insertRow(1);
newRow1.childNodes = inputRow.childNodes;
newRow2.childNodes = refTransRow.childNodes;*/
}
</script>";
#html for sentences
print "<div id=\"div_$sentID\" style=\"padding: 3px; margin: 5px\">";
print "<table border=\"1\">";
# my $rowCount = 0;
# my @bgColors = ("#ffefbf", "#ffdf7f");
#process all rows in order
foreach my $sentType (keys %$sentences)
{
my $bgcolor = $bgColors[$rowCount % 2];
print "<tr id=\"row_$sentType\"><td align=right>";
#description of sentence
if(defined($self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType}))
{
print "(" . $self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType} . ")";
}
else
{
print "($sentType)";
}
print "</td><td align=left>";
#sentence with markup
if($sentType eq 'f') #input
{
# $self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $inputColor);
}
elsif($sentType eq 'e') #reference translation
{
# $self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $reftransColor);
}
else #system output
{
# $self->writeHTMLTranslationHighlightedWithFactors($fh, $sentences->{$sentType}, $sentences->{'e'}, $highlightColors);
}
print "</td></tr>";
# $rowCount++;
}
print "</table>";
print "</div>\n";
select $curFH;
}
#print contents of all fields of this object, with useful formatting for arrayrefs and hashrefs
#arguments: none
#return: none
sub printDetails
{
my $self = shift;
foreach my $key (keys %$self)
{
if(ref($self->{$key}) eq 'HASH')
{
print STDERR "obj: $key => {" . join(', ', map {"$_ => " . $self->{$key}->{$_}} (keys %{$self->{$key}})) . "}\n";
}
elsif(ref($self->{$key}) eq 'ARRAY')
{
print STDERR "obj: $key => (" . join(', ', @{$self->{$key}}) . ")\n";
}
elsif(ref($self->{$key}) eq '') #not a reference
{
print STDERR "obj: $key => " . $self->{$key} . "\n";
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -