📄 corpus.pm.svn-base
字号:
push @$sentenceWER, $sentWER; push @$errIndices, $indices; } return ($totWER, $sentenceWER, $errIndices);}#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use#return: wer score, arrayref of arrayrefs of indices of errorful wordssub sentenceWER{ #constants: direction we came through the table my ($DIR_NONE, $DIR_SKIPTRUTH, $DIR_SKIPOUT, $DIR_SKIPBOTH) = (-1, 0, 1, 2); #values don't matter but must be unique my ($self, $refSysOutput, $refTruth, $index) = @_; my ($totWER, $indices) = (0, []); my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth)); if($sLength == 0 || $eLength == 0) {return ($totWER, $indices);} #special case my @refWordsMatchIndices = (-1) x $eLength; #at what sysout-word index this truth word is first matched my @sysoutWordsMatchIndices = (-1) x $sLength; #at what truth-word index this sysout word is first matched my $table = []; #index by sysout word index, then truth word index; a cell holds max count of matching words and direction we came to get it #dynamic-programming time: find the path through the table with the maximum number of matching words for(my $i = 0; $i < $sLength; $i++) { push @$table, []; for(my $j = 0; $j < $eLength; $j++) { my ($maxPrev, $prevDir) = (0, $DIR_NONE); if($i > 0 && $table->[$i - 1]->[$j]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j]->[0]; $prevDir = $DIR_SKIPOUT;} if($j > 0 && $table->[$i]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i]->[$j - 1]->[0]; $prevDir = $DIR_SKIPTRUTH;} if($i > 0 && $j > 0 && $table->[$i - 1]->[$j - 1]->[0] >= $maxPrev) {$maxPrev = $table->[$i - 1]->[$j - 1]->[0]; $prevDir = $DIR_SKIPBOTH;} my $match = ($refSysOutput->[$i]->[$index] eq $refTruth->[$j]->[$index] && $refWordsMatchIndices[$j] == -1 && $sysoutWordsMatchIndices[$i] == -1) ? 1 : 0; if($match == 1) {$refWordsMatchIndices[$j] = $i; $sysoutWordsMatchIndices[$i] = $j;} push @{$table->[$i]}, [($match ? $maxPrev + 1 : $maxPrev), $prevDir]; } } #look back along the path and get indices of non-matching words my @unusedSysout = (0) x $sLength; #whether each sysout word was matched--used for outputting html table my ($i, $j) = ($sLength - 1, $eLength - 1); while($i > 0) #work our way back to the first sysout word { push @{$table->[$i]->[$j]}, 0; #length is flag to highlight cell if($table->[$i]->[$j]->[1] == $DIR_SKIPTRUTH) { $j--; } elsif($table->[$i]->[$j]->[1] == $DIR_SKIPOUT) { if($table->[$i - 1]->[$j]->[0] == $table->[$i]->[$j]->[0]) {unshift @$indices, $i; $unusedSysout[$i] = 1;} $i--; } elsif($table->[$i]->[$j]->[1] == $DIR_SKIPBOTH) { if($table->[$i - 1]->[$j - 1]->[0] == $table->[$i]->[$j]->[0]) {unshift @$indices, $i; $unusedSysout[$i] = 1;} $i--; $j--; } } #we're at the first sysout word; finish up checking for matches while($j > 0 && $refWordsMatchIndices[$j] != 0) {push @{$table->[0]->[$j]}, 0; $j--;} if($j == 0 && $refWordsMatchIndices[0] != 0) {unshift @$indices, 0; $unusedSysout[0] = 1;} #no truth word was matched to the first sysout word #print some HTML to debug the WER algorithm# print "<table border=1><tr><td></td><td>" . join("</td><td>", map {() . $_->[$index]} @$refTruth) . "</td></tr>";# for(my $i = 0; $i < $sLength; $i++)# {# print "<tr><td" . (($unusedSysout[$i] == 1) ? " style=\"background-color: #ffdd88\">" : ">") . $refSysOutput->[$i]->[$index] . "</td>";# for(my $j = 0; $j < $eLength; $j++)# {# print "<td";# if(scalar(@{$table->[$i]->[$j]}) > 2) {print " style=\"color: yellow; background-color: #000080\"";}# my $arrow;# if($table->[$i]->[$j]->[1] == $DIR_NONE) {$arrow = "×";}# elsif($table->[$i]->[$j]->[1] == $DIR_SKIPTRUTH) {$arrow = "←";}# elsif($table->[$i]->[$j]->[1] == $DIR_SKIPOUT) {$arrow = "↑";}# elsif($table->[$i]->[$j]->[1] == $DIR_SKIPBOTH) {$arrow = "◊";}# print ">" . $table->[$i]->[$j]->[0] . " " . $arrow . "</td>";# }# print "</tr>";# }# print "</table>"; my $matchCount = 0; if($sLength > 0) {$matchCount = $table->[$sLength - 1]->[$eLength - 1]->[0];} return ($sLength - $matchCount, $indices);}#arguments: system output (arrayref of arrayrefs of arrayrefs of factor strings), truth (same), factor index to use#return: wer score, arrayref of sentence scores, arrayref of arrayrefs of indices of errorful wordssub corpusPWER{ my ($self, $refSysOutput, $refTruth, $index) = @_; my ($totWER, $sentenceWER, $errIndices) = (0, [], []); for(my $i = 0; $i < scalar(@$refSysOutput); $i++) { my ($sentWER, $indices) = $self->sentencePWER($refSysOutput->[$i], $refTruth->[$i], $index); $totWER += $sentWER; push @$sentenceWER, $sentWER; push @$errIndices, $indices; } return ($totWER, $sentenceWER, $errIndices);}#arguments: system output (arrayref of arrayrefs of factor strings), truth (same), factor index to use#return: wer score, arrayref of arrayrefs of indices of errorful wordssub sentencePWER{ my ($self, $refSysOutput, $refTruth, $index) = @_; my ($totWER, $indices) = (0, []); my ($sLength, $eLength) = (scalar(@$refSysOutput), scalar(@$refTruth)); my @truthWordUsed = (0) x $eLength; #array of 0/1; can only match a given truth word once for(my $j = 0; $j < $sLength; $j++) { my $found = 0; for(my $k = 0; $k < $eLength; $k++) #check output word against entire truth sentence { if(lc $refSysOutput->[$j]->[$index] eq lc $refTruth->[$k]->[$index] && $truthWordUsed[$k] == 0) { $truthWordUsed[$k] = 1; $found = 1; last; } } if($found == 0) { $totWER++; push @$indices, $j; } } return ($totWER, $indices);}#BLEU calculation for a single sentence#arguments: truth sentence (arrayref of arrayrefs of factor strings), sysout sentence (same), factor index to use#return: 1- through 4-gram matching and total counts (1-g match, 1-g tot, 2-g match...), candidate length, reference lengthsub sentenceBLEU{ my ($self, $refTruth, $refSysOutput, $factorIndex, $debug) = @_; my ($length_reference, $length_translation) = (scalar(@$refTruth), scalar(@$refSysOutput)); my ($correct1, $correct2, $correct3, $correct4, $total1, $total2, $total3, $total4) = (0, 0, 0, 0, 0, 0, 0, 0); my %REF_GRAM = (); my ($i, $gram); for($i = 0; $i < $length_reference; $i++) { $gram = $refTruth->[$i]->[$factorIndex]; $REF_GRAM{$gram}++; next if $i<1; $gram = $refTruth->[$i - 1]->[$factorIndex] ." ".$gram; $REF_GRAM{$gram}++; next if $i<2; $gram = $refTruth->[$i - 2]->[$factorIndex] ." ".$gram; $REF_GRAM{$gram}++; next if $i<3; $gram = $refTruth->[$i - 3]->[$factorIndex] ." ".$gram; $REF_GRAM{$gram}++; } for($i = 0; $i < $length_translation; $i++) { $gram = $refSysOutput->[$i]->[$factorIndex]; if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { $REF_GRAM{$gram}--; $correct1++; } next if $i<1; $gram = $refSysOutput->[$i - 1]->[$factorIndex] ." ".$gram; if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { $REF_GRAM{$gram}--; $correct2++; } next if $i<2; $gram = $refSysOutput->[$i - 2]->[$factorIndex] ." ".$gram; if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { $REF_GRAM{$gram}--; $correct3++; } next if $i<3; $gram = $refSysOutput->[$i - 3]->[$factorIndex] ." ".$gram; if (defined($REF_GRAM{$gram}) && $REF_GRAM{$gram} > 0) { $REF_GRAM{$gram}--; $correct4++; } } my $total = $length_translation; $total1 = max(1, $total); $total2 = max(1, $total - 1); $total3 = max(1, $total - 2); $total4 = max(1, $total - 3); return ($correct1, $total1, $correct2, $total2, $correct3, $total3, $correct4, $total4, $length_translation, $length_reference);}##### filesystem ######open as many given files as possible; only warn about the rest#arguments: list of filename extensions to open (assume corpus name is file title)#return: hash from type string to filehandleref, giving all files that were successfully openedsub openFiles{ my ($self, @extensions) = @_; my %openedFiles = (); foreach my $ext (@extensions) { if(!open(FILE, "<" . $self->{'corpusName'} . $ext)) { warn "Corpus::openFiles(): couldn't open '" . $self->{'corpusName'} . $ext . "' for read\n"; } else #success { $openedFiles{$ext} = \*FILE; } } return %openedFiles;}#read one line from each given file#arguments: hash from type string to filehandleref#return: hash from type string to sentence (stored as arrayref of arrayrefs of factors) read from corresponding filesub readLineFromFiles{ my ($self, %openedFiles) = @_; my %lines; foreach my $type (keys %openedFiles) { $lines{$type} = []; my $sentence = <$openedFiles{$type}>; my @words = split(/\s+/, $sentence); foreach my $word (@words) { my @factors = split(/\|/, $word); push @{$lines{$type}}, \@factors; } } return %lines;}#close all given files#arguments: hash from type string to filehandleref#return: nonesub closeFiles{ my ($self, %openedFiles) = @_; foreach my $type (keys %openedFiles) { close($openedFiles{$type}); }}##### write HTML ######print HTML for comparing various versions of a sentence, with special processing for each version as appropriate#arguments: filehandleref to which to write, sentence ID string, hashref of version string to sentence (stored as arrayref of arrayref of factor strings)#return: nonesub printSingleSentenceComparison{ my ($self, $fh, $sentID, $sentences) = @_; my $curFH = select; select $fh; #javascript to reorder rows to look nice afterward print "<script type=\"text/javascript\"> function reorder_$sentID() {/* var table = document.getElementById('div_$sentID').firstChild; var refTransRow = table.getElementById('row_e'); var inputRow = table.getElementById('row_f'); table.removeRow(refTransRow); table.removeRow(inputRow); var newRow1 = table.insertRow(0); var newRow2 = table.insertRow(1); newRow1.childNodes = inputRow.childNodes; newRow2.childNodes = refTransRow.childNodes;*/ } </script>"; #html for sentences print "<div id=\"div_$sentID\" style=\"padding: 3px; margin: 5px\">"; print "<table border=\"1\">";# my $rowCount = 0;# my @bgColors = ("#ffefbf", "#ffdf7f"); #process all rows in order foreach my $sentType (keys %$sentences) { my $bgcolor = $bgColors[$rowCount % 2]; print "<tr id=\"row_$sentType\"><td align=right>"; #description of sentence if(defined($self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType})) { print "(" . $self->{'fileDescriptions'}->{$self->{'corpusName'} . $sentType} . ")"; } else { print "($sentType)"; } print "</td><td align=left>"; #sentence with markup if($sentType eq 'f') #input {# $self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $inputColor); } elsif($sentType eq 'e') #reference translation {# $self->writeHTMLSentenceWithFactors($fh, $sentences->{$sentType}, $reftransColor); } else #system output {# $self->writeHTMLTranslationHighlightedWithFactors($fh, $sentences->{$sentType}, $sentences->{'e'}, $highlightColors); } print "</td></tr>";# $rowCount++; } print "</table>"; print "</div>\n"; select $curFH;}#print contents of all fields of this object, with useful formatting for arrayrefs and hashrefs#arguments: none#return: nonesub printDetails{ my $self = shift; foreach my $key (keys %$self) { if(ref($self->{$key}) eq 'HASH') { print STDERR "obj: $key => {" . join(', ', map {"$_ => " . $self->{$key}->{$_}} (keys %{$self->{$key}})) . "}\n"; } elsif(ref($self->{$key}) eq 'ARRAY') { print STDERR "obj: $key => (" . join(', ', @{$self->{$key}}) . ")\n"; } elsif(ref($self->{$key}) eq '') #not a reference { print STDERR "obj: $key => " . $self->{$key} . "\n"; } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -