📄 sentence-by-sentence.pl.svn-base
字号:
{ my $x = shift; return ($x - int($x) < .5) ? int($x) : int($x) + 1;}#escape HTML metacharacters for display purposes and to allow for consistent string comparison#arguments: string to be formatted in place#return: nonesub escapeMetachars{ my $str = shift; $str =~ s/&\s+/& /; $str =~ s/<\s+/< /; $str =~ s/>\s+/> /;}################################################################################################################################################################read one line from each of any number of filehandles#arguments: arrayref of filehandles, (empty) arrayref to be filled with read lines#return: 1 on success, 0 on failure (on failure the lines arrayref's value isn't defined)sub readLines{ my ($refFilehandles, $refLines) = @_; foreach my $fh (@$refFilehandles) { my $line = <$fh>; return 0 unless defined($line); push @$refLines, $line; } return 1;}#arguments: line read from corpus file#return: sentence (arrayref of arrayrefs of factor strings) taken from linesub extractFactorArrays{ my $line = shift; die "" if !defined $line; chomp $line; $line =~ s/^\s*|\s*$//g; #added by Ondrej to handle moses-mert-parallel output my @words = split(/\s+/, $line); my @factors = map {my @f = split(/\|/, $_); \@f;} @words; return \@factors;}#can handle multiple reference translations; assume at least one#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use#return: arrayref of [arrayref of [overall BLEU score, n-gram precisions], arrayref of matching n-gram [start index, length, arrayref of indices of matching references]]sub getBLEUSentenceDetails{ my $maxNgramOrder = 4; my ($refSysOutput, $refTruths, $factorIndex) = @_; my $length_translation = scalar(@$refSysOutput); #length of sysout sentence my @length_references = map {scalar(@$_)} @$refTruths; my $closestTruthLength = (sort(map {abs($_ - $length_translation)} @length_references))[0]; my @correct = (0) x $maxNgramOrder; #n-gram counts my @total = (0) x $maxNgramOrder; #n-gram counts my $returnData = [[], []]; my %REF_GRAM; #hash from n-gram to arrayref with # of times found in each reference my $ngramMatches = []; #arrayref of n-gram [start index, length] for(my $j = 0; $j < scalar(@$refTruths); $j++) { for(my $i = 0; $i < $length_references[$j]; $i++) { my $gram = ''; for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders { $gram = $refTruths->[$j]->[$i - $k]->[$factorIndex] . " " . $gram; #increment the count for the given n-gram and given reference number if(!exists $REF_GRAM{$gram}) { my @tmp = (0) x scalar @$refTruths; $tmp[$j] = 1; $REF_GRAM{$gram} = \@tmp; } else { $REF_GRAM{$gram}->[$j]++; } } } } for(my $i = 0; $i < $length_translation; $i++) { my $gram = ''; for(my $k = 0; $k < min($i + 1, $maxNgramOrder); $k++) #run over n-gram orders { $gram = $refSysOutput->[$i - $k]->[$factorIndex] . " " . $gram; if(exists $REF_GRAM{$gram}) #this n-gram was found in at least one reference { $correct[$k]++; my @indices = (); my $notOvercounting = 0; #make sure we don't 'match' against truth n-grams whose instances have all been used already for(my $m = 0; $m < scalar(@{$REF_GRAM{$gram}}); $m++) { if($REF_GRAM{$gram}->[$m] > 0) { push @indices, $m; $REF_GRAM{$gram}->[$m]--; $notOvercounting = 1; } } if($notOvercounting == 1) {push @$ngramMatches, [$i - $k, $k + 1, \@indices];} } } } my $brevity = ($length_translation > $closestTruthLength || $length_translation == 0) ? 1 : exp(1 - $closestTruthLength / $length_translation); my @pct; my ($logsum, $logcount) = (0, 0); for(my $i = 0; $i < $maxNgramOrder; $i++) { $total[$i] = max(1, $length_translation - $i); push @pct, ($total[$i] == 0) ? -1 : $correct[$i] / $total[$i]; if($total[$i] > 0) { $logsum += my_log($pct[$i]); $logcount++; } } my $bleu = $brevity * exp($logsum / $logcount); $returnData->[0] = [$bleu, @pct]; $returnData->[1] = $ngramMatches; return $returnData;}#can handle multiple sentence translations; assume at least one#arguments: sysout sentence (arrayref of arrayrefs of factor strings), truth sentences (arrayref of same), factor index to use#return: hashref of sysout word index => whether word matchessub getPWERSentenceDetails{ my ($refSysOutput, $refTruths, $factorIndex) = @_; my $matches = {}; my %truthWords; #hash from word to arrayref with number of times seen in each reference (but later holds only the max) for(my $i = 0; $i < scalar(@$refTruths); $i++) { foreach my $eWord (@{$refTruths->[$i]}) { my $factor = $eWord->[$factorIndex]; if(exists $truthWords{$factor}) {$truthWords{$factor}->[$i]++;} else {my @tmp = (0) x scalar(@$refTruths); $tmp[$i] = 1; $truthWords{$factor} = \@tmp;} } } %truthWords = map {$_ => maxN(@{$truthWords{$_}})} (keys %truthWords); #save only the max times each word is seen in a reference for(my $j = 0; $j < scalar(@$refSysOutput); $j++) { if(exists $truthWords{$refSysOutput->[$j]->[$factorIndex]} && $truthWords{$refSysOutput->[$j]->[$factorIndex]} > 0) { $truthWords{$refSysOutput->[$j]->[$factorIndex]}--; $matches->{$j} = 1; } else { $matches->{$j} = 0; } } return $matches;}#assign ranks to sentences by BLEU score#arguments: arrayref of arrayrefs of [sentence index, arrayref of [bleu score, n-gram precisions], rank to be assigned]#return: nonesub rankSentencesByBLEU{ my $bleuData = shift; my $i = 0; #sort first on score, then on 1-gram accuracy, then on sentence index foreach my $sentenceData (reverse sort {my $c = $a->[1]->[0] <=> $b->[1]->[0]; if($c == 0) {my $d = $a->[1]->[1] <=> $b->[1]->[1]; if($d == 0) {$a->[0] cmp $b->[0];} else {$d;}} else {$c;}} @$bleuData) {$sentenceData->[2] = $i++;}}################################################################################################################################################################write HTML for a sentence containing factors (display words in a row)#arguments: sentence (arrayref of arrayrefs of factor strings), PWER results (hashref from word indices to 0/1 whether matched a truth word)#return: HTML stringsub getFactoredSentenceHTML{ my $sentence = shift; my $pwer = 0; if(scalar(@_) > 0) {$pwer = shift;} my $html = "<table class=\"sentence_table\"><tr>"; for(my $i = 0; $i < scalar(@$sentence); $i++) #loop over words { my $style = ''; #default if($pwer ne '0' && $pwer->{$i} == 0) {$style = 'color: #cc0000; font-weight: bold';} $html .= "<td align=center style=\"$style\">" . join("<br>", @{$sentence->[$i]}) . "</td>"; } return $html . "</tr></table>";}#arguments: arrayref of [sentence index, arrayref of [bleu score, n-gram precisions], rank], number of sentences#return: HTML color stringsub getSentenceBGColorHTML{ my ($scoreData, $numSentences) = @_; my $tier = int($scoreData->[2] / ($numSentences / scalar(@htmlColors))); #0..n-1 return $htmlColors[$tier];}#display all matching n-grams in the given sentence, with all 1-grams on one line, then arranged by picking, for each, the first line on which it fits,# where a given word position can only be filled by one n-gram per line, so that all n-grams can be shown#arguments: sentence (arrayref of arrayrefs of factor strings), arrayref of arrayrefs of matching n-gram [start, length, arrayref of matching reference indices], # number of reference translations#return: HTML stringsub getAllNgramsHTML{ my ($sentence, $ngrams, $numTruths) = @_; my $factorIndex = 0; my @table = (); #array or arrayrefs each of which represents a line; each position has the index of the occupying n-gram, or -1 if none my $n = 0; #n-gram index foreach my $ngram (sort {$a->[0] <=> $b->[0]} @$ngrams) { #check for an open slot in an existing row my $foundRow = 0; my $r = 0; foreach my $row (@table) { if(rowIsClear($row, $ngram) == 1) { @{$row}[$ngram->[0] .. ($ngram->[0] + $ngram->[1] - 1)] = ($n) x $ngram->[1]; push @$ngram, $r; #add row index $foundRow = 1; last; } $r++; } #add row if necessary if($foundRow == 0) { my @row = (-1) x scalar(@$sentence); @row[$ngram->[0] .. ($ngram->[0] + $ngram->[1] - 1)] = ($n) x $ngram->[1]; push @$ngram, scalar(@table); #add row index push @table, \@row; } $n++; } my $html = "<table class=\"ngram_table\"><tr><td align=center>" . join("</td><td align=center>", map {$_->[$factorIndex]} @$sentence) . "</td></tr>"; my $numWords = scalar(@$sentence); my ($curRow, $curCol) = (0, 0); #address in table $html .= "<tr>"; foreach my $ngram (sort {my $c = $a->[3] <=> $b->[3]; if($c == 0) {$a->[0] <=> $b->[0]} else {$c}} @$ngrams) #sort by row, then word num { while($ngram->[0] > $curCol || $ngram->[3] > $curRow) {$html .= "<td></td>"; $curCol = ($curCol + 1) % $numWords; if($curCol == 0) {$html .= "</tr><tr>"; $curRow++;}} $html .= "<td colspan=" . $ngram->[1] . " align=center class=\"ngram_cell\" style=\"background: " . getNgramColorHTML(scalar(@{$ngram->[2]}), $numTruths) . "\">" . join(' ', map {$_->[$factorIndex]} @{$sentence}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) . "</td>"; $curCol = ($curCol + $ngram->[1]) % $numWords; if($curCol == 0) {$html .= "</tr><tr>"; $curRow++;} } $html .= "</tr>"; return $html . "</table>\n";}#auxiliary to getAllNgramsHTML(): check a table row for an empty piece at the right place for the given n-gram#arguments: row (arrayref of ints), n-gram (arrayref of [start index, length])#return: whether (0/1) row is clearsub rowIsClear{ my ($row, $ngram) = @_; return (maxN(@{$row}[$ngram->[0] .. $ngram->[0] + $ngram->[1] - 1]) == -1) ? 1 : 0;}#auxiliary to getAllNgramsHTML()#arguments: number of reference translations matching the n-gram, total number of references#return: HTML color stringsub getNgramColorHTML{ my ($matches, $total) = @_; if($total == 1) {return $ngramSingleRefColor;} return $ngramMultirefColors[round($matches / $total * (scalar(@ngramMultirefColors) - 1))];}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -