📄 train-factored-phrase-model.perl.svn-base
字号:
$factor_f);
        # NOTE(review): the lines down to the two closing braces are the tail
        # of a sub that begins before this chunk; reproduced unchanged.
        &reduce_factors($___CORPUS.".".$___E.$___CORPUS_COMPRESSION,
                        $___MODEL_DIR."/aligned.".$factor_e.".".$___E,
                        $factor_e);
        &get_lexical();
    }
}

# (4) Build the lexical translation counts for the current $factor.
#
# Reads the three parallel files opened by open_alignment() line by line
# (E and F word sequences, A alignment points written as "fi-ei"), counts
# aligned word pairs, counts every unaligned word against the token "NULL",
# and passes the counts to save_word_translation().
# Returns early, without reading anything, when both lex.$factor.f2n and
# lex.$factor.n2f already exist in $___LEXICAL_DIR.
sub get_lexical {
    print STDERR "(4) [$factor] generate lexical translation table @ ".`date`;
    my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);

    if (-e "$___LEXICAL_DIR/lex.$factor.f2n" && -e "$___LEXICAL_DIR/lex.$factor.n2f") {
        print STDERR " reusing: $___LEXICAL_DIR/lex.$factor.f2n and $___LEXICAL_DIR/lex.$factor.n2f\n";
        return;
    }

    &open_alignment();
    while(my $e = <E>) {
        # progress marker every 1000 sentence pairs
        if (($alignment_id++ % 1000) == 0) { print STDERR "!"; }
        chomp($e);
        my @ENGLISH = split(/ /,$e);
        my $f = <F>; chomp($f);
        my @FOREIGN = split(/ /,$f);
        my $a = <A>; chomp($a);

        my (%FOREIGN_ALIGNED,%ENGLISH_ALIGNED);
        foreach (split(/ /,$a)) {
            my ($fi,$ei) = split(/\-/);
            if ($fi >= scalar(@FOREIGN) || $ei >= scalar(@ENGLISH)) {
                print STDERR "alignment point ($fi,$ei) out of range (0-$#FOREIGN,0-$#ENGLISH) in line $alignment_id, ignoring\n";
            }
            else {
                # local counts
                $FOREIGN_ALIGNED{$fi}++;
                $ENGLISH_ALIGNED{$ei}++;

                # global counts
                $WORD_TRANSLATION{$FOREIGN[$fi]}{$ENGLISH[$ei]}++;
                $TOTAL_FOREIGN{$FOREIGN[$fi]}++;
                $TOTAL_ENGLISH{$ENGLISH[$ei]}++;
            }
        }

        # unaligned words
        for(my $ei=0;$ei<scalar(@ENGLISH);$ei++) {
            next if defined($ENGLISH_ALIGNED{$ei});
            $WORD_TRANSLATION{"NULL"}{$ENGLISH[$ei]}++;
            $TOTAL_ENGLISH{$ENGLISH[$ei]}++;
            $TOTAL_FOREIGN{"NULL"}++;
        }
        for(my $fi=0;$fi<scalar(@FOREIGN);$fi++) {
            next if defined($FOREIGN_ALIGNED{$fi});
            $WORD_TRANSLATION{$FOREIGN[$fi]}{"NULL"}++;
            $TOTAL_FOREIGN{$FOREIGN[$fi]}++;
            $TOTAL_ENGLISH{"NULL"}++;
        }
    }
    &close_alignment();
    &save_word_translation(\%WORD_TRANSLATION,\%TOTAL_FOREIGN,\%TOTAL_ENGLISH);
}

# Open the factor-reduced corpus files and the alignment file of the current
# factor pair on the handles E, F and A, and reset $alignment_id to 0.
# Dies if any of the three files cannot be opened.
sub open_alignment {
    open(E,'<',"$___MODEL_DIR/aligned.$factor_e.$___E")
        or die "Can't read $___MODEL_DIR/aligned.$factor_e.$___E";
    open(F,'<',"$___MODEL_DIR/aligned.$factor_f.$___F")
        or die "Can't read $___MODEL_DIR/aligned.$factor_f.$___F";
    open(A,'<',"$___MODEL_DIR/aligned.$___ALIGNMENT")
        or die "Can't read $___MODEL_DIR/aligned.$___ALIGNMENT";
    $alignment_id=0;
}

# Terminate the progress line on STDERR and close the A, F and E handles.
sub close_alignment {
    print STDERR "\n";
    close(A);
    close(F);
    close(E);
}

# Write the two lexical tables for the current $factor.
#
# Arguments (all hash references):
#   $WORD_TRANSLATION - {foreign}{english} => co-occurrence count
#   $TOTAL_FOREIGN    - {foreign} => total count
#   $TOTAL_ENGLISH    - {english} => total count
# lex.$factor.f2n receives "english foreign count/total_foreign" lines,
# lex.$factor.n2f receives "foreign english count/total_english" lines.
# Dies if the directory cannot be created or a file cannot be written.
sub save_word_translation {
    my ($WORD_TRANSLATION,$TOTAL_FOREIGN,$TOTAL_ENGLISH) = @_;
    safesystem("mkdir -p $___LEXICAL_DIR") or die;
    open(F2E,'>',"$___LEXICAL_DIR/lex.$factor.f2n")
        or die "Can't write $___LEXICAL_DIR/lex.$factor.f2n";
    open(E2F,'>',"$___LEXICAL_DIR/lex.$factor.n2f")
        or die "Can't write $___LEXICAL_DIR/lex.$factor.n2f";
    foreach my $f (keys %{$WORD_TRANSLATION}) {
        foreach my $e (keys %{$$WORD_TRANSLATION{$f}}) {
            printf F2E "%s %s %.7f\n",$e,$f,$$WORD_TRANSLATION{$f}{$e}/$$TOTAL_FOREIGN{$f};
            printf E2F "%s %s %.7f\n",$f,$e,$$WORD_TRANSLATION{$f}{$e}/$$TOTAL_ENGLISH{$e};
        }
    }
    # Buffered write errors only surface at close, and these files are read
    # again by the scoring step, so a failed close must not go unnoticed.
    close(E2F) or die "Can't write $___LEXICAL_DIR/lex.$factor.n2f";
    close(F2E) or die "Can't write $___LEXICAL_DIR/lex.$factor.f2n";
    print STDERR "Saved: $___LEXICAL_DIR/lex.$factor.f2n and $___LEXICAL_DIR/lex.$factor.n2f\n";
}

### (5) PHRASE EXTRACTION

# Run extract_phrase() once for every distinct factor pair listed in
# $___TRANSLATION_FACTORS and, when $REORDERING_LEXICAL is set, in
# $___REORDERING_FACTORS.  Sets the globals $factor, $factor_f and $factor_e
# before each call.
sub extract_phrase_factored {
    print STDERR "(5) extract phrases @ ".`date`;
    my %generated;
    foreach my $f (split(/\+/,"$___TRANSLATION_FACTORS"
                         .($REORDERING_LEXICAL ? "+$___REORDERING_FACTORS" : ""))) {
        # we extract phrases for all translation steps and also for reordering factors (if lexicalized reordering is used)
        next if $generated{$f};
        $generated{$f} = 1;
        $factor = $f;
        ($factor_f,$factor_e) = split(/\-/,$factor);

        &extract_phrase();
    }
}

# Run $PHRASE_EXTRACT for the current $factor, then concatenate the
# resulting .o.part*, .part* and .inv.part* files.  The orientation file is
# gzipped; the part files are removed unless $debug is set.
# Dies if any shell command fails.
sub extract_phrase {
    print STDERR "(5) [$factor] extract phrases @ ".`date`;
    my $cmd = "$PHRASE_EXTRACT $___MODEL_DIR/aligned.$factor_e.$___E $___MODEL_DIR/aligned.$factor_f.$___F $___MODEL_DIR/aligned.$___ALIGNMENT $___EXTRACT_FILE.$factor $___MAX_PHRASE_LENGTH orientation";
    print STDERR "$cmd\n";
    safesystem("$cmd") or die "Phrase extraction failed (missing input files?)";
    safesystem("cat $___EXTRACT_FILE.$factor.o.part* > $___EXTRACT_FILE.$factor.o") or die;
    safesystem("rm -f $___EXTRACT_FILE.$factor.o.gz") or die;
    safesystem("gzip $___EXTRACT_FILE.$factor.o") or die;
    if (! $debug) { safesystem("rm -f $___EXTRACT_FILE.$factor.o.part*") or die;}
    safesystem("cat $___EXTRACT_FILE.$factor.part* > $___EXTRACT_FILE.$factor") or die;
    if (! $debug) { safesystem("rm -f $___EXTRACT_FILE.$factor.part*") or die;}
    safesystem("cat $___EXTRACT_FILE.$factor.inv.part* > $___EXTRACT_FILE.$factor.inv") or die;
    if (! $debug) { safesystem("rm -f $___EXTRACT_FILE.$factor.inv.part*") or die;}
}

### (6) PHRASE SCORING

# Run score_phrase() for every factor pair in $___TRANSLATION_FACTORS,
# setting the globals $factor, $factor_f and $factor_e before each call.
sub score_phrase_factored {
    print STDERR "(6) score phrases @ ".`date`;
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
        $factor = $f;
        ($factor_f,$factor_e) = split(/\-/,$factor);
        &score_phrase();
    }
}

# Build $___MODEL_DIR/phrase-table.$factor.gz for the current $factor.
#
# Steps: sort the extract file and its inverse, score each direction
# ("f2n" and "n2f") in parts produced by split_extract(), sort the n2f half,
# merge both halves line by line into the phrase table, clean up
# intermediate files (unless $debug) and gzip the result.
# Dies if a shell command fails, a file cannot be opened or written, or the
# two halves disagree on their phrase pairs.
sub score_phrase {
    print STDERR "(6) [$factor] score phrases @ ".`date`;
    if (-e "$___EXTRACT_FILE.$factor.gz") {
        safesystem("gunzip < $___EXTRACT_FILE.$factor.gz > $___EXTRACT_FILE.$factor") or die;
    }
    if (-e "$___EXTRACT_FILE.$factor.inv.gz") {
        safesystem("gunzip < $___EXTRACT_FILE.$factor.inv.gz > $___EXTRACT_FILE.$factor.inv") or die;
    }
    print STDERR "(6.1) [$factor] sorting @ ".`date`;
    # print "LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor > $___EXTRACT_FILE.$factor.sorted\n";
    safesystem("LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor > $___EXTRACT_FILE.$factor.sorted") or die;
    safesystem("rm -f $___EXTRACT_FILE.$factor.gz") or die;
    safesystem("gzip $___EXTRACT_FILE.$factor") or die;
    print STDERR "(6.2) [$factor] sorting inv @ ".`date`;
    # print "LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor.inv > $___EXTRACT_FILE.$factor.inv.sorted\n";
    safesystem("LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor.inv > $___EXTRACT_FILE.$factor.inv.sorted") or die;
    safesystem("rm -f $___EXTRACT_FILE.$factor.inv.gz") or die;
    safesystem("gzip $___EXTRACT_FILE.$factor.inv") or die;

    for my $direction ("f2n","n2f") {
        print STDERR "(6.3) [$factor] creating table half $___MODEL_DIR/phrase-table-half.$factor.$direction @ ".`date`;
        # the n2f direction works on the inverse extract and passes the
        # extra " inverse" argument to the scorer
        my $extract = "$___EXTRACT_FILE.$factor.sorted";
        $extract = "$___EXTRACT_FILE.$factor.inv.sorted" if $direction eq "n2f";
        my $inverse = "";
        $inverse = " inverse" if $direction eq "n2f";
        my $part_count = &split_extract($extract);
        for(my $i=0;$i<$part_count;$i++) {
            my $part = sprintf("%04d",$i);
            print "$PHRASE_SCORE $extract.part$part $___LEXICAL_DIR/lex.$factor.$direction $___MODEL_DIR/phrase-table-half.$factor.$direction.part$part $inverse\n";
            safesystem("$PHRASE_SCORE $extract.part$part $___LEXICAL_DIR/lex.$factor.$direction $___MODEL_DIR/phrase-table-half.$factor.$direction.part$part $inverse")
                or die "Scoring of phrases failed";
            if (! $debug) { safesystem("rm $extract.part$part") or die;}
        }
        safesystem("cat $___MODEL_DIR/phrase-table-half.$factor.$direction.part* >$___MODEL_DIR/phrase-table-half.$factor.$direction") or die;
    }
    print STDERR "(6.4) [$factor] sorting inverse n2f table@ ".`date`;
    safesystem("LC_ALL=C sort -T $___MODEL_DIR $___MODEL_DIR/phrase-table-half.$factor.n2f > $___MODEL_DIR/phrase-table-half.$factor.n2f.sorted") or die;
    print STDERR "(6.5) [$factor] consolidating the two halves @ ".`date`;
    open(F2N,'<',"$___MODEL_DIR/phrase-table-half.$factor.f2n")
        or die "Can't read $___MODEL_DIR/phrase-table-half.$factor.f2n";
    open(N2F,'<',"$___MODEL_DIR/phrase-table-half.$factor.n2f.sorted")
        or die "Can't read $___MODEL_DIR/phrase-table-half.$factor.n2f.sorted";
    open(TABLE,'>',"$___MODEL_DIR/phrase-table.$factor")
        or die "Can't write $___MODEL_DIR/phrase-table.$factor";
    my $i=0;
    my $mismatch = 0;
    while(my $f2n = <F2N>) {
        $i++;
        my $n2f = <N2F>;
        # chop() strips the trailing newline of the last field
        my ($english,$foreign,$p) = split(/ \|\|\| /,$n2f); chop($p);
        my ($english2,$foreign2,$p2) = split(/ \|\|\| /,$f2n); chop($p2);
        if ($english ne $english2 || $foreign ne $foreign2) {
            print STDERR "mismatch line $i: ($english ne $english2 || $foreign ne $foreign2)\n";
            $mismatch++;
            last if $mismatch > 10;
            next;
        }
        print TABLE "$english ||| $foreign ||| $p $p2 2.718\n";
    }
    close(N2F);
    close(F2N);
    die "There were mismatches! (printed only first 10)" if $mismatch;
    # The table must be completely written before gzip reads it: close the
    # handle explicitly (flushes the buffer and reports write errors)
    # instead of relying on an implicit flush before the external command.
    close(TABLE) or die "Can't write $___MODEL_DIR/phrase-table.$factor";
    if (! $debug) { safesystem("rm -f $___MODEL_DIR/phrase-table-half.$factor.*") or die;}
    if (! $debug) { safesystem("rm -f $___MODEL_DIR/extract*sorted*") or die;}
    safesystem("rm -f $___MODEL_DIR/phrase-table.$factor.gz") or die;
    safesystem("gzip $___MODEL_DIR/phrase-table.$factor") or die;
}

# Split a sorted extract file into files named "$file.part0000",
# "$file.part0001", ...
#
# A new part is started after every 10,000,000th line, but only once the
# first " ||| "-separated field changes, so that all lines sharing that
# field stay in the same part.
# Argument: $file - path of the extract file to split.
# Returns the number of part files written (at least 1).
# Dies if a file cannot be opened or the last part cannot be written.
sub split_extract {
    my ($file) = @_;
    my $i=0;
    my $part = 1;
    my $split_when_possible = 0;
    my ($first,$dummy);
    my $partfname = sprintf("%s.part%04d",$file,0);
    open(PART,'>',$partfname) or die "Can't write $partfname";
    open(EXTRACT,'<',$file) or die "Can't read $file";
    while(<EXTRACT>) {
        if ($i>0 && $i % 10000000 == 0) {
            # size limit reached: remember the current first field and
            # split as soon as it changes
            $split_when_possible = 1;
            ($first,$dummy) = split(/ \|\|\| /);
        }
        elsif ($split_when_possible) {
            my ($f,$dummy) = split(/ \|\|\| /);
            if ($f ne $first) {
                close(PART) if $i;
                $partfname = sprintf("%s.part%04d",$file,$part);
                open(PART,'>',$partfname) or die "Can't write $partfname";
                $split_when_possible = 0;
                $part++;
            }
        }
        print PART $_;
        $i++;
    }
    close(EXTRACT);
    # The caller hands the part files to an external program right away, so
    # the last part has to be flushed and closed before returning.
    close(PART) or die "Can't write $partfname";
    return $part;
}

### (7) LEARN REORDERING MODEL

# Run get_reordering() for every factor pair in $___REORDERING_FACTORS when
# $REORDERING_LEXICAL is set; otherwise only report that the step is skipped.
sub get_reordering_factored {
    print STDERR "(7) learn reordering model @ ".`date`;
    if ($REORDERING_LEXICAL) {
        foreach my $f (split(/\+/,$___REORDERING_FACTORS)) {
            $factor = $f;
            ($factor_f,$factor_e) = split(/\-/,$factor);
            &get_reordering();
        }
    }
    else {
        print STDERR " ... skipping this step, reordering is not lexicalized ...\n";
    }
}

# (7) Learn the reordering tables for the current $factor from the sorted
# orientation extract.
# NOTE(review): this definition continues past the end of the visible chunk;
# the statements below are reproduced unchanged, with line breaks restored.
sub get_reordering {
    print STDERR "(7) [$factor] learn reordering model @ ".`date`;
    print STDERR "(7.1) [$factor] sorting extract.o @ ".`date`;
    if (-e "$___EXTRACT_FILE.$factor.o.gz") {
        safesystem("gunzip $___EXTRACT_FILE.$factor.o.gz") or die;
    }
    # print "LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor.o > $___EXTRACT_FILE.$factor.o.sorted\n";
    safesystem("LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor.o > $___EXTRACT_FILE.$factor.o.sorted") or die;
    safesystem("rm -f $___EXTRACT_FILE.$factor.o.gz") or die;
    safesystem("gzip $___EXTRACT_FILE.$factor.o") or die;

    my $smooth = $___REORDERING_SMOOTH;
    my @REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth);
    my @REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth);

    my (%SMOOTH_PREVIOUS,%SMOOTH_FOLLOWING);
    # a smoothing value with a trailing "u" is scaled by the relative
    # frequency of each orientation in the extract
    if ($smooth =~ /(.+)u$/) {
        $smooth = $1;
        my $smooth_total = 0;
        open(O,"$___EXTRACT_FILE.$factor.o.sorted")
            or die "Can't read $___EXTRACT_FILE.$factor.o.sorted";
        while(<O>) {
            chomp;
            my ($f,$e,$o) = split(/ \|\|\| /);
            my ($o_previous,$o_following) = split(/ /,$o);
            $SMOOTH_PREVIOUS{$o_previous}++;
            $SMOOTH_FOLLOWING{$o_following}++;
            $smooth_total++;
        }
        close(O);
        @REORDERING_SMOOTH_PREVIOUS = ($smooth*($SMOOTH_PREVIOUS{"mono"}+0.1)/$smooth_total,
                                       $smooth*($SMOOTH_PREVIOUS{"swap"}+0.1)/$smooth_total,
                                       $smooth*($SMOOTH_PREVIOUS{"other"}+0.1)/$smooth_total);
        @REORDERING_SMOOTH_FOLLOWING = ($smooth*($SMOOTH_FOLLOWING{"mono"}+0.1)/$smooth_total,
                                        $smooth*($SMOOTH_FOLLOWING{"swap"}+0.1)/$smooth_total,
                                        $smooth*($SMOOTH_FOLLOWING{"other"}+0.1)/$smooth_total);
        printf "$smooth*($SMOOTH_FOLLOWING{mono}+0.1)/$smooth_total, $smooth*($SMOOTH_FOLLOWING{swap}+0.1)/$smooth_total, $smooth*($SMOOTH_FOLLOWING{other}+0.1)/$smooth_total\n";
        printf "smoothed following to %f,%f,%f\n",@REORDERING_SMOOTH_FOLLOWING;
    }

    ($mono_previous_f,$swap_previous_f,$other_previous_f) = @REORDERING_SMOOTH_PREVIOUS;
    ($mono_previous_fe,$swap_previous_fe,$other_previous_fe) = @REORDERING_SMOOTH_PREVIOUS;
    ($mono_following_f,$swap_following_f,$other_following_f) = @REORDERING_SMOOTH_FOLLOWING;
    ($mono_following_fe,$swap_following_fe,$other_following_fe) = @REORDERING_SMOOTH_FOLLOWING;

    print STDERR "(7.2) building tables @ ".`date`;
    open(O,"$___EXTRACT_FILE.$factor.o.sorted")
        or die "Can't read $___EXTRACT_FILE.$factor.o.sorted";
    open(OF, "|gzip >$___MODEL_DIR/orientation-table.$factor.f.$___REORDERING_SMOOTH.gz")
        if defined($REORDERING_MODEL{"orientation-f"});
    open(OFE, "|gzip >$___MODEL_DIR/orientation-table.$factor.fe.$___REORDERING_SMOOTH.gz")
        if defined($REORDERING_MODEL{"orientation-fe"});
    open(OBF, "|gzip >$___MODEL_DIR/orientation-table.$factor.bi.f.$___REORDERING_SMOOTH.gz")
        if defined($REORDERING_MODEL{"orientation-bidirectional-f"});
    open(OBFE,"|gzip >$___MODEL_DIR/orientation-table.$factor.bi.fe.$___REORDERING_SMOOTH.gz")
        if defined($REORDERING_MODEL{"orientation-bidirectional-fe"});
    open(MF, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.f.$___REORDERING_SMOOTH.gz")
        if defined($REORDERING_MODEL{"monotonicity-f"});
    open(MFE, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.fe.$___REORDERING_SMOOTH.gz")
        if defined($REORDERING_MODEL{"monotonicity-fe"});
    open(MBF, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.bi.f.$___REORDERING_SMOOTH.gz")
        if defined($REORDERING_MODEL{"monotonicity-bidirectional-f"});
    open(MBFE,"|gzip >$___MODEL_DIR/monotonicity-table.$factor.bi.fe.$___REORDERING_SMOOTH.gz")
        if defined($REORDERING_MODEL{"monotonicity-bidirectional-fe"});

    my $first = 1;
    while(<O>) {
        chomp;
        my ($f,$e,$o) = split(/ \|\|\| /);
        my ($o_previous,$o_following) = split(/ /,$o);

        # store counts if new f,e
        if ($first) {
            $f_current = $f;
            $e_current = $e;
            $first = 0;
        }
        elsif ($f ne $f_current || $e ne $e_current) {

            if (defined($REORDERING_MODEL{"fe"})) {
                # compute probs, store them
                &store_reordering_fe();

                # reset counters
                ($mono_previous_fe,$swap_previous_fe,$other_previous_fe) = @REORDERING_SMOOTH_PREVIOUS;
                ($mono_following_fe,$swap_following_fe,$other_following_fe) = @REORDERING_SMOOTH_FOLLOWING;
            }

            # store counts if new f
            if ($f ne $f_current && defined($REORDERING_MODEL{"f"})) {

                # compute probs, store them
                &store_reordering_f();

                # reset counters
                ($mono_previous_f,$swap_previous_f,$other_previous_f) = @REORDERING_SMOOTH_PREVIOUS;
                ($mono_following_f,$swap_following_f,$other_following_f) = @REORDERING_SMOOTH_FOLLOWING;

            }
            $f_current = $f;
            $e_current = $e;
        }
        # update counts
        if ($o_previous eq 'mono') { $mono_previous_f++; $mono_previous_fe++; }
        elsif ($o_previous eq 'swap') { $swap_previous_f++; $swap_previous_fe++; }
        elsif ($o_previous eq 'other'){ $other_previous_f++; $other_previous_fe++; }
        else { print STDERR "buggy line (o_previous:$o_previous): $_\n"; }

        if ($o_following eq 'mono') { $mono_following_f++; $mono_following_fe++; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -