📄 train-factored-phrase-model.perl.svn-base
字号:
my $out = $FACTOR[$outfactor];
die "Couldn't find factor $outfactor in token \"$_\" in $full LINE $nr" if !defined $out;
print OUT $out;
            }
            # for(my $factor=0;$factor<=$#FACTOR;$factor++) {
            #   next unless defined($INCLUDE{$factor});
            #   print OUT "|" unless $first_factor;
            #   $first_factor = 0;
            #   print OUT $FACTOR[$factor];
            # }
        }
        print OUT "\n";
    }
    print STDERR "\n";
    close(OUT);
    close(IN);
    `rm -f $reduced.lock`;
}

# Run mkcls on $corpus, writing the word classes to $classes.
# If $classes already exists it is reused and mkcls is not run.
sub make_classes {
    my ($corpus,$classes) = @_;
    my $cmd = "$MKCLS -c50 -n2 -p$corpus -V$classes opt";
    print STDERR "(1.1) running mkcls @ ".`date`."$cmd\n";
    if (-e $classes) {
        print STDERR " $classes already in place, reusing\n";
        return;
    }
    safesystem("$cmd"); # ignoring the wrong exit code from mkcls (not dying)
}

# Count the whitespace-separated tokens of $corpus and write the vocabulary
# file $vcb, one "id<TAB>word<TAB>count" line per word.  Id 1 is reserved for
# UNK; real words get ids from 2 upwards in order of descending count (ties:
# descending word).  Returns a reference to a hash mapping word -> id.
# Does nothing (and returns nothing) unless $___LEXICAL_WEIGHTING is set.
sub get_vocabulary {
    return unless $___LEXICAL_WEIGHTING;
    my($corpus,$vcb) = @_;
    print STDERR "(1.2) creating vcb file $vcb @ ".`date`;

    my %WORD;
    open(TXT,$corpus) or die "Can't read $corpus";
    while(<TXT>) {
        # chomp, not chop: a final line without a trailing newline must not
        # lose the last character of its last token.
        chomp;
        foreach (split) { $WORD{$_}++; }
    }
    close(TXT);

    my %VCB;
    open(VCB,">$vcb") or die "Can't write $vcb";
    print VCB "1\tUNK\t0\n";
    my $id=2;
    # Numeric comparison on the count keeps the ordering correct for counts
    # of any magnitude; the tie-break mirrors a reversed string sort.
    foreach my $word (sort { $WORD{$b} <=> $WORD{$a} || $b cmp $a } keys %WORD) {
        printf VCB "%d\t%s\t%d\n",$id,$word,$WORD{$word};
        $VCB{$word} = $id;
        $id++;
    }
    close(VCB);

    return \%VCB;
}

# Write the numberized corpus $out from the parallel files $in_de and $in_en.
# Each sentence pair produces three lines: "1", the ids of the $in_en line
# (looked up in $VCB_EN) and the ids of the $in_de line (looked up in
# $VCB_DE).  An existing $out is reused.
sub numberize_txt_file {
    my ($VCB_DE,$in_de,$VCB_EN,$in_en,$out) = @_;
    print STDERR "(1.3) numberizing corpus $out @ ".`date`;
    if (-e $out) {
        print STDERR " $out already in place, reusing\n";
        return;
    }
    open(IN_DE,$in_de) or die "Can't read $in_de";
    open(IN_EN,$in_en) or die "Can't read $in_en";
    open(OUT,">$out") or die "Can't write $out";
    while(my $de = <IN_DE>) {
        my $en = <IN_EN>;
        print OUT "1\n";
        print OUT &numberize_line($VCB_EN,$en);
        print OUT &numberize_line($VCB_DE,$de);
    }
    close(IN_DE);
    close(IN_EN);
    close(OUT);
}

# Map the space-separated words of $txt to their ids in the hash referenced by
# $VCB and return them joined by single spaces, followed by a newline.
# Empty tokens are skipped; a word missing from $VCB is reported on STDERR.
sub numberize_line {
    my ($VCB,$txt) = @_;
    chomp($txt);
    my $out = "";
    my $not_first = 0;
    foreach (split(/ /,$txt)) {
        next if $_ eq '';
        $out .= " " if $not_first++;
        print STDERR "Unknown word '$_'\n" unless defined($$VCB{$_});
        $out .= $$VCB{$_};
    }
    return $out."\n";
}

### (2) RUN GIZA

# Run GIZA in one or both directions, depending on $___DIRECTION.  When both
# directions are requested and $___NOFORK is not set, one direction runs in a
# forked child while the parent runs the other.  Delegates to
# run_giza_on_parts when $___PARTS > 1.
sub run_giza {
    return &run_giza_on_parts if $___PARTS>1;

    print STDERR "(2) running giza @ ".`date`;
    if ($___DIRECTION == 1 || $___DIRECTION == 2 || $___NOFORK) {
        &run_single_giza($___GIZA_F2E,$___E,$___F,
                         $___VCB_E,$___VCB_F,
                         $___CORPUS_DIR."/$___F-$___E-int-train.snt")
            unless $___DIRECTION == 2;
        &run_single_giza($___GIZA_E2F,$___F,$___E,
                         $___VCB_F,$___VCB_E,
                         $___CORPUS_DIR."/$___E-$___F-int-train.snt")
            unless $___DIRECTION == 1;
    } else {
        my $pid = fork();
        if (!defined $pid) {
            die "Failed to fork";
        }
        if (!$pid) { # i'm the child
            &run_single_giza($___GIZA_F2E,$___E,$___F,
                             $___VCB_E,$___VCB_F,
                             $___CORPUS_DIR."/$___F-$___E-int-train.snt");
            exit 0; # child exits
        } else { #i'm the parent
            &run_single_giza($___GIZA_E2F,$___F,$___E,
                             $___VCB_F,$___VCB_E,
                             $___CORPUS_DIR."/$___E-$___F-int-train.snt");
        }
        print "Waiting for second GIZA process...\n";
        # A child that died exits non-zero; do not carry on as if it succeeded.
        my $reaped = waitpid($pid, 0);
        die "Forked GIZA process (pid $pid) failed, status $?"
            if $reaped == $pid && $? != 0;
    }
}

# Same as run_giza, but each direction goes through run_single_giza_on_parts.
# The line count of the $___F-$___E numberized corpus is passed along as $size.
sub run_giza_on_parts {
    print STDERR "(2) running giza on $___PARTS cooc parts @ ".`date`;
    my $size = `cat $___CORPUS_DIR/$___F-$___E-int-train.snt | wc -l`;
    die "Failed to get number of lines in $___CORPUS_DIR/$___F-$___E-int-train.snt"
        if $size == 0;

    if ($___DIRECTION == 1 || $___DIRECTION == 2 || $___NOFORK) {
        &run_single_giza_on_parts($___GIZA_F2E,$___E,$___F,
                                  $___VCB_E,$___VCB_F,
                                  $___CORPUS_DIR."/$___F-$___E-int-train.snt",$size)
            unless $___DIRECTION == 2;
        &run_single_giza_on_parts($___GIZA_E2F,$___F,$___E,
                                  $___VCB_F,$___VCB_E,
                                  $___CORPUS_DIR."/$___E-$___F-int-train.snt",$size)
            unless $___DIRECTION == 1;
    } else {
        my $pid = fork();
        if (!defined $pid) {
            die "Failed to fork";
        }
        if (!$pid) { # i'm the child
            &run_single_giza_on_parts($___GIZA_F2E,$___E,$___F,
                                      $___VCB_E,$___VCB_F,
                                      $___CORPUS_DIR."/$___F-$___E-int-train.snt",$size);
            exit 0; # child exits
        } else { #i'm the parent
            &run_single_giza_on_parts($___GIZA_E2F,$___F,$___E,
                                      $___VCB_F,$___VCB_E,
                                      $___CORPUS_DIR."/$___E-$___F-int-train.snt",$size);
        }
        print "Waiting for second GIZA process...\n";
        # A child that died exits non-zero; do not carry on as if it succeeded.
        my $reaped = waitpid($pid, 0);
        die "Forked GIZA process (pid $pid) failed, status $?"
            if $reaped == $pid && $? != 0;
    }
}

# Split $train (which has $size lines) into up to $___PARTS pieces under
# $___CORPUS_DIR/partN, run snt2cooc on each piece, merge the per-part .cooc
# files into $dir/$f-$e.cooc, and finally run GIZA on the full $train.
sub run_single_giza_on_parts {
    my($dir,$e,$f,$vcb_e,$vcb_f,$train,$size) = @_;

    my $part = 0;

    # break up training data into parts
    open(SNT,$train) or die "Can't read $train";
    {
        my $i=0;
        while(<SNT>) {
            $i++;
            # A new part may only begin on the first line of a three-line
            # sentence-pair record (see numberize_txt_file).
            if ($i%3==1 && $part < ($___PARTS*$i)/$size && $part<$___PARTS) {
                close(PART) if $part;
                $part++;
                safesystem("mkdir -p $___CORPUS_DIR/part$part") or die;
                open(PART,">$___CORPUS_DIR/part$part/$f-$e-int-train.snt")
                    or die "Can't write $___CORPUS_DIR/part$part/$f-$e-int-train.snt";
            }
            print PART $_;
        }
    }
    close(PART);
    close(SNT);

    # run snt2cooc in parts
    for(my $i=1;$i<=$___PARTS;$i++) {
        &run_single_snt2cooc("$dir/part$i",$e,$f,$vcb_e,$vcb_f,"$___CORPUS_DIR/part$i/$f-$e-int-train.snt");
    }

    # merge parts
    open(COOC,">$dir/$f-$e.cooc") or die "Can't write $dir/$f-$e.cooc";
    my(@PF,@CURRENT);
    for(my $i=1;$i<=$___PARTS;$i++) {
        open($PF[$i],"$dir/part$i/$f-$e.cooc")or die "Can't read $dir/part$i/$f-$e.cooc";
        my $pf = $PF[$i];
        $CURRENT[$i] = <$pf>;
        chomp($CURRENT[$i]) if $CURRENT[$i];
    }

    while(1) {
        # find the smallest (w1,w2) pair among the current lines of all parts;
        # 1e20 is the "nothing left" sentinel
        my ($min1,$min2) = (1e20,1e20);
        for(my $i=1;$i<=$___PARTS;$i++) {
            next unless $CURRENT[$i];
            my ($w1,$w2) = split(/ /,$CURRENT[$i]);
            if ($w1 < $min1 || ($w1 == $min1 && $w2 < $min2)) {
                $min1 = $w1;
                $min2 = $w2;
            }
        }
        last if $min1 == 1e20;
        print COOC "$min1 $min2\n";
        # advance every part whose current line is the pair just written,
        # so a pair present in several parts is emitted only once
        for(my $i=1;$i<=$___PARTS;$i++) {
            next unless $CURRENT[$i];
            my ($w1,$w2) = split(/ /,$CURRENT[$i]);
            if ($w1 == $min1 && $w2 == $min2) {
                my $pf = $PF[$i];
                $CURRENT[$i] = <$pf>;
                chomp($CURRENT[$i]) if $CURRENT[$i];
            }
        }
    }
    for(my $i=1;$i<=$___PARTS;$i++) {
        close($PF[$i]);
    }
    close(COOC);

    # run giza
    &run_single_giza($dir,$e,$f,$vcb_e,$vcb_f,$train);
}

# Run GIZA for one direction, writing output under the prefix $dir/$f-$e.
# Default options are adjusted for $_HMM_ALIGN and then overridden by the
# "name=value" entries of $___GIZA_OPTION (separated by spaces or commas).
# When $___PARTS == 1 the cooccurrence file is created here first.  An
# existing gzipped result is reused; with $___ONLY_PRINT_GIZA the command is
# only printed.
sub run_single_giza {
    my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;

    my %GizaDefaultOptions =
        (p0 => .999 ,
         m1 => 5 ,
         m2 => 0 ,
         m3 => 3 ,
         m4 => 3 ,
         nodumps => 1 ,
         onlyaldumps => 1 ,
         nsmooth => 4 ,
         model1dumpfrequency => 1,
         model4smoothfactor => 0.4 ,
         t => $vcb_f,
         s => $vcb_e,
         c => $train,
         CoocurrenceFile => "$dir/$f-$e.cooc",
         o => "$dir/$f-$e");

    if ($_HMM_ALIGN) {
        $GizaDefaultOptions{m3} = 0;
        $GizaDefaultOptions{m4} = 0;
        $GizaDefaultOptions{hmmiterations} = 5;
        $GizaDefaultOptions{hmmdumpfrequency} = 5;
        $GizaDefaultOptions{nodumps} = 0;
    }

    if ($___GIZA_OPTION) {
        foreach (split(/[ ,]+/,$___GIZA_OPTION)) {
            my ($option,$value) = split(/=/,$_,2);
            $GizaDefaultOptions{$option} = $value;
        }
    }

    # options are emitted in sorted key order
    my $GizaOptions = "";
    foreach my $option (sort keys %GizaDefaultOptions){
        my $value = $GizaDefaultOptions{$option} ;
        $GizaOptions .= " -$option $value" ;
    }

    &run_single_snt2cooc($dir,$e,$f,$vcb_e,$vcb_f,$train) if $___PARTS == 1;

    print STDERR "(2.1b) running giza $f-$e @ ".`date`."$GIZA $GizaOptions\n";

    if (-e "$dir/$f-$e.$___GIZA_EXTENSION.gz") {
        print " $dir/$f-$e.$___GIZA_EXTENSION.gz seems finished, reusing.\n";
        return;
    }
    print "$GIZA $GizaOptions\n";
    return if $___ONLY_PRINT_GIZA;
    # The exit code is not checked here; success is judged by the output file.
    safesystem("$GIZA $GizaOptions");
    die "Giza did not produce the output file $dir/$f-$e.$___GIZA_EXTENSION. Is your corpus clean (reasonably-sized sentences)?"
        if ! -e "$dir/$f-$e.$___GIZA_EXTENSION";
    safesystem("rm -f $dir/$f-$e.$___GIZA_EXTENSION.gz") or die;
    safesystem("gzip $dir/$f-$e.$___GIZA_EXTENSION") or die;
}

# Create $dir if needed and run snt2cooc on $train with the two vocabulary
# files, redirecting its output to $dir/$f-$e.cooc.
sub run_single_snt2cooc {
    my($dir,$e,$f,$vcb_e,$vcb_f,$train) = @_;
    print STDERR "(2.1a) running snt2cooc $f-$e @ ".`date`."\n";
    safesystem("mkdir -p $dir") or die;
    print "$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc\n";
    safesystem("$SNT2COOC $vcb_e $vcb_f $train > $dir/$f-$e.cooc") or die;
}

### (3) CREATE WORD ALIGNMENT FROM GIZA ALIGNMENTS

# Combine the two GIZA alignment files (either .bz2 or .gz, .bz2 preferred)
# by piping giza2bal into symal, with the symal switches derived from
# $___ALIGNMENT.  The result is written to $___ALIGNMENT_FILE.$___ALIGNMENT.
sub word_align {

    print STDERR "(3) generate word alignment @ ".`date`;
    print STDERR "Combining forward and inverted alignment from files:\n";
    print STDERR " $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.{bz2,gz}\n";
    print STDERR " $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.{bz2,gz}\n";

    ### build arguments for giza2bal.pl
    my($__ALIGNMENT_CMD,$__ALIGNMENT_INV_CMD);

    if (-e "$___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.bz2"){
        $__ALIGNMENT_CMD="\"$BZCAT $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.bz2\"";
    } elsif (-e "$___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.gz") {
        $__ALIGNMENT_CMD="\"$ZCAT $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.gz\"";
    } else {
        die "Can't read $___GIZA_F2E/$___F-$___E.$___GIZA_EXTENSION.{bz2,gz}\n";
    }

    if ( -e "$___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.bz2"){
        $__ALIGNMENT_INV_CMD="\"$BZCAT $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.bz2\"";
    }elsif (-e "$___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.gz"){
        $__ALIGNMENT_INV_CMD="\"$ZCAT $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.gz\"";
    }else{
        die "Can't read $___GIZA_E2F/$___E-$___F.$___GIZA_EXTENSION.{bz2,gz}\n\n";
    }

    safesystem("mkdir -p $___MODEL_DIR") or die;

    #build arguments for symal
    # later matches override earlier ones, so the order of these lines matters
    my($__symal_a)="";
    $__symal_a="union" if $___ALIGNMENT eq 'union';
    $__symal_a="intersect" if $___ALIGNMENT=~ /intersect/;
    $__symal_a="grow" if $___ALIGNMENT=~ /grow/;
    $__symal_a="srctotgt" if $___ALIGNMENT=~ /srctotgt/;
    $__symal_a="tgttosrc" if $___ALIGNMENT=~ /tgttosrc/;

    my($__symal_d,$__symal_f,$__symal_b);
    ($__symal_d,$__symal_f,$__symal_b)=("no","no","no");

    $__symal_d="yes" if $___ALIGNMENT=~ /diag/;
    $__symal_f="yes" if $___ALIGNMENT=~ /final/;
    $__symal_b="yes" if $___ALIGNMENT=~ /final-and/;

    safesystem("$GIZA2BAL -d $__ALIGNMENT_INV_CMD -i $__ALIGNMENT_CMD |".
               "$SYMAL -alignment=\"$__symal_a\" -diagonal=\"$__symal_d\" ".
               "-final=\"$__symal_f\" -both=\"$__symal_b\" > ".
               "$___ALIGNMENT_FILE.$___ALIGNMENT")
        ||
        die "Can't generate symmetrized alignment file\n";
}

### (4) BUILDING LEXICAL TRANSLATION TABLE

# For every "f-e" factor pair in $___TRANSLATION_FACTORS (pairs separated by
# "+"), reduce both sides of the corpus to the requested factors and build the
# lexical table.  The current pair is handed to get_lexical through the
# file-level variables $factor, $factor_f and $factor_e.
sub get_lexical_factored {
    print STDERR "(4) generate lexical translation table $___TRANSLATION_FACTORS @ ".`date`;
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
        $factor = $f;
        ($factor_f,$factor_e) = split(/\-/,$factor);
        &reduce_factors($___CORPUS.".".$___F.$___CORPUS_COMPRESSION,
                        $___ALIGNMENT_FILE.".".$factor_f.".".$___F,
                        $factor_f);
        &reduce_factors($___CORPUS.".".$___E.$___CORPUS_COMPRESSION,
                        $___ALIGNMENT_FILE.".".$factor_e.".".$___E,
                        $factor_e);
        &get_lexical();
    }
}

sub get_lexical {
    print STDERR "(4) [$factor] generate lexical translation table @ ".`date`;
    my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);

    if (-e "$___LEXICAL_FILE.$factor.f2n" && -e "$___LEXICAL_FILE.$factor.n2f") {
        print STDERR " reusing: $___LEXICAL_FILE.$factor.f2n and $___LEXICAL_FILE.$factor.n2f\n";
        return;
    }

    &open_alignment();
    while(my $e = <E>) {
        if (($alignment_id++ % 1000) == 0) { print STDERR "!"; }
        chomp($e);
        my @ENGLISH = split(/ /,$e);
        my $f = <F>; chomp($f);
        my @FOREIGN = split(/ /,$f);
        my $a = <A>; chomp($a);

        my (%FOREIGN_ALIGNED,%ENGLISH_ALIGNED);
        foreach (split(/ /,$a)) {
            my ($fi,$ei) = split(/\-/);
            if ($fi >= scalar(@FOREIGN) || $ei >= scalar(@ENGLISH)) {
                print STDERR "alignment point ($fi,$ei) out of range (0-$#FOREIGN,0-$#ENGLISH) in line $alignment_id, ignoring\n";
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -