📄 train-factored-phrase-model.perl.svn-base
字号:
# update counts
    if ($o_previous eq 'mono') { $mono_previous_f++; $mono_previous_fe++; }
    elsif ($o_previous eq 'swap') { $swap_previous_f++; $swap_previous_fe++; }
    elsif ($o_previous eq 'other'){ $other_previous_f++; $other_previous_fe++; }
    else { print STDERR "buggy line (o_previous:$o_previous): $_\n"; }

    if ($o_following eq 'mono') { $mono_following_f++; $mono_following_fe++; }
    elsif ($o_following eq 'swap') { $swap_following_f++; $swap_following_fe++; }
    elsif ($o_following eq 'other'){ $other_following_f++; $other_following_fe++; }
    else { print STDERR "buggy line (o_following:$o_following): $_\n"; }
  }
  if (defined($REORDERING_MODEL{"f"})) { &store_reordering_f(); }
  if (defined($REORDERING_MODEL{"fe"})) { &store_reordering_fe(); }
  if (! $debug) { safesystem("rm $___EXTRACT_FILE.$factor.o.sorted") or die;}
}

# Write the reordering probabilities conditioned on the foreign phrase only.
#
# Reads the file-level counters $mono/$swap/$other_{previous,following}_f and
# $f_current, and prints one line to each of the handles OF, OBF, MF and MBF
# whose model key is present in %REORDERING_MODEL.  The handles are not opened
# here; the caller is expected to have them open already.
#
# The divisions are performed only for the enabled models, so a zero total for
# an unused direction does not abort the run.
sub store_reordering_f {
    my $total_previous_f  = $mono_previous_f+$swap_previous_f+$other_previous_f;
    my $total_following_f = $mono_following_f+$swap_following_f+$other_following_f;

    if (defined($REORDERING_MODEL{"msd-f"})) {
        printf OF ("%s ||| %.5f %.5f %.5f\n",
                   $f_current,
                   $mono_previous_f/$total_previous_f,
                   $swap_previous_f/$total_previous_f,
                   $other_previous_f/$total_previous_f);
    }
    if (defined($REORDERING_MODEL{"msd-bidirectional-f"})) {
        printf OBF ("%s ||| %.5f %.5f %.5f %.5f %.5f %.5f\n",
                    $f_current,
                    $mono_previous_f/$total_previous_f,
                    $swap_previous_f/$total_previous_f,
                    $other_previous_f/$total_previous_f,
                    $mono_following_f/$total_following_f,
                    $swap_following_f/$total_following_f,
                    $other_following_f/$total_following_f);
    }
    if (defined($REORDERING_MODEL{"monotonicity-f"})) {
        # swap and other are merged into a single "not monotone" outcome
        printf MF ("%s ||| %.5f %.5f\n",
                   $f_current,
                   $mono_previous_f/$total_previous_f,
                   ($swap_previous_f+$other_previous_f)/$total_previous_f);
    }
    if (defined($REORDERING_MODEL{"monotonicity-bidirectional-f"})) {
        printf MBF ("%s ||| %.5f %.5f %.5f %.5f\n",
                    $f_current,
                    $mono_previous_f/$total_previous_f,
                    ($swap_previous_f+$other_previous_f)/$total_previous_f,
                    $mono_following_f/$total_following_f,
                    ($swap_following_f+$other_following_f)/$total_following_f);
    }
}

# Write the reordering probabilities conditioned on the phrase pair.
#
# Same layout as store_reordering_f, but uses the *_fe counters, prints both
# $f_current and $e_current, and writes to the handles OFE, OBFE, MFE and MBFE
# (expected to be open already).
sub store_reordering_fe {
    my $total_previous_fe  = $mono_previous_fe+$swap_previous_fe+$other_previous_fe;
    my $total_following_fe = $mono_following_fe+$swap_following_fe+$other_following_fe;

    if (defined($REORDERING_MODEL{"msd-fe"})) {
        printf OFE ("%s ||| %s ||| %.5f %.5f %.5f\n",
                    $f_current,
                    $e_current,
                    $mono_previous_fe/$total_previous_fe,
                    $swap_previous_fe/$total_previous_fe,
                    $other_previous_fe/$total_previous_fe);
    }
    if (defined($REORDERING_MODEL{"msd-bidirectional-fe"})) {
        printf OBFE ("%s ||| %s ||| %.5f %.5f %.5f %.5f %.5f %.5f\n",
                     $f_current,
                     $e_current,
                     $mono_previous_fe/$total_previous_fe,
                     $swap_previous_fe/$total_previous_fe,
                     $other_previous_fe/$total_previous_fe,
                     $mono_following_fe/$total_following_fe,
                     $swap_following_fe/$total_following_fe,
                     $other_following_fe/$total_following_fe);
    }
    if (defined($REORDERING_MODEL{"monotonicity-fe"})) {
        # swap and other are merged into a single "not monotone" outcome
        printf MFE ("%s ||| %s ||| %.5f %.5f\n",
                    $f_current,
                    $e_current,
                    $mono_previous_fe/$total_previous_fe,
                    ($swap_previous_fe+$other_previous_fe)/$total_previous_fe);
    }
    if (defined($REORDERING_MODEL{"monotonicity-bidirectional-fe"})) {
        printf MBFE ("%s ||| %s ||| %.5f %.5f %.5f %.5f\n",
                     $f_current,
                     $e_current,
                     $mono_previous_fe/$total_previous_fe,
                     ($swap_previous_fe+$other_previous_fe)/$total_previous_fe,
                     $mono_following_fe/$total_following_fe,
                     ($swap_following_fe+$other_following_fe)/$total_following_fe);
    }
}

### (8) LEARN GENERATION MODEL

my $factor_e_source;

# Step (8) driver: build one generation table per "+"-separated entry of
# $___GENERATION_FACTORS.
#
# For every entry "SRC-TGT" this sets the file-level $factor, $factor_e_source
# and $factor_e, picks the next user-specified table file and type (if any are
# left in @_GENERATION_TABLE / @_GENERATION_TYPE; the type defaults to
# "double") and calls get_generation.
sub get_generation_factored {
    print STDERR "(8) learn generation model @ ".`date`;
    if (defined $___GENERATION_FACTORS) {
        my @SPECIFIED_TABLE = @_GENERATION_TABLE;
        my @TYPE = @_GENERATION_TYPE;
        foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {
            $factor = $f;
            ($factor_e_source,$factor_e) = split(/\-/,$factor);

            # Declare first, assign conditionally: "my $x = ... if COND" is
            # undefined behaviour and can leak the previous iteration's value.
            my $file;
            $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);

            my $type = "double";
            $type = shift @TYPE if scalar @TYPE;

            get_generation($file,$type);
        }
    }
    else {
        print STDERR " no generation model requested, skipping step\n";
    }
}

# Build a single generation table from the target side of the corpus.
#
# Parameters:
#   $file - output path; defaults to "$___MODEL_DIR/generation.$factor"
#   $type - 'single' writes only p(target|source); any other value also
#           writes p(source|target)
#
# Reads "$___CORPUS.$___E", counts co-occurrences of the source factor
# combination ($factor_e_source) and target factor combination ($factor_e) of
# every token, writes the table and gzips it.  Dies if the corpus cannot be
# read, the table cannot be written, or a helper command fails.
sub get_generation {
    print STDERR "(8) [$factor] generate generation table @ ".`date`;
    my ($file,$type) = @_;
    $file = "$___MODEL_DIR/generation.$factor" unless $file;

    # The factor index lists do not change per token, so split them once.
    my @source_factors = split(/,/,$factor_e_source);
    my @target_factors = split(/,/,$factor_e);

    my (%GENERATION,%GENERATION_TOTAL_SOURCE,%GENERATION_TOTAL_TARGET);
    my $corpus_file = $___CORPUS.".".$___E;
    open(E, '<', $corpus_file) or die "Can't read ".$___CORPUS.".".$___E;
    $alignment_id=0;
    while (my $line = <E>) {
        chomp $line;
        foreach my $word (split ' ', $line) {
            my @FACTOR = split(/\|/, $word);

            # Factor values are joined with "|" in the order they were listed.
            my $source = join("|", map { $FACTOR[$_] } @source_factors);
            my $target = join("|", map { $FACTOR[$_] } @target_factors);

            $GENERATION{$source}{$target}++;
            $GENERATION_TOTAL_SOURCE{$source}++;
            $GENERATION_TOTAL_TARGET{$target}++;
        }
    }
    close(E);

    safesystem("mkdir -p $___MODEL_DIR") or die;
    open(GEN, '>', $file) or die "Can't write $file";
    foreach my $source (keys %GENERATION) {
        foreach my $target (keys %{$GENERATION{$source}}) {
            printf GEN ("%s %s %.7f ",$source,$target,
                        $GENERATION{$source}{$target}/$GENERATION_TOTAL_SOURCE{$source});
            printf GEN (" %.7f",
                        $GENERATION{$source}{$target}/$GENERATION_TOTAL_TARGET{$target})
                unless $type eq 'single';
            print GEN "\n";
        }
    }
    close(GEN);
    safesystem("rm -f $file.gz") or die;
    safesystem("gzip $file") or die;
}

### (9) CREATE CONFIGURATION FILE

# Step (9): write the decoder configuration to $___CONFIG.
#
# Makes $___MODEL_DIR, $___VCB_E and $___VCB_F absolute, then emits the
# sections [input-factors], [mapping], [ttable-file], [generation-file]
# (optional), [lmodel-file], [ttable-limit], [distortion-file] (unless
# $___REORDERING is "distance"), [weight-d], [weight-l], [weight-t],
# [weight-generation] (optional), [weight-w], [distortion-limit] and, for a
# non-default delimiter, [factor-delimiter].
#
# Dies if the file cannot be written or no translation factors are defined;
# exits with status 1 if fewer translation/generation tables are defined than
# the decoding steps refer to.
sub create_ini {
    print STDERR "(9) create moses.ini @ ".`date`;

    full_path(\$___MODEL_DIR);
    full_path(\$___VCB_E);
    full_path(\$___VCB_F);
    `mkdir -p $___MODEL_DIR`;
    open(INI, '>', $___CONFIG) or die("Can't write $___CONFIG");
    print INI "############################ MOSES CONFIG FILE ############################\n";

    if (defined $___TRANSLATION_FACTORS) {
        print INI "# input factors\n";
        print INI "[input-factors]\n";
        my $INPUT_FACTOR_MAX = 0;
        foreach my $table (split /\+/, $___TRANSLATION_FACTORS) {
            my ($factor_list) = split /-+/, $table;
            foreach my $input_factor (split(/,/,$factor_list)) {
                $INPUT_FACTOR_MAX = $input_factor if $input_factor>$INPUT_FACTOR_MAX;
            }
        }
        # use specified, if exists
        $INPUT_FACTOR_MAX = $_INPUT_FACTOR_MAX if $_INPUT_FACTOR_MAX;
        for (my $c = 0; $c <= $INPUT_FACTOR_MAX; $c++) { print INI "$c\n"; }
    }
    else {
        die "No translation steps defined, cannot prepare [input-factors] section\n";
    }

    # %stepsused maps step type ("T"/"G") to highest referenced id + 1.
    my %stepsused;
    print INI "\n# mapping steps\n[mapping]\n";
    my $steplist = 0;
    foreach my $list (split(/:/,$___DECODING_STEPS)) {
        foreach my $step (split(/,/,$list)) {
            # "t0" -> "T 0", "g1" -> "G 1"
            $step =~ s/t/T /g;
            $step =~ s/g/G /g;
            my ($type, $num) = split /\s+/, $step;
            $stepsused{$type} = $num+1
                if !defined $stepsused{$type} || $stepsused{$type} < $num+1;
            print INI "$steplist $step\n";
        }
        $steplist++;
    }

    print INI "\n# translation tables: source-factors, target-factors, number of scores, file \n[ttable-file]\n";
    my $num_of_ttables = 0;
    my @SPECIFIED_TABLE = @_PHRASE_TABLE;
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
        $num_of_ttables++;
        my $ff = $f;
        $ff =~ s/\-/ /;
        my $file = "$___MODEL_DIR/phrase-table.$f.gz";
        $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
        print INI "$ff 5 $file\n";
    }
    if ($num_of_ttables != $stepsused{"T"}) {
        print STDERR "WARNING: Your [mapping-steps] require translation steps up to id $stepsused{T} but you defined translation steps 0..$num_of_ttables\n";
        # fatal to define less
        exit 1 if $num_of_ttables < $stepsused{"T"};
    }

    if (defined $___GENERATION_FACTORS) {
        my @TYPE = @_GENERATION_TYPE;
        print INI "\n# generation models: source-factors, target-factors, number-of-weights, filename\n";
        print INI "[generation-file]\n";
        my $cnt = 0;
        my @SPECIFIED_TABLE = @_GENERATION_TABLE;
        foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {
            # @TYPE may be shorter than the factor list; a missing entry
            # means the default two-weight table.
            my $generation_type = shift @TYPE;
            my $weights_per_generation_model = 2;
            $weights_per_generation_model = 1
                if defined $generation_type && $generation_type eq 'single';
            $cnt++;
            my $ff = $f;
            $ff =~ s/\-/ /;
            my $file = "$___MODEL_DIR/generation.$f";
            $file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
            print INI "$ff $weights_per_generation_model $file\n";
        }
        if ($cnt != $stepsused{"G"}) {
            print STDERR "WARNING: Your [mapping-steps] require generation steps up to id $stepsused{G} but you defined generation steps 0..$cnt\n";
            # fatal to define less
            exit 1 if $cnt < $stepsused{"G"};
        }
    }
    else {
        print INI "\n# no generation models, no generation-file section\n";
    }

    print INI "\n# language models: type(srilm/irstlm), factors, order, file\n[lmodel-file]\n";
    foreach my $lm (@___LM) {
        my ($f, $o, $fn, $type) = @{$lm};
        print INI "$type $f $o $fn\n";
    }

    print INI "\n\n\# limit on how many phrase translations e for each phrase f are loaded\n# 0 = all elements loaded\n[ttable-limit]\n20\n";
    foreach (1..$num_of_ttables) {
        print INI "0\n";
    }

    my $weight_d_count = 1;
    if ($___REORDERING ne "distance") {
        my $file = "# distortion (reordering) files\n\[distortion-file]\n";
        my $factor_i = 0;
        my @SPECIFIED_TABLE = @_REORDERING_TABLE;
        foreach my $factor (split(/\+/,$___REORDERING_FACTORS)) {
            # Sorted so that user-specified tables are paired with models in
            # a reproducible order (hash key order is not stable).
            foreach my $type (sort keys %REORDERING_MODEL) {
                next if $type eq "fe" || $type eq "f";
                next if $type eq "distance" && $factor_i>0;
                if ($type eq "distance") {
                    $weight_d_count++;
                    next;
                }

                # msd models have three outcomes, monotonicity models one
                # weight; bidirectional models need both directions.
                my $w = ($type =~ /msd/) ? 3 : 1;
                $w *= 2 if $type =~ /bi/;
                $weight_d_count += $w;

                my $table_file = "$___MODEL_DIR/reordering-table.$type.$___REORDERING_SMOOTH.$factor.gz";
                $table_file = shift @SPECIFIED_TABLE if scalar(@SPECIFIED_TABLE);
                $file .= "$factor $type $w $table_file\n";
            }
            $factor_i++;
        }
        print INI $file."\n";
    }

    print INI "# distortion (reordering) weight\n[weight-d]\n";
    # The same initial weight is written for every distortion feature.
    my $weight_d = 0.6/(scalar keys %REORDERING_MODEL);
    for (my $i=0; $i<$weight_d_count; $i++) {
        print INI $weight_d."\n";
    }

    print INI "\n# language model weights\n[weight-l]\n";
    my $lmweighttotal = 0.5;
    foreach (1..scalar @___LM) {
        printf INI "%.4f\n", $lmweighttotal / scalar @___LM;
    }

    print INI "\n\n# translation model weights\n[weight-t]\n";
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
        # five scores per phrase table, matching the "5" in [ttable-file]
        print INI "0.2\n0.2\n0.2\n0.2\n0.2\n";
    }

    if (defined $___GENERATION_FACTORS) {
        print INI "\n# generation model weights\n";
        print INI "[weight-generation]\n";
        my @TYPE = @_GENERATION_TYPE;
        foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {
            my $generation_type = shift @TYPE;
            print INI "0.3\n";
            print INI "0\n"
                unless defined $generation_type && $generation_type eq 'single';
        }
    }
    else {
        print INI "\n# no generation models, no weight-generation section\n";
    }

    print INI "\n# word penalty\n[weight-w]\n-1\n\n[distortion-limit]\n6\n";

    # only set the factor delimiter if it is non-standard
    unless ($___FACTOR_DELIMITER eq '|') {
        print INI "\n# delimiter between factors in input\n[factor-delimiter]\n$___FACTOR_DELIMITER\n\n"
    }

    close(INI);
}

# Turn the path referenced by $PATH into a normalised absolute path, in place.
#
# Parameter:
#   $PATH - reference to a scalar holding the path
#
# A "/nfsmnt" component is removed first.  Paths that are already absolute are
# otherwise left alone.  Relative paths are prefixed with the working directory
# (from `pawd` when available, else `pwd`), then "/./", duplicate slashes,
# "dir/.." pairs and trailing slashes are collapsed.
sub full_path {
    my ($PATH) = @_;
    $$PATH =~ s{/nfsmnt}{};
    return if $$PATH =~ m{^/};

    my $dir = `pawd 2>/dev/null`;
    if (!$dir) { $dir = `pwd`; }
    # Strip the mount prefix from the directory string; applying the
    # substitution to the reference itself can never do anything useful.
    $dir =~ s{/nfsmnt}{};
    chomp $dir;

    $$PATH = $dir."/".$$PATH;
    $$PATH =~ s/[\r\n]//g;
    $$PATH =~ s{/\./}{/}g;
    $$PATH =~ s{/+}{/}g;

    # Resolve "dir/../" pairs; bounded so a malformed path cannot loop forever.
    my $sanity = 0;
    while ($$PATH =~ m{/\.\./} && $sanity++<10) {
        $$PATH =~ s{/+}{/}g;
        $$PATH =~ s{/[^/]+/\.\./}{/}g;
    }
    $$PATH =~ s{/[^/]+/\.\.$}{};
    $$PATH =~ s{/+$}{};
}

# Run a command via system() and report the outcome on STDERR.
#
# Parameters: the command, in any form accepted by system().
# Returns:    true if the command exited with status 0, false otherwise.
# Exits the script with status 1 if the command could not be started or was
# killed by a signal.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    if ($? == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }
    elsif ($? & 127) {
        # The command is passed as an argument, not spliced into the format,
        # so a "%" inside it is printed literally.
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    else {
        my $exitcode = $? >> 8;
        print STDERR "Exit code: $exitcode\n" if $exitcode;
        return ! $exitcode;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -