📄 mert-moses.pl.svn-base
字号:
}

# ---------------------------------------------------------------------------
# Main MERT driver section.
#
# Inside the working directory, each pass of the loop below:
#   1. writes a per-run moses.ini via create_config(),
#   2. runs the decoder to produce an n-best list (unless skipped),
#   3. scores the accumulated n-best lists into feats.opt / cands.opt,
#   4. stops if the accumulated number of candidates did not grow,
#   5. writes init.opt / names.txt and runs cmert,
#   6. backs up the per-run files, parses "Best point" from cmert.log,
#   7. stores the new lambdas and stops if no weight moved by at least
#      $minimum_required_change_in_weights.
# finished_step.txt records the last run number so that a later invocation
# with $continue set can resume.
# ---------------------------------------------------------------------------

# as weights are normalized in the next steps (by cmert)
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;

my @order_of_lambdas_from_decoder = ();
# this will store the labels of scores coming out of the decoder (and hence the order of lambdas coming out of mert)
# we will use the array to interpret the lambdas
# the array gets filled with labels only after first nbestlist was generated

# store current directory and create the working directory (if needed)
my $cwd = `pawd 2>/dev/null`;
if (!$cwd) { $cwd = `pwd`; }
chomp($cwd);

safesystem("mkdir -p $___WORKING_DIR") or die "Can't mkdir $___WORKING_DIR";

{
  # open local scope

  # chdir to the working directory
  chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR";

  # set start run
  my $start_run = 1;

  if ($continue) {
    # need to load last best values
    print STDERR "Trying to continue an interrupted optimization.\n";
    open(my $step_fh, '<', "finished_step.txt")
      or die "Failed to find the step number, failed to read finished_step.txt";
    my $step = <$step_fh>;
    chomp $step;
    $step++;
    close $step_fh;

    if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz") {
      # allow stepping one extra iteration back
      $step--;
      die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!"
        if ! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz";
    }

    $start_run = $step + 1;

    print STDERR "Reading last cached lambda values (result from step $step)\n";
    @order_of_lambdas_from_decoder = get_order_of_scores_from_nbestlist("gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");

    open(my $weights_fh, '<', "weights.txt") or die "Can't read weights.txt";
    my $newweights = <$weights_fh>;
    chomp $newweights;
    close $weights_fh;
    my @newweights = split /\s+/, $newweights;

    #dump_triples(\%used_triples);
    store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);
    #dump_triples(\%used_triples);
  }

  if ($___FILTER_PHRASE_TABLE) {
    # filter the phrase tables with respect to input, use --decoder-flags
    print "filtering the phrase tables... ".`date`;
    my $cmd = "$filtercmd ./filtered $___CONFIG $___DEV_F";
    if (defined $___JOBS) {
      safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=filterphrases.out -stderr=filterphrases.err")
        or die "Failed to submit filtering of tables to the queue (via $qsubwrapper)";
    }
    else {
      safesystem($cmd) or die "Failed to filter the tables.";
    }

    # make a backup copy of startup ini file
    $___CONFIG_BAK = $___CONFIG;
    # the decoder should now use the filtered model
    $___CONFIG = "filtered/moses.ini";
  }
  else {
    # do not filter phrase tables (useful if binary phrase tables are available)
    # use the original configuration file
    $___CONFIG_BAK = $___CONFIG;
  }

  my $PARAMETERS;
  #$PARAMETERS = $___DECODER_FLAGS . " -config $___CONFIG -inputtype $___INPUTTYPE";
  $PARAMETERS = $___DECODER_FLAGS;

  my $devbleu   = undef;
  my $bestpoint = undef;
  my $run       = $start_run - 1;

  my $oldallsorted = undef;
  my $allsorted    = undef;

  # -1 guarantees that the first iteration never triggers the
  # "no new hypotheses" stopping criterion
  my $prev_aggregate_nbl_size = -1;

  while (1) {
    $run++;
    # run beamdecoder with option to output nbestlists
    # the end result should be (1) @NBEST_LIST, a list of lists; (2) @SCORE, a list of lists of lists

    print "run $run start at ".`date`;

    # In case something dies later, we might wish to have a copy
    create_config($___CONFIG, "./run$run.moses.ini", \%used_triples, $run,
                  (defined $devbleu ? $devbleu : "--not-estimated--"));

    # skip if the user wanted
    if (!$skip_decoder) {
      print "($run) run decoder to produce n-best lists\n";
      @order_of_lambdas_from_decoder = run_decoder(\%used_triples, $PARAMETERS, $run, \@order_of_lambdas_from_decoder, $need_to_normalize);
      $need_to_normalize = 0;
      safesystem("gzip -f run*out") or die "Failed to gzip run*out";
    }
    else {
      print "skipped decoder run\n";
      if (0 == scalar @order_of_lambdas_from_decoder) {
        @order_of_lambdas_from_decoder = get_order_of_scores_from_nbestlist("gunzip -dc run*.best*.out.gz | head -1 |");
      }
      # the skip applies to the first iteration only
      $skip_decoder = 0;
      $need_to_normalize = 0;
    }

    my $EFF_REF_LEN = "";
    if ($___AVERAGE) {
      $EFF_REF_LEN = "-a";
    }
    elsif ($___CLOSEST) {
      $EFF_REF_LEN = "-e";
    }

    my $EFF_NORM = "";
    if ($___NONORM) {
      $EFF_NORM = "-n";
    }

    # To be sure that scoring script produces these fresh:
    if (-e "cands.opt") { safesystem("\\rm -f cands.opt") or die; }
    if (-e "feats.opt") { safesystem("\\rm -f feats.opt") or die; }

    # convert n-best list into a numberized format with error scores
    print STDERR "Scoring the nbestlist.\n";
    my $aggregate_nbl_size = 0;
    if (defined $obo_scorenbest) {
      # Faster scoring method, never rescore previous iterations
      my $cmd = "zcat run$run.best*.out.gz | $obo_scorenbest ".join(" ", @references);
      my $targetfile = "run$run.feats";
      if (defined $___JOBS) {
        safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=$targetfile -stderr=run$run.scorenbest.err")
          or die "Failed to submit scoring nbestlist to queue (via $qsubwrapper)";
      }
      else {
        safesystem("$cmd > $targetfile") or die "Failed to score nbestlist";
      }

      print STDERR "Combining all run*.feats\n";
      $cmd = "sort -n -t: -k1,1 run*.feats | cut -d: -f2- > feats.opt";
      safesystem($cmd) or die "Failed to create feats.opt";

      print STDERR "Creating cands.opt\n";
      # Each line of "uniq -c" output is "<count> <sentence id>"; the same
      # sentence id shows up once per run*.feats file, hence the +=.
      open(my $counts_fh, '-|', "cut -d: -f1 run*.feats | uniq -c")
        or die "Failed to load counts from run*.feats";
      my @cnts = ();
      while (my $line = <$counts_fh>) {
        chomp $line;
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        my ($cnt, $sent) = split /\s+/, $line;
        $aggregate_nbl_size += $cnt;
        $cnts[$sent] += $cnt;
      }
      close $counts_fh;
      print STDERR "Total candidates: $aggregate_nbl_size in ".(scalar @cnts)." sentences\n";
      die "Lost all candidates!" if $aggregate_nbl_size == 0;

      open(my $cands_fh, '>', "cands.opt") or die "Failed to create cands.opt";
      for (my $i = 0; $i < @cnts; $i++) {
        # A sentence id with no candidates leaves a hole in @cnts; write an
        # explicit 0 instead of an empty count field.
        my $n_cands = defined $cnts[$i] ? $cnts[$i] : 0;
        print $cands_fh "$i $n_cands\n";
      }
      # buffered write errors only surface at close
      close($cands_fh) or die "Failed to create cands.opt";
    }
    else {
      # traditional scoring code
      my $cmd;
      if (defined $efficient_scorenbest_flag) { # time-efficient sorting method of nbest lists
        $oldallsorted = "all.sorted.run".($run-1).".best$___N_BEST_LIST_SIZE";
        $allsorted    = "all.sorted.run$run.best$___N_BEST_LIST_SIZE";
        # Create an empty file for the first iteration
        if ($run == 1) {
          safesystem("touch $oldallsorted") or die "Failed to create $oldallsorted";
        }
        if (-e $oldallsorted) {
          # the mert process works properly; the sorted file containing all previous nbests are already present
          $cmd = "gunzip -dc run$run.best$___N_BEST_LIST_SIZE.out.gz | sort -m -n -t \"|\" -k 1,1 $oldallsorted - > $allsorted ; rm $oldallsorted ; cat $allsorted | $SCORENBESTCMD $EFF_NORM $EFF_REF_LEN ".join(" ", @references)." ./";
        }
        else {
          # the mert process did not work properly; the sorted file containing all previous nbests is no more present; create again
          $cmd = "gzip -d run*.best$___N_BEST_LIST_SIZE.out.gz ; sort -m -n -t \"|\" -k 1,1 run*.best$___N_BEST_LIST_SIZE.out > $allsorted ; gzip run*.best$___N_BEST_LIST_SIZE.out ; cat $allsorted | $SCORENBESTCMD $EFF_NORM $EFF_REF_LEN ".join(" ", @references)." ./";
        }
      }
      else {
        # traditional scoring code
        $cmd = "gunzip -dc run*.best*.out.gz | sort -n -t \"|\" -k 1,1 | $SCORENBESTCMD $EFF_NORM $EFF_REF_LEN ".join(" ", @references)." ./";
      }

      if (defined $___JOBS) {
        $cmd = "setenv PYTHONPATH $pythonpath ; $cmd";
        safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=scorenbest.out -stderr=scorenbest.err")
          or die "Failed to submit scoring nbestlist to queue (via $qsubwrapper)";
      }
      else {
        safesystem($cmd) or die "Failed to score nbestlist";
      }

      print STDERR "Hoping that scoring succeeded. We'll see if we can read the output files now.\n";

      # keep a count of lines in nbests lists (all together)
      # if it did not increase since last iteration, we are DONE
      open(my $cands_fh, '<', "cands.opt") or die "Can't read cands.opt";
      while (my $line = <$cands_fh>) {
        chomp $line;
        my @flds = split / /, $line;
        $aggregate_nbl_size += $flds[1];
      }
      close($cands_fh);
    }

    print "$aggregate_nbl_size accumulated translations\n";
    print "prev accumulated translations was : $prev_aggregate_nbl_size\n";
    if ($aggregate_nbl_size <= $prev_aggregate_nbl_size) {
      print STDERR "No new hypotheses in nbest list. Stopping.\n";
      last;
    }
    $prev_aggregate_nbl_size = $aggregate_nbl_size;

    # run cmert
    # cmert reads in the file init.opt containing three lines:
    # minimum values
    # maximum values
    # current values
    # We need to prepare the files and **the order of the lambdas must
    # correspond to the order @order_of_lambdas_from_decoder
    my @MIN  = (); # lower bounds
    my @MAX  = (); # upper bounds
    my @CURR = (); # the starting values
    my @NAME = (); # to which model does the lambda belong

    # walk in order of @order_of_lambdas_from_decoder and collect the min,max,val
    my %visited = ();
    foreach my $name (@order_of_lambdas_from_decoder) {
      # a label repeats once per feature of that model; expand it only once
      next if $visited{$name};
      $visited{$name} = 1;
      if (!defined $used_triples{$name}) {
        die "The decoder produced also some '$name' scores, but we do not know the ranges for them, no way to optimize them\n";
      }
      foreach my $feature (@{$used_triples{$name}}) {
        my ($val, $min, $max) = @$feature;
        push @CURR, $val;
        push @MIN,  $min;
        push @MAX,  $max;
        push @NAME, $name;
      }
    }

    open(my $init_fh, '>', "init.opt") or die "Can't write init.opt (WD now $___WORKING_DIR)";
    print $init_fh join(" ", @MIN)."\n";
    print $init_fh join(" ", @MAX)."\n";
    print $init_fh join(" ", @CURR)."\n";
    # cmert reads this file next, so a failed write must not go unnoticed
    close($init_fh) or die "Can't write init.opt (WD now $___WORKING_DIR)";

    # just for brevity
    open(my $names_fh, '>', "names.txt") or die "Can't write names.txt (WD now $___WORKING_DIR)";
    print $names_fh join(" ", @NAME)."\n";
    close($names_fh) or die "Can't write names.txt (WD now $___WORKING_DIR)";

    # make a backup copy labelled with this run number
    safesystem("\\cp -f init.opt run$run.init.opt") or die;

    my $DIM = scalar(@CURR); # number of lambdas
    my $cmd = "$cmertcmd -d $DIM";

    print STDERR "Starting cmert.\n";
    if (defined $___JOBS) {
      $cmd = "setenv SCRIPTS_ROOTDIR $SCRIPTS_ROOTDIR ; $cmd";
      safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -stderr=cmert.log -queue-parameter=\"$queue_flags\"")
        or die "Failed to start cmert (via qsubwrapper $qsubwrapper)";
    }
    else {
      safesystem("$cmd 2> cmert.log") or die "Failed to run cmert";
    }

    die "Optimization failed, file weights.txt does not exist or is empty"
      if ! -s "weights.txt";

    # backup copies
    # gzip -f (like the mv -f / cp -f around it) so that a stale .gz left by
    # an earlier run in the same working directory does not abort the backup
    safesystem("\\mv -f feats.opt run$run.feats.opt; gzip -f run$run.feats.opt; ") or die;
    safesystem("\\mv -f cands.opt run$run.cands.opt") or die;
    safesystem("\\cp -f cmert.log run$run.cmert.log") or die;
    safesystem("\\cp -f weights.txt run$run.weights.txt") or die; # this one is needed for restarts, too
    if ($___ACTIVATE_FEATURES) {
      safesystem("\\mv -f reduced_feats.opt run$run.reduced_feats.opt ; gzip -f run$run.reduced_feats.opt") or die;
      safesystem("\\mv -f reduced_init.opt run$run.reduced_init.opt") or die;
      safesystem("\\mv -f reduced_weights.txt run$run.reduced_weights.txt") or die;
      safesystem("\\mv -f reduced_cmert.log run$run.reduced_cmert.log") or die;
    }

    print "run $run end at ".`date`;

    # take the first "Best point: <weights> => <score>" line of cmert.log
    $bestpoint = undef;
    $devbleu   = undef;
    open(my $log_fh, '<', "cmert.log") or die "Can't open cmert.log";
    while (my $line = <$log_fh>) {
      if ($line =~ /Best point:\s*([\s\d\.\-]+?)\s*=> ([\d\.]+)/) {
        $bestpoint = $1;
        $devbleu   = $2;
        last;
      }
    }
    close $log_fh;
    die "Failed to parse cmert.log, missed Best point there."
      if !defined $bestpoint || !defined $devbleu;
    print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`;

    my @newweights = split /\s+/, $bestpoint;

    # update my cache of lambda values
    store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);

    ## additional stopping criterion: weights have not changed
    my $shouldstop = 1;
    for (my $i = 0; $i < @CURR; $i++) {
      die "Lost weight! cmert reported fewer weights (@newweights) than we gave it (@CURR)"
        if !defined $newweights[$i];
      if (abs($CURR[$i] - $newweights[$i]) >= $minimum_required_change_in_weights) {
        $shouldstop = 0;
        last;
      }
    }

    # record the finished run; the $continue branch above reads this file
    open(my $done_fh, '>', "finished_step.txt") or die "Can't mark finished step";
    print $done_fh $run."\n";
    close($done_fh) or die "Can't mark finished step";

    if ($shouldstop) {
      print STDERR "None of the weights changed more than $minimum_required_change_in_weights. Stopping.\n";
      last;
    }
  }
  print "Training finished at ".`date`;

  if (defined $allsorted) { safesystem("\\rm -f $allsorted") or die; }

  safesystem("\\cp -f init.opt run$run.init.opt") or die;
  safesystem("\\cp -f cmert.log run$run.cmert.log") or die;

  create_config($___CONFIG_BAK, "./moses.ini", \%used_triples, $run, $devbleu);

  # just to be sure that we have the really last finished step marked
  open(my $done_fh, '>', "finished_step.txt") or die "Can't mark finished step";
  print $done_fh $run."\n";
  close($done_fh) or die "Can't mark finished step";

  #chdir back to the original directory # useless, just to remind we were not there
  chdir($cwd);

} # end of local scope

sub store_new_lambda_values {
  # given new lambda values (in given order), replace the 'val' element in our triples
  my $triples = shift;
  my $names = shift;
  my $values = shift;

  my %idx = ();
  foreach my $i (0..scalar(@$values)-1) {
    my $name = $names->[$i];
    die "Missed name for lambda $values->[$i] (in @$values; names: @$names)"
      if !defined $name;
    if (!defined $idx{$name}) {
      $idx{$name} = 0;
    } else {
      $idx{$name}++;
    }
    die "We did not optimize '$name', but moses returned it back to us" if !defined $triples->{$name};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -