mert-moses.pl.svn-base

来自「Moses 开源机器翻译系统」· Perl 代码（Subversion 原始副本 .svn-base）· 共 1,162 行 · 第 1/3 页
SVN-BASE
1,162 行
}

# ---------------------------------------------------------------------------
# Main driver of the MERT optimization.
#
# Outline of what follows:
#   1. remember the current directory and enter the working directory;
#   2. optionally resume an interrupted optimization (finished_step.txt,
#      weights.txt and the last n-best list are read back);
#   3. optionally filter the phrase tables for the dev set;
#   4. loop: run the decoder, score the accumulated n-best lists, write
#      init.opt / names.txt, run cmert, read the best point from cmert.log
#      and store the new lambdas.  The loop stops when no new hypotheses
#      were produced or when no weight changed by at least
#      $minimum_required_change_in_weights;
#   5. write the final moses.ini and mark the last finished step.
# ---------------------------------------------------------------------------

# as weights are normalized in the next steps (by cmert)
# normalize initial LAMBDAs, too
my $need_to_normalize = 1;

# this will store the labels of scores coming out of the decoder (and hence
# the order of lambdas coming out of mert)
# we will use the array to interpret the lambdas
# the array gets filled with labels only after first nbestlist was generated
my @order_of_lambdas_from_decoder = ();

# store current directory and create the working directory (if needed)
my $cwd = `pawd 2>/dev/null`;
if (!$cwd) { $cwd = `pwd`; }
chomp($cwd);

safesystem("mkdir -p $___WORKING_DIR") or die "Can't mkdir $___WORKING_DIR";

{
# open local scope

# chdir to the working directory
chdir($___WORKING_DIR) or die "Can't chdir to $___WORKING_DIR";

# set start run
my $start_run = 1;

if ($continue) {
  # need to load last best values
  print STDERR "Trying to continue an interrupted optimization.\n";
  open IN, "finished_step.txt" or die "Failed to find the step number, failed to read finished_step.txt";
  my $step = <IN>;
  chomp $step;
  $step++;
  close IN;

  if (! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz") {
    # allow stepping one extra iteration back
    $step--;
    die "Can't start from step $step, because run$step.best$___N_BEST_LIST_SIZE.out.gz was not found!"
      if ! -e "run$step.best$___N_BEST_LIST_SIZE.out.gz";
  }

  $start_run = $step +1;

  print STDERR "Reading last cached lambda values (result from step $step)\n";
  @order_of_lambdas_from_decoder = get_order_of_scores_from_nbestlist("gunzip -c < run$step.best$___N_BEST_LIST_SIZE.out.gz |");

  open IN, "weights.txt" or die "Can't read weights.txt";
  my $newweights = <IN>;
  chomp $newweights;
  close IN;
  # split ' ' (awk mode) ignores leading whitespace; split /\s+/ would
  # produce an empty first field and shift every weight by one position
  my @newweights = split ' ', $newweights;

  #dump_triples(\%used_triples);
  store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);
  #dump_triples(\%used_triples);
}

if ($___FILTER_PHRASE_TABLE){
  # filter the phrase tables with respect to input, use --decoder-flags
  print "filtering the phrase tables... ".`date`;
  my $cmd = "$filtercmd ./filtered $___CONFIG $___DEV_F";
  if (defined $___JOBS) {
    safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=filterphrases.out -stderr=filterphrases.err" )
      or die "Failed to submit filtering of tables to the queue (via $qsubwrapper)";
  } else {
    safesystem($cmd) or die "Failed to filter the tables.";
  }

  # make a backup copy of startup ini file
  $___CONFIG_BAK = $___CONFIG;
  # the decoder should now use the filtered model
  $___CONFIG = "filtered/moses.ini";
}
else{
  # do not filter phrase tables (useful if binary phrase tables are available)
  # use the original configuration file
  $___CONFIG_BAK = $___CONFIG;
}

my $PARAMETERS;
#$PARAMETERS = $___DECODER_FLAGS . " -config $___CONFIG -inputtype $___INPUTTYPE";
$PARAMETERS = $___DECODER_FLAGS;

my $devbleu = undef;
my $bestpoint = undef;
my $run=$start_run-1;

my $oldallsorted = undef;
my $allsorted = undef;

# -1 guarantees that the first iteration never triggers the
# "no new hypotheses" stopping criterion below
my $prev_aggregate_nbl_size = -1;
while(1) {
  $run++;
  # run beamdecoder with option to output nbestlists
  # the end result should be (1) @NBEST_LIST, a list of lists; (2) @SCORE, a list of lists of lists

  print "run $run start at ".`date`;

  # In case something dies later, we might wish to have a copy
  create_config($___CONFIG, "./run$run.moses.ini", \%used_triples, $run, (defined$devbleu?$devbleu:"--not-estimated--"));

  # skip if the user wanted
  if (!$skip_decoder) {
      print "($run) run decoder to produce n-best lists\n";
      @order_of_lambdas_from_decoder = run_decoder(\%used_triples, $PARAMETERS, $run, \@order_of_lambdas_from_decoder, $need_to_normalize);
      $need_to_normalize = 0;
      safesystem("gzip -f run*out") or die "Failed to gzip run*out";
  }
  else {
      print "skipped decoder run\n";
      if (0 == scalar @order_of_lambdas_from_decoder) {
        @order_of_lambdas_from_decoder = get_order_of_scores_from_nbestlist("gunzip -dc run*.best*.out.gz | head -1 |");
      }
      # the decoder is skipped for one iteration only
      $skip_decoder = 0;
      $need_to_normalize = 0;
  }

  # flags handed over to the scoring command
  my $EFF_REF_LEN = "";
  if ($___AVERAGE) {
     $EFF_REF_LEN = "-a";
  }
  elsif ($___CLOSEST){
     $EFF_REF_LEN = "-e";
  }

  my $EFF_NORM = "";
  if ($___NONORM) {
     $EFF_NORM = "-n";
  }

  # To be sure that scoring script produces these fresh:
  if (-e "cands.opt"){ safesystem("\\rm -f cands.opt") or die; }
  if (-e "feats.opt"){ safesystem("\\rm -f feats.opt") or die; }

  # convert n-best list into a numberized format with error scores

  print STDERR "Scoring the nbestlist.\n";
  my $aggregate_nbl_size=0;
  if (defined $obo_scorenbest) {
    # Faster scoring method, never rescore previous iterations
    my $cmd = "zcat run$run.best*.out.gz | $obo_scorenbest ".join(" ", @references);
    my $targetfile = "run$run.feats";
    if (defined $___JOBS) {
      safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=$targetfile -stderr=run$run.scorenbest.err")
        or die "Failed to submit scoring nbestlist to queue (via $qsubwrapper)";
    } else {
      safesystem("$cmd > $targetfile") or die "Failed to score nbestlist";
    }
    print STDERR "Combining all run*.feats\n";
    $cmd = "sort -n -t: -k1,1 run*.feats | cut -d: -f2- > feats.opt";
    safesystem($cmd) or die "Failed to create feats.opt";
    print STDERR "Creating cands.opt\n";
    # each line read from the pipe is "<count> <sentence id>" (uniq -c output)
    open C, "cut -d: -f1 run*.feats | uniq -c |" or die "Failed to load counts from run*.feats";
    my @cnts = ();
    while (<C>) {
      chomp;
      s/^\s+//; s/\s+$//;
      my ($cnt, $sent) = split /\s+/;
      $aggregate_nbl_size += $cnt;
      $cnts[$sent]+=$cnt;
    }
    close C;
    print STDERR "Total candidates: $aggregate_nbl_size  in ".(scalar @cnts)." sentences\n";
    die "Lost all candidates!" if $aggregate_nbl_size == 0;
    open C, ">cands.opt" or die "Failed to create  cands.opt";
    for (my $i=0; $i<@cnts; $i++) {
      print C "$i $cnts[$i]\n";
    }
    close C;
  } else {
    # traditional scoring code
    my $cmd;
    if (defined $efficient_scorenbest_flag){
       # time-efficient sorting method of nbest lists
       $oldallsorted="all.sorted.run".($run-1).".best$___N_BEST_LIST_SIZE";
       $allsorted="all.sorted.run$run.best$___N_BEST_LIST_SIZE";

       # Create an empty file for the first iteration
       # (not checked: if it fails, the "create again" branch below is taken)
       if ($run == 1){ safesystem("touch $oldallsorted"); };

       if (-e $oldallsorted){
          # the mert process works properly; the sorted file containing all previous nbests are already present
          $cmd = "gunzip -dc run$run.best$___N_BEST_LIST_SIZE.out.gz | sort -m -n -t \"|\" -k 1,1 $oldallsorted - > $allsorted ; rm $oldallsorted ; cat $allsorted | $SCORENBESTCMD $EFF_NORM $EFF_REF_LEN ".join(" ", @references)." ./";
       }
       else{
          # the mert process did not work properly; the sorted file containing all previous nbests is no more present; create again
          $cmd = "gzip -d run*.best$___N_BEST_LIST_SIZE.out.gz ; sort -m -n -t \"|\" -k 1,1 run*.best$___N_BEST_LIST_SIZE.out > $allsorted ; gzip run*.best$___N_BEST_LIST_SIZE.out ; cat $allsorted | $SCORENBESTCMD $EFF_NORM $EFF_REF_LEN ".join(" ", @references)." ./";
       }
    }
    else{
       # traditional scoring code
       $cmd = "gunzip -dc run*.best*.out.gz | sort -n -t \"|\" -k 1,1 | $SCORENBESTCMD $EFF_NORM $EFF_REF_LEN ".join(" ", @references)." ./";
    }
    if (defined $___JOBS) {
      $cmd = "setenv PYTHONPATH $pythonpath ; $cmd";
      safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -queue-parameter=\"$queue_flags\" -stdout=scorenbest.out -stderr=scorenbest.err") or die "Failed to submit scoring nbestlist to queue (via $qsubwrapper)";
    } else {
      safesystem($cmd) or die "Failed to score nbestlist";
    }

    print STDERR "Hoping that scoring succeeded. We'll see if we can read the output files now.\n";

    # keep a count of lines in nbests lists (alltogether)
    # if it did not increase since last iteration, we are DONE
    open(IN,"cands.opt") or die "Can't read cands.opt";
    while (<IN>) {
      chomp;
      my @flds = split / /;
      $aggregate_nbl_size += $flds[1];
    }
    close(IN);
  }
  print "$aggregate_nbl_size accumulated translations\n";
  print "prev accumulated translations was : $prev_aggregate_nbl_size\n";
  if ($aggregate_nbl_size <= $prev_aggregate_nbl_size){
     print STDERR "No new hypotheses in nbest list. Stopping.\n";
     last;
  }
  $prev_aggregate_nbl_size = $aggregate_nbl_size;

  # run cmert
  # cmert reads in the file init.opt containing three lines:
  #  minimum values
  #  maximum values
  #  current values
  # We need to prepare the files and **the order of the lambdas must
  # correspond to the order @order_of_lambdas_from_decoder

  my @MIN = ();   # lower bounds
  my @MAX = ();   # upper bounds
  my @CURR = ();  # the starting values
  my @NAME = ();  # to which model does the lambda belong

  # walk in order of @order_of_lambdas_from_decoder and collect the min,max,val
  my %visited = ();
  foreach my $name (@order_of_lambdas_from_decoder) {
    # a label occurs once per lambda; all its triples are added on first sight
    next if $visited{$name};
    $visited{$name} = 1;
    if (!defined $used_triples{$name}) {
      die "The decoder produced also some '$name' scores, but we do not know the ranges for them, no way to optimize them\n";
    }

    foreach my $feature (@{$used_triples{$name}}) {
      my ($val, $min, $max) = @$feature;
      push @CURR, $val;
      push @MIN, $min;
      push @MAX, $max;
      push @NAME, $name;
    }
  }

  open(OUT,"> init.opt") or die "Can't write init.opt (WD now $___WORKING_DIR)";
  print OUT join(" ", @MIN)."\n";
  print OUT join(" ", @MAX)."\n";
  print OUT join(" ", @CURR)."\n";
  close(OUT);
  # just for brevity
  open(OUT,"> names.txt") or die "Can't write names.txt (WD now $___WORKING_DIR)";
  print OUT join(" ", @NAME)."\n";
  close(OUT);

  # make a backup copy labelled with this run number
  safesystem("\\cp -f init.opt run$run.init.opt") or die;

  my $DIM = scalar(@CURR); # number of lambdas
  my $cmd="$cmertcmd -d $DIM";

  print STDERR "Starting cmert.\n";
  if (defined $___JOBS) {
    $cmd="setenv SCRIPTS_ROOTDIR $SCRIPTS_ROOTDIR ; $cmd";
    safesystem("$qsubwrapper $pass_old_sge -command='$cmd' -stderr=cmert.log -queue-parameter=\"$queue_flags\"") or die "Failed to start cmert (via qsubwrapper $qsubwrapper)";
  } else {
    safesystem("$cmd 2> cmert.log") or die "Failed to run cmert";
  }

  die "Optimization failed, file weights.txt does not exist or is empty"
    if ! -s "weights.txt";

  # backup copies
  safesystem ("\\mv -f feats.opt run$run.feats.opt; gzip run$run.feats.opt; ") or die;
  safesystem ("\\mv -f cands.opt run$run.cands.opt") or die;
  safesystem ("\\cp -f cmert.log run$run.cmert.log") or die;
  safesystem ("\\cp -f weights.txt run$run.weights.txt") or die; # this one is needed for restarts, too
  if ($___ACTIVATE_FEATURES){
    safesystem ("\\mv -f reduced_feats.opt run$run.reduced_feats.opt ; gzip run$run.reduced_feats.opt") or die;
    safesystem ("\\mv -f reduced_init.opt run$run.reduced_init.opt") or die;
    safesystem ("\\mv -f reduced_weights.txt run$run.reduced_weights.txt") or die;
    safesystem ("\\mv -f reduced_cmert.log run$run.reduced_cmert.log") or die;
  }

  print "run $run end at ".`date`;

  # the first "Best point: <weights> => <score>" line of cmert.log is used
  $bestpoint = undef;
  $devbleu = undef;
  open(IN,"cmert.log") or die "Can't open cmert.log";
  while (<IN>) {
    if (/Best point:\s*([\s\d\.\-]+?)\s*=> ([\d\.]+)/) {
      $bestpoint = $1;
      $devbleu = $2;
      last;
    }
  }
  close IN;
  die "Failed to parse cmert.log, missed Best point there."
    if !defined $bestpoint || !defined $devbleu;
  print "($run) BEST at $run: $bestpoint => $devbleu at ".`date`;

  my @newweights = split /\s+/, $bestpoint;

  # update my cache of lambda values
  store_new_lambda_values(\%used_triples, \@order_of_lambdas_from_decoder, \@newweights);

  ## additional stopping criterion: weights have not changed
  my $shouldstop = 1;
  for(my $i=0; $i<@CURR; $i++) {
    die "Lost weight! cmert reported fewer weights (@newweights) than we gave it (@CURR)"
      if !defined $newweights[$i];
    if (abs($CURR[$i] - $newweights[$i]) >= $minimum_required_change_in_weights) {
      $shouldstop = 0;
      last;
    }
  }

  open F, "> finished_step.txt" or die "Can't mark finished step";
  print F $run."\n";
  close F;

  if ($shouldstop) {
    print STDERR "None of the weights changed more than $minimum_required_change_in_weights. Stopping.\n";
    last;
  }
}
print "Training finished at ".`date`;

if (defined $allsorted){ safesystem ("\\rm -f $allsorted") or die; };

safesystem("\\cp -f init.opt run$run.init.opt") or die;
safesystem("\\cp -f cmert.log run$run.cmert.log") or die;

create_config($___CONFIG_BAK, "./moses.ini", \%used_triples, $run, $devbleu);

# just to be sure that we have the really last finished step marked
open F, "> finished_step.txt" or die "Can't mark finished step";
print F $run."\n";
close F;

# chdir back to the original directory # useless, just to remind we were not there
chdir($cwd);

} # end of local scope

sub store_new_lambda_values {
  # given new lambda values (in given order), replace the 'val' element in our triples
  my $triples = shift;
  my $names = shift;
  my $values = shift;

  my %idx = ();
  foreach my $i (0..scalar(@$values)-1) {
    my $name = $names->[$i];
    die "Missed name for lambda $values->[$i] (in @$values; names: @$names)"
      if !defined $name;
    if (!defined $idx{$name}) {
      $idx{$name} = 0;
    } else {
      $idx{$name}++;
    }
    die "We did not optimize '$name', but moses returned it back to us"
      if !defined $triples->{$name};
mert-moses.pl.svn-base - 源码说明

本页面展示了「Moses 开源机器翻译系统」中的 mert-moses.pl.svn-base 源码文件。该文件是 Perl 脚本（.svn-base 为 Subversion 保存的原始副本后缀，并非编程语言），共 1,162 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与moses相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?