build-generation-table.perl.svn-base

来自「moses开源的机器翻译系统」· SVN-BASE 代码 · 共 117 行

SVN-BASE
117
字号
#!/usr/bin/perl# $Id$use strict;use Getopt::Long "GetOptions";my $_CORPUS;my $_OUTPUT = "generation";my $_GENERATION_FACTORS;die "specify options" unless &GetOptions('corpus=s' => \$_CORPUS,       'output=s' => \$_OUTPUT,       'generation-factors=s' => \$_GENERATION_FACTORS);die "Please use --corpus to specify the factored input corpus\n" unless $_CORPUS;if (! defined $_GENERATION_FACTORS) {  die "Please use --generation-factors to set generation factors\n";}my $___GENERATION_FACTORS = $_GENERATION_FACTORS || "0-0";die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n")  if $___GENERATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;print "output=$_OUTPUT.<factor-map>\n";get_generation_factored();print "Done\n";exit 0;sub get_generation_factored {    print STDERR "(8) learn generation model @ ".`date`;    foreach my $f (split(/\+/,$___GENERATION_FACTORS)) {        my $factor = $f;        my ($factor_e_source,$factor_e) = split(/\-/,$factor);        &get_generation($factor, $factor_e_source, $factor_e);    }}sub get_generation {    my ($factor, $factor_e_source, $factor_e) = @_;    print STDERR "(8) [$factor] generate generation table @ ".`date`;    my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);    my %INCLUDE_SOURCE;    foreach my $factor (split(/,/,$factor_e_source)) {		$INCLUDE_SOURCE{$factor} = 1;    }    my %INCLUDE;    foreach my $factor (split(/,/,$factor_e)) {	$INCLUDE{$factor} = 1;    }    my (%GENERATION,%GENERATION_TOTAL_SOURCE,%GENERATION_TOTAL_TARGET);    open(E,$_CORPUS) or die "Can't read ".$_CORPUS;    while(<E>) {	chomp;	foreach (split) {	    my @FACTOR = split(/\|/);	    my ($source,$target);	    my $first_factor = 1;	    foreach my $factor (split(/,/,$factor_e_source)) {		$source .= "|" unless $first_factor;		$first_factor = 0;		$source .= $FACTOR[$factor];	    }	    $first_factor = 1;	    foreach my $factor (split(/,/,$factor_e)) {		$target .= "|" unless $first_factor;		$first_factor = 0;		$target .= $FACTOR[$factor];	    }	    	    $GENERATION{$source}{$target}++;	    $GENERATION_TOTAL_SOURCE{$source}++;	    $GENERATION_TOTAL_TARGET{$target}++;	}    }     close(E);     open(GEN,">$_OUTPUT.$factor") or die "Can't write $_OUTPUT.$factor";    foreach my $source (keys %GENERATION) {	foreach my $target (keys %{$GENERATION{$source}}) {	    printf GEN ("%s %s %.7f %.7f\n",$source,$target,			$GENERATION{$source}{$target}/$GENERATION_TOTAL_SOURCE{$source},			$GENERATION{$source}{$target}/$GENERATION_TOTAL_TARGET{$target});	}    }    close(GEN);    safesystem("rm -f $_OUTPUT.$factor.gz") or die;    safesystem("gzip $_OUTPUT.$factor") or die;}sub safesystem {  print STDERR "Executing: @_\n";  system(@_);  if ($? == -1) {      print STDERR "Failed to execute: @_\n  $!\n";      exit(1);  }  elsif ($? & 127) {      printf STDERR "Execution of: @_\n  died with signal %d, %s coredump\n",          ($? & 127),  ($? & 128) ? 'with' : 'without';  }  else {    my $exitcode = $? >> 8;    print STDERR "Exit code: $exitcode\n" if $exitcode;    return ! $exitcode;  }}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?