train-factored-phrase-model.perl.svn-base

来自「解码器是基于短语的统计机器翻译系统的核心模块」· SVN-BASE 代码 · 共 1,455 行 · 第 1/4 页
SVN-BASE
1,455 行
			$factor_f);
	&reduce_factors($___CORPUS.".".$___E.$___CORPUS_COMPRESSION,
			$___MODEL_DIR."/aligned.".$factor_e.".".$___E,
			$factor_e);
	&get_lexical();
    }
}

# Collect word-level translation counts for the current $factor from the
# word-aligned corpus and hand them to save_word_translation().
#
# Reads three files in parallel, one line each per sentence pair, through the
# handles set up by open_alignment(): E (english words), F (foreign words) and
# A (alignment points "fi-ei", separated by single spaces).  Every word that
# takes part in no valid alignment point is counted as aligned to "NULL".
#
# Returns early, without reading anything, if both lex.$factor.f2n and
# lex.$factor.n2f already exist in $___LEXICAL_DIR.
# Dies if F or A has fewer lines than E.
sub get_lexical {
    print STDERR "(4) [$factor] generate lexical translation table @ ".`date`;

    my (%WORD_TRANSLATION,%TOTAL_FOREIGN,%TOTAL_ENGLISH);

    if (-e "$___LEXICAL_DIR/lex.$factor.f2n" && -e "$___LEXICAL_DIR/lex.$factor.n2f") {
        print STDERR "  reusing: $___LEXICAL_DIR/lex.$factor.f2n and $___LEXICAL_DIR/lex.$factor.n2f\n";
        return;
    }

    &open_alignment();
    while (my $e = <E>) {
        # progress marker every 1000 sentence pairs
        if (($alignment_id++ % 1000) == 0) { print STDERR "!"; }
        chomp($e);
        my @ENGLISH = split(/ /,$e);

        # the three files must stay in lock-step; a short file would silently
        # produce wrong counts for every remaining sentence
        my $f = <F>;
        my $alignment = <A>;
        die "aligned.$factor_f.$___F or aligned.$___ALIGNMENT is shorter than aligned.$factor_e.$___E (line $alignment_id)"
            unless defined($f) && defined($alignment);
        chomp($f);
        chomp($alignment);
        my @FOREIGN = split(/ /,$f);

        my (%FOREIGN_ALIGNED,%ENGLISH_ALIGNED);
        foreach my $point (split(/ /,$alignment)) {
            # doubled or leading spaces yield empty tokens; they carry no alignment
            next if $point eq '';
            my ($fi,$ei) = split(/\-/,$point);
            if (!defined($fi) || !defined($ei) || $fi !~ /^\d+$/ || $ei !~ /^\d+$/) {
                # without this guard an undefined index would be used as index 0
                print STDERR "malformed alignment point '$point' in line $alignment_id, ignoring\n";
            }
            elsif ($fi >= scalar(@FOREIGN) || $ei >= scalar(@ENGLISH)) {
                print STDERR "alignment point ($fi,$ei) out of range (0-$#FOREIGN,0-$#ENGLISH) in line $alignment_id, ignoring\n";
            }
            else {
                # local counts
                $FOREIGN_ALIGNED{$fi}++;
                $ENGLISH_ALIGNED{$ei}++;

                # global counts
                $WORD_TRANSLATION{$FOREIGN[$fi]}{$ENGLISH[$ei]}++;
                $TOTAL_FOREIGN{$FOREIGN[$fi]}++;
                $TOTAL_ENGLISH{$ENGLISH[$ei]}++;
            }
        }

        # unaligned words
        for my $ei (0 .. $#ENGLISH) {
            next if defined($ENGLISH_ALIGNED{$ei});
            $WORD_TRANSLATION{"NULL"}{$ENGLISH[$ei]}++;
            $TOTAL_ENGLISH{$ENGLISH[$ei]}++;
            $TOTAL_FOREIGN{"NULL"}++;
        }
        for my $fi (0 .. $#FOREIGN) {
            next if defined($FOREIGN_ALIGNED{$fi});
            $WORD_TRANSLATION{$FOREIGN[$fi]}{"NULL"}++;
            $TOTAL_FOREIGN{$FOREIGN[$fi]}++;
            $TOTAL_ENGLISH{"NULL"}++;
        }
    }
    &close_alignment();
    &save_word_translation(\%WORD_TRANSLATION,\%TOTAL_FOREIGN,\%TOTAL_ENGLISH);
}

# Open the aligned corpus for the current factors on the global handles
# E, F and A and reset the global sentence counter $alignment_id to 0.
# Dies if any of the three files cannot be opened for reading.
sub open_alignment {
    open(E, '<', "$___MODEL_DIR/aligned.$factor_e.$___E")
      or die "Can't read $___MODEL_DIR/aligned.$factor_e.$___E: $!";
    open(F, '<', "$___MODEL_DIR/aligned.$factor_f.$___F")
      or die "Can't read $___MODEL_DIR/aligned.$factor_f.$___F: $!";
    open(A, '<', "$___MODEL_DIR/aligned.$___ALIGNMENT")
      or die "Can't read $___MODEL_DIR/aligned.$___ALIGNMENT: $!";
    $alignment_id=0;
}

# Terminate the progress line on STDERR and close the handles opened by
# open_alignment().
sub close_alignment {
    print STDERR "\n";
    close(A);
    close(F);
    close(E);
}

# Write the two lexical translation tables for the current $factor.
#
# Parameters (all hash references):
#   $WORD_TRANSLATION  {foreign}{english} => co-occurrence count
#   $TOTAL_FOREIGN     {foreign} => total count of that foreign word
#   $TOTAL_ENGLISH     {english} => total count of that english word
#
# lex.$factor.f2n gets lines "english foreign count/total_foreign",
# lex.$factor.n2f gets lines "foreign english count/total_english".
# Dies if the directory or either file cannot be written.
sub save_word_translation {
    my ($WORD_TRANSLATION,$TOTAL_FOREIGN,$TOTAL_ENGLISH) = @_;
    safesystem("mkdir -p $___LEXICAL_DIR") or die "Can't create directory $___LEXICAL_DIR";

    my $f2n_file = "$___LEXICAL_DIR/lex.$factor.f2n";
    my $n2f_file = "$___LEXICAL_DIR/lex.$factor.n2f";
    open(my $f2e_fh, '>', $f2n_file) or die "Can't write $f2n_file: $!";
    open(my $e2f_fh, '>', $n2f_file) or die "Can't write $n2f_file: $!";

    foreach my $f (keys %{$WORD_TRANSLATION}) {
        foreach my $e (keys %{$WORD_TRANSLATION->{$f}}) {
            my $count = $WORD_TRANSLATION->{$f}{$e};
            printf $f2e_fh "%s %s %.7f\n",$e,$f,$count/$TOTAL_FOREIGN->{$f};
            printf $e2f_fh "%s %s %.7f\n",$f,$e,$count/$TOTAL_ENGLISH->{$e};
        }
    }

    # buffered write errors (e.g. disk full) only surface at close
    close($e2f_fh) or die "Can't finish writing $n2f_file: $!";
    close($f2e_fh) or die "Can't finish writing $f2n_file: $!";
    print STDERR "Saved: $___LEXICAL_DIR/lex.$factor.f2n and $___LEXICAL_DIR/lex.$factor.n2f\n";
}

### (5) PHRASE EXTRACTION

# Run extract_phrase() once for every distinct factor mapping listed in
# $___TRANSLATION_FACTORS and, when $REORDERING_LEXICAL is set, in
# $___REORDERING_FACTORS.  Mappings are "+"-separated; each mapping is split
# on "-" into the globals $factor_f and $factor_e before the call.
sub extract_phrase_factored {
    print STDERR "(5) extract phrases @ ".`date`;
    my %generated;
    foreach my $f (split(/\+/,"$___TRANSLATION_FACTORS"
                     .($REORDERING_LEXICAL ? "+$___REORDERING_FACTORS" : ""))) {
        # we extract phrases for all translation steps and also for
        # reordering factors (if lexicalized reordering is used)
        next if $generated{$f};
        $generated{$f} = 1;
        $factor = $f;
        ($factor_f,$factor_e) = split(/\-/,$factor);
        &extract_phrase();
    }
}

# Run $PHRASE_EXTRACT for the current $factor and merge its part files.
#
# Results: $___EXTRACT_FILE.$factor.o.gz (merged and gzipped),
# $___EXTRACT_FILE.$factor and $___EXTRACT_FILE.$factor.inv (merged, plain).
# The part files are deleted afterwards unless $debug is set.
# Dies if any external command fails.
sub extract_phrase {
    print STDERR "(5) [$factor] extract phrases @ ".`date`;
    my $cmd = "$PHRASE_EXTRACT $___MODEL_DIR/aligned.$factor_e.$___E $___MODEL_DIR/aligned.$factor_f.$___F $___MODEL_DIR/aligned.$___ALIGNMENT $___EXTRACT_FILE.$factor $___MAX_PHRASE_LENGTH orientation";
    print STDERR "$cmd\n";
    safesystem("$cmd") or die "Phrase extraction failed (missing input files?)";

    safesystem("cat $___EXTRACT_FILE.$factor.o.part* > $___EXTRACT_FILE.$factor.o") or die;
    safesystem("rm -f $___EXTRACT_FILE.$factor.o.gz") or die;
    safesystem("gzip $___EXTRACT_FILE.$factor.o") or die;
    if (! $debug) { safesystem("rm -f $___EXTRACT_FILE.$factor.o.part*") or die;}

    safesystem("cat $___EXTRACT_FILE.$factor.part* > $___EXTRACT_FILE.$factor") or die;
    if (! $debug) { safesystem("rm -f $___EXTRACT_FILE.$factor.part*") or die;}

    safesystem("cat $___EXTRACT_FILE.$factor.inv.part* > $___EXTRACT_FILE.$factor.inv") or die;
    if (! $debug) { safesystem("rm -f $___EXTRACT_FILE.$factor.inv.part*") or die;}
}

### (6) PHRASE SCORING

# Run score_phrase() once for every "+"-separated factor mapping in
# $___TRANSLATION_FACTORS, setting $factor, $factor_f and $factor_e first.
sub score_phrase_factored {
    print STDERR "(6) score phrases @ ".`date`;
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
        $factor = $f;
        ($factor_f,$factor_e) = split(/\-/,$factor);
        &score_phrase();
    }
}

# Build $___MODEL_DIR/phrase-table.$factor.gz for the current $factor.
#
# Steps, as numbered in the progress messages:
#   6.1/6.2  sort the extract file and its inverse (re-gzipping the inputs)
#   6.3      split each sorted file with split_extract() and run
#            $PHRASE_SCORE on every part, for directions f2n and n2f
#   6.4      sort the n2f half so that it lines up with the f2n half
#   6.5      merge both halves line by line into the final table
#
# Dies if an external command fails, a file cannot be opened or written, or
# the two halves do not contain the same phrase pairs in the same order.
sub score_phrase {
    print STDERR "(6) [$factor] score phrases @ ".`date`;
    if (-e "$___EXTRACT_FILE.$factor.gz") {
        safesystem("gunzip < $___EXTRACT_FILE.$factor.gz > $___EXTRACT_FILE.$factor") or die;
    }
    if (-e "$___EXTRACT_FILE.$factor.inv.gz") {
        safesystem("gunzip < $___EXTRACT_FILE.$factor.inv.gz > $___EXTRACT_FILE.$factor.inv") or die;
    }

    print STDERR "(6.1) [$factor]  sorting @ ".`date`;
    safesystem("LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor > $___EXTRACT_FILE.$factor.sorted") or die;
    safesystem("rm -f $___EXTRACT_FILE.$factor.gz") or die;
    safesystem("gzip $___EXTRACT_FILE.$factor") or die;

    print STDERR "(6.2) [$factor]  sorting inv @ ".`date`;
    safesystem("LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor.inv > $___EXTRACT_FILE.$factor.inv.sorted") or die;
    safesystem("rm -f $___EXTRACT_FILE.$factor.inv.gz") or die;
    safesystem("gzip $___EXTRACT_FILE.$factor.inv") or die;

    for my $direction ("f2n","n2f") {
        print STDERR "(6.3) [$factor]  creating table half $___MODEL_DIR/phrase-table-half.$factor.$direction @ ".`date`;
        my $extract = "$___EXTRACT_FILE.$factor.sorted";
        $extract = "$___EXTRACT_FILE.$factor.inv.sorted" if $direction eq "n2f";
        my $inverse = "";
        $inverse = " inverse" if $direction eq "n2f";
        my $part_count = &split_extract($extract);
        for (my $i=0;$i<$part_count;$i++) {
            my $part = sprintf("%04d",$i);
            # build the command once so the logged and the executed command cannot drift apart
            my $cmd = "$PHRASE_SCORE $extract.part$part $___LEXICAL_DIR/lex.$factor.$direction $___MODEL_DIR/phrase-table-half.$factor.$direction.part$part $inverse";
            print "$cmd\n";
            safesystem($cmd) or die "Scoring of phrases failed";
            if (! $debug) { safesystem("rm $extract.part$part") or die;}
        }
        safesystem("cat $___MODEL_DIR/phrase-table-half.$factor.$direction.part* >$___MODEL_DIR/phrase-table-half.$factor.$direction") or die;
    }

    print STDERR "(6.4) [$factor]  sorting inverse n2f table@ ".`date`;
    safesystem("LC_ALL=C sort -T $___MODEL_DIR $___MODEL_DIR/phrase-table-half.$factor.n2f > $___MODEL_DIR/phrase-table-half.$factor.n2f.sorted") or die;

    print STDERR "(6.5) [$factor]  consolidating the two halves @ ".`date`;
    my $f2n_file   = "$___MODEL_DIR/phrase-table-half.$factor.f2n";
    my $n2f_file   = "$___MODEL_DIR/phrase-table-half.$factor.n2f.sorted";
    my $table_file = "$___MODEL_DIR/phrase-table.$factor";
    open(my $f2n_fh, '<', $f2n_file)
      or die "Can't read $___MODEL_DIR/phrase-table-half.$factor.f2n: $!";
    open(my $n2f_fh, '<', $n2f_file)
      or die "Can't read $___MODEL_DIR/phrase-table-half.$factor.n2f.sorted: $!";
    open(my $table_fh, '>', $table_file)
      or die "Can't write $___MODEL_DIR/phrase-table.$factor: $!";

    my $i=0;
    my $mismatch = 0;
    while (my $f2n = <$f2n_fh>) {
        $i++;
        my $n2f = <$n2f_fh>;
        if (!defined($n2f)) {
            print STDERR "mismatch line $i: n2f table has fewer lines than f2n table\n";
            $mismatch++;
            last;
        }
        my ($english,$foreign,$p) = split(/ \|\|\| /,$n2f); chomp($p);
        my ($english2,$foreign2,$p2) = split(/ \|\|\| /,$f2n); chomp($p2);
        if ($english ne $english2 || $foreign ne $foreign2) {
            print STDERR "mismatch line $i: ($english ne $english2 || $foreign ne $foreign2)\n";
            $mismatch++;
            last if $mismatch >= 10;
            next;
        }
        # third score is the constant 2.718 (presumably the phrase penalty e -- confirm)
        print $table_fh "$english ||| $foreign ||| $p $p2 2.718\n";
    }
    if (!$mismatch && defined(my $extra = <$n2f_fh>)) {
        print STDERR "mismatch after line $i: n2f table has more lines than f2n table\n";
        $mismatch++;
    }
    close($n2f_fh);
    close($f2n_fh);
    # the table must be completely on disk before gzip reads it below
    close($table_fh) or die "Can't finish writing $___MODEL_DIR/phrase-table.$factor: $!";
    die "There were mismatches! (printed only first 10)" if $mismatch;

    if (! $debug) { safesystem("rm -f $___MODEL_DIR/phrase-table-half.$factor.*") or die;}
    if (! $debug) { safesystem("rm -f $___MODEL_DIR/extract*sorted*") or die;}
    safesystem("rm -f $___MODEL_DIR/phrase-table.$factor.gz") or die;
    safesystem("gzip $___MODEL_DIR/phrase-table.$factor") or die;
}

# Split a sorted extract file into files "$file.part0000", "$file.part0001", ...
#
# A new part is started only after a multiple of $max_lines lines has been
# read AND the first " ||| "-separated field changes, so that all lines
# sharing that field stay in the same part.
#
# Parameters:
#   $file       path of the file to split
#   $max_lines  optional; lines per part before a split is looked for
#               (default 10000000)
# Returns the number of part files written (at least 1, even for empty input).
# Dies if the input cannot be read or a part file cannot be written.
sub split_extract {
    my ($file,$max_lines) = @_;
    $max_lines = 10000000 unless defined($max_lines) && $max_lines > 0;

    my $line_count = 0;
    my $part = 1;
    my $split_when_possible = 0;
    my $first;

    open(my $extract_fh, '<', $file) or die "Can't read $file: $!";
    my $partfname = sprintf("%s.part%04d",$file,0);
    open(my $part_fh, '>', $partfname) or die "Can't write $partfname: $!";

    while (my $line = <$extract_fh>) {
        if ($line_count > 0 && $line_count % $max_lines == 0) {
            # size limit reached: remember the current first field and split
            # as soon as a line with a different one shows up
            $split_when_possible = 1;
            ($first) = split(/ \|\|\| /,$line,2);
        }
        elsif ($split_when_possible) {
            my ($f) = split(/ \|\|\| /,$line,2);
            if ($f ne $first) {
                close($part_fh) or die "Can't finish writing $partfname: $!";
                $partfname = sprintf("%s.part%04d",$file,$part);
                open($part_fh, '>', $partfname) or die "Can't write $partfname: $!";
                $split_when_possible = 0;
                $part++;
            }
        }
        print $part_fh $line;
        $line_count++;
    }
    close($extract_fh);
    # the last part is read by an external program right after we return
    close($part_fh) or die "Can't finish writing $partfname: $!";
    return $part;
}

### (7) LEARN REORDERING MODEL

# Run get_reordering() for every "+"-separated factor mapping in
# $___REORDERING_FACTORS when $REORDERING_LEXICAL is set; otherwise only
# report that the step is skipped.
sub get_reordering_factored {
    print STDERR "(7) learn reordering model @ ".`date`;
    if ($REORDERING_LEXICAL) {
        foreach my $f (split(/\+/,$___REORDERING_FACTORS)) {
            $factor = $f;
            ($factor_f,$factor_e) = split(/\-/,$factor);
            &get_reordering();
        }
    } else {
      print STDERR "  ... 
skipping this step, reordering is not lexicalized ...\n";    }}sub get_reordering {    print STDERR "(7) [$factor] learn reordering model @ ".`date`;    print STDERR "(7.1) [$factor]  sorting extract.o @ ".`date`;    if (-e "$___EXTRACT_FILE.$factor.o.gz") {      safesystem("gunzip $___EXTRACT_FILE.$factor.o.gz") or die;    }    # print "LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor.o > $___EXTRACT_FILE.$factor.o.sorted\n";    safesystem("LC_ALL=C sort -T $___MODEL_DIR $___EXTRACT_FILE.$factor.o > $___EXTRACT_FILE.$factor.o.sorted") or die;    safesystem("rm -f $___EXTRACT_FILE.$factor.o.gz") or die;    safesystem("gzip $___EXTRACT_FILE.$factor.o") or die;    my $smooth = $___REORDERING_SMOOTH;    my @REORDERING_SMOOTH_PREVIOUS = ($smooth,$smooth,$smooth);    my @REORDERING_SMOOTH_FOLLOWING = ($smooth,$smooth,$smooth);    my (%SMOOTH_PREVIOUS,%SMOOTH_FOLLOWING);    if ($smooth =~ /(.+)u$/) {	$smooth = $1;	my $smooth_total = 0; 	open(O,"$___EXTRACT_FILE.$factor.o.sorted")          or die "Can't read $___EXTRACT_FILE.$factor.o.sorted";	while(<O>) {	    chomp;	    my ($f,$e,$o) = split(/ \|\|\| /);	    my ($o_previous,$o_following) = split(/ /,$o);	    $SMOOTH_PREVIOUS{$o_previous}++;	    $SMOOTH_FOLLOWING{$o_following}++;	    $smooth_total++;	}	close(O);	@REORDERING_SMOOTH_PREVIOUS = ($smooth*($SMOOTH_PREVIOUS{"mono"}+0.1)/$smooth_total,				       $smooth*($SMOOTH_PREVIOUS{"swap"}+0.1)/$smooth_total,				       $smooth*($SMOOTH_PREVIOUS{"other"}+0.1)/$smooth_total);	@REORDERING_SMOOTH_FOLLOWING = ($smooth*($SMOOTH_FOLLOWING{"mono"}+0.1)/$smooth_total,					$smooth*($SMOOTH_FOLLOWING{"swap"}+0.1)/$smooth_total,					$smooth*($SMOOTH_FOLLOWING{"other"}+0.1)/$smooth_total);	printf "$smooth*($SMOOTH_FOLLOWING{mono}+0.1)/$smooth_total,					$smooth*($SMOOTH_FOLLOWING{swap}+0.1)/$smooth_total,					$smooth*($SMOOTH_FOLLOWING{other}+0.1)/$smooth_total\n";	printf "smoothed following to %f,%f,%f\n",@REORDERING_SMOOTH_FOLLOWING;    }        
($mono_previous_f,$swap_previous_f,$other_previous_f) = @REORDERING_SMOOTH_PREVIOUS;    ($mono_previous_fe,$swap_previous_fe,$other_previous_fe) = @REORDERING_SMOOTH_PREVIOUS;    ($mono_following_f,$swap_following_f,$other_following_f) = @REORDERING_SMOOTH_FOLLOWING;    ($mono_following_fe,$swap_following_fe,$other_following_fe) = @REORDERING_SMOOTH_FOLLOWING;    print STDERR "(7.2) building tables @ ".`date`;    open(O,"$___EXTRACT_FILE.$factor.o.sorted")      or die "Can't read $___EXTRACT_FILE.$factor.o.sorted";    open(OF,  "|gzip >$___MODEL_DIR/orientation-table.$factor.f.$___REORDERING_SMOOTH.gz") 	if defined($REORDERING_MODEL{"orientation-f"});    open(OFE, "|gzip >$___MODEL_DIR/orientation-table.$factor.fe.$___REORDERING_SMOOTH.gz") 	if defined($REORDERING_MODEL{"orientation-fe"});    open(OBF, "|gzip >$___MODEL_DIR/orientation-table.$factor.bi.f.$___REORDERING_SMOOTH.gz") 	if defined($REORDERING_MODEL{"orientation-bidirectional-f"});    open(OBFE,"|gzip >$___MODEL_DIR/orientation-table.$factor.bi.fe.$___REORDERING_SMOOTH.gz") 	if defined($REORDERING_MODEL{"orientation-bidirectional-fe"});    open(MF,  "|gzip >$___MODEL_DIR/monotonicity-table.$factor.f.$___REORDERING_SMOOTH.gz") 	if defined($REORDERING_MODEL{"monotonicity-f"});    open(MFE, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.fe.$___REORDERING_SMOOTH.gz") 	if defined($REORDERING_MODEL{"monotonicity-fe"});    open(MBF, "|gzip >$___MODEL_DIR/monotonicity-table.$factor.bi.f.$___REORDERING_SMOOTH.gz") 	if defined($REORDERING_MODEL{"monotonicity-bidirectional-f"});    open(MBFE,"|gzip >$___MODEL_DIR/monotonicity-table.$factor.bi.fe.$___REORDERING_SMOOTH.gz") 	if defined($REORDERING_MODEL{"monotonicity-bidirectional-fe"});    my $first = 1;    while(<O>) {	chomp;	my ($f,$e,$o) = split(/ \|\|\| /);	my ($o_previous,$o_following) = split(/ /,$o);		# store counts if new f,e	if ($first) {	    $f_current = $f;	    $e_current = $e;	    $first = 0;	}	elsif ($f ne $f_current || $e ne $e_current) {	    	    if 
(defined($REORDERING_MODEL{"fe"})) {		# compute probs, store them		&store_reordering_fe();				# reset counters		($mono_previous_fe,$swap_previous_fe,$other_previous_fe) = @REORDERING_SMOOTH_PREVIOUS;		($mono_following_fe,$swap_following_fe,$other_following_fe) = @REORDERING_SMOOTH_FOLLOWING;	    }	    # store counts if new f	    if ($f ne $f_current && defined($REORDERING_MODEL{"f"})) {				# compute probs, store them		&store_reordering_f();				# reset counters		($mono_previous_f,$swap_previous_f,$other_previous_f) = @REORDERING_SMOOTH_PREVIOUS;		($mono_following_f,$swap_following_f,$other_following_f) = @REORDERING_SMOOTH_FOLLOWING;			    }	    $f_current = $f;	    $e_current = $e;	}		# update counts	if    ($o_previous eq 'mono') {  $mono_previous_f++;  $mono_previous_fe++; }	elsif ($o_previous eq 'swap') {  $swap_previous_f++;  $swap_previous_fe++; }	elsif ($o_previous eq 'other'){ $other_previous_f++; $other_previous_fe++; }	else { print STDERR "buggy line (o_previous:$o_previous): $_\n"; }		if    ($o_following eq 'mono') {  $mono_following_f++;  $mono_following_fe++; }
train-factored-phrase-model.perl.svn-base - 源码说明

本页面展示了「解码器是基于短语的统计机器翻译系统的核心模块」中的 train-factored-phrase-model.perl.svn-base 源码文件。该文件采用 Perl 语言编写（.svn-base 只是 Subversion 保存的文件副本后缀，并非编程语言），共 1,455 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与解码器相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?