📄 train-factored-phrase-model.perl.svn-base
字号:
#!/usr/bin/perl -w

use strict;
use Getopt::Long "GetOptions";

# Train Factored Phrase Model
# (c) 2006-2007 Philipp Koehn
# with contributions from other JHU WS participants
# Train a phrase model from a parallel corpus
#
# Layout of this part of the script:
#   * command-line option parsing ($_NAME variables hold the raw options)
#   * resolution of options against defaults ($___NAME variables hold the
#     effective settings used by the training steps)
#   * the MAIN dispatch that runs steps --first-step .. --last-step
#   * step (1): prepare() and its helper reduce_factors()

# -----------------------------------------------------
$ENV{"LC_ALL"} = "C";

# Raw option values as filled in by GetOptions below.
my($_ROOT_DIR,$_CORPUS_DIR,$_GIZA_E2F,$_GIZA_F2E,$_MODEL_DIR,$_CORPUS,
   $_CORPUS_COMPRESSION,$_FIRST_STEP,$_LAST_STEP,$_F,$_E,$_MAX_PHRASE_LENGTH,
   $_LEXICAL_FILE,$_NO_LEXICAL_WEIGHTING,$_VERBOSE,$_ALIGNMENT,
   $_ALIGNMENT_FILE,@_LM,$_EXTRACT_FILE,$_GIZA_OPTION,$_HELP,$_PARTS,
   $_DIRECTION,$_ONLY_PRINT_GIZA,$_GIZA_EXTENSION,$_REORDERING,
   $_REORDERING_SMOOTH,$_INPUT_FACTOR_MAX,$_ALIGNMENT_FACTORS,
   $_TRANSLATION_FACTORS,$_REORDERING_FACTORS,$_GENERATION_FACTORS,
   $_DECODING_STEPS,$_PARALLEL, $SCRIPTS_ROOTDIR, $_FACTOR_DELIMITER,
   @_PHRASE_TABLE,@_REORDERING_TABLE,@_GENERATION_TABLE,$_CONFIG,$_DONT_ZIP,
   $_HMM_ALIGN,@_GENERATION_TYPE);

my $debug = 0;   # debug this script, do not delete any files in debug mode
my $nodebug = 1; # no debug this script, delete any files

# the following line is set installation time by 'make release'. BEWARE!
my $BINDIR="/home/s0565741/terabyte/bin";

# Any option-parsing failure is treated like --help.
$_HELP = 1
    unless &GetOptions('root-dir=s' => \$_ROOT_DIR,
                       'bin-dir=s' => \$BINDIR, # allow to override default bindir path
                       'corpus-dir=s' => \$_CORPUS_DIR,
                       'corpus=s' => \$_CORPUS,
                       'corpus-compression=s' => \$_CORPUS_COMPRESSION,
                       'f=s' => \$_F,
                       'e=s' => \$_E,
                       'giza-e2f=s' => \$_GIZA_E2F,
                       'giza-f2e=s' => \$_GIZA_F2E,
                       'giza-extension=s' => \$_GIZA_EXTENSION,
                       'max-phrase-length=i' => \$_MAX_PHRASE_LENGTH,
                       'lexical-file=s' => \$_LEXICAL_FILE,
                       'no-lexical-weighting' => \$_NO_LEXICAL_WEIGHTING,
                       'model-dir=s' => \$_MODEL_DIR,
                       'extract-file=s' => \$_EXTRACT_FILE,
                       'alignment=s' => \$_ALIGNMENT,
                       'alignment-file=s' => \$_ALIGNMENT_FILE,
                       'verbose' => \$_VERBOSE,
                       'first-step=i' => \$_FIRST_STEP,
                       'last-step=i' => \$_LAST_STEP,
                       'giza-option=s' => \$_GIZA_OPTION,
                       'parallel' => \$_PARALLEL,
                       'lm=s' => \@_LM,
                       'help' => \$_HELP,
                       'hmm-align' => \$_HMM_ALIGN,
                       'debug' => \$debug,
                       'nodebug' => \$nodebug,
                       'dont-zip' => \$_DONT_ZIP,
                       'parts=i' => \$_PARTS,
                       'direction=i' => \$_DIRECTION,
                       'only-print-giza' => \$_ONLY_PRINT_GIZA,
                       'reordering=s' => \$_REORDERING,
                       'reordering-smooth=s' => \$_REORDERING_SMOOTH,
                       'input-factor-max=i' => \$_INPUT_FACTOR_MAX,
                       'alignment-factors=s' => \$_ALIGNMENT_FACTORS,
                       'translation-factors=s' => \$_TRANSLATION_FACTORS,
                       'reordering-factors=s' => \$_REORDERING_FACTORS,
                       'generation-factors=s' => \$_GENERATION_FACTORS,
                       'decoding-steps=s' => \$_DECODING_STEPS,
                       'scripts-root-dir=s' => \$SCRIPTS_ROOTDIR,
                       'factor-delimiter=s' => \$_FACTOR_DELIMITER,
                       'phrase-translation-table=s' => \@_PHRASE_TABLE,
                       'generation-table=s' => \@_GENERATION_TABLE,
                       'reordering-table=s' => \@_REORDERING_TABLE,
                       'generation-type=s' => \@_GENERATION_TYPE,
                       'config=s' => \$_CONFIG
                      );

# NOTE(review): $nodebug defaults to 1 and --nodebug can only set it to 1,
# so $debug is always forced back to 0 here and --debug has no effect.
# Left unchanged; confirm whether that is intended.
if ($nodebug){ $debug = 0; };

if ($_HELP) {
    print "Train Phrase Model

Steps: (--first-step to --last-step)
(1) prepare corpus
(2) run GIZA
(3) align words
(4) learn lexical translation
(5) extract phrases
(6) score phrases
(7) learn reordering model
(8) learn generation model
(9) create decoder config file

For more, please check manual or contact koehn\@inf.ed.ac.uk\n";
    exit(1);
}

my $___FACTOR_DELIMITER = $_FACTOR_DELIMITER;
$___FACTOR_DELIMITER = '|' unless ($_FACTOR_DELIMITER);

# --scripts-root-dir wins over the SCRIPTS_ROOTDIR environment variable.
if (!defined $SCRIPTS_ROOTDIR) {
    $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"};
    die "Please set SCRIPTS_ROOTDIR or specify --scripts-root-dir" if !defined $SCRIPTS_ROOTDIR;
}
print STDERR "Using SCRIPTS_ROOTDIR: $SCRIPTS_ROOTDIR\n";

# supporting binaries from other packages
my $GIZA = "$BINDIR/GIZA++";
my $SNT2COOC = "$BINDIR/snt2cooc.out";
my $MKCLS = "$BINDIR/mkcls";

# supporting scripts/binaries from this package
my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
my $SYMAL = "$SCRIPTS_ROOTDIR/training/symal/symal";
my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl";
my $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/training/phrase-extract/score";

# utilities
my $ZCAT = "zcat";
my $BZCAT = "bzcat";

# do a sanity check to make sure we can find the necessary binaries since
# these are not installed by default
die("Cannot find mkcls, GIZA++, & snt2cooc.out in $BINDIR.\nDid you install this script using 'make release'?") unless (-x $GIZA && -x $SNT2COOC && -x $MKCLS);

# set variables to defaults or from options
my $___ROOT_DIR = ".";
$___ROOT_DIR = $_ROOT_DIR if $_ROOT_DIR;
my $___CORPUS_DIR = $___ROOT_DIR."/corpus";
$___CORPUS_DIR = $_CORPUS_DIR if $_CORPUS_DIR;
# --corpus may only be omitted when starting after step 1 (but not at step 8)
die("use --corpus to specify corpus") unless $_CORPUS || ($_FIRST_STEP && $_FIRST_STEP>1 && $_FIRST_STEP!=8);
my $___CORPUS = $_CORPUS;

my $___GIZA_EXTENSION = 'A3.final';
$___GIZA_EXTENSION = 'Ahmm.5' if $_HMM_ALIGN;
$___GIZA_EXTENSION = $_GIZA_EXTENSION if $_GIZA_EXTENSION;

my $___CORPUS_COMPRESSION = '';
if ($_CORPUS_COMPRESSION) {
    $___CORPUS_COMPRESSION = ".$_CORPUS_COMPRESSION";
}

# foreign/English language extension
die("use --f to specify foreign language") unless $_F;
die("use --e to specify English language") unless $_E;
my $___F = $_F;
my $___E = $_E;

# vocabulary files in corpus dir
my $___VCB_E = $___CORPUS_DIR."/".$___E.".vcb";
my $___VCB_F = $___CORPUS_DIR."/".$___F.".vcb";

# GIZA generated files
my $___GIZA = $___ROOT_DIR."/giza";
my $___GIZA_E2F = $___GIZA.".".$___E."-".$___F;
my $___GIZA_F2E = $___GIZA.".".$___F."-".$___E;
$___GIZA_E2F = $_GIZA_E2F if $_GIZA_E2F;
$___GIZA_F2E = $_GIZA_F2E if $_GIZA_F2E;
my $___GIZA_OPTION = "";
$___GIZA_OPTION = $_GIZA_OPTION if $_GIZA_OPTION;

# alignment heuristic
my $___ALIGNMENT = "grow-diag-final";
$___ALIGNMENT = $_ALIGNMENT if $_ALIGNMENT;
my $___NOTE_ALIGNMENT_DROPS = 1;

# model dir and alignment/extract file
my $___MODEL_DIR = $___ROOT_DIR."/model";
$___MODEL_DIR = $_MODEL_DIR if $_MODEL_DIR;
my $___ALIGNMENT_FILE = "$___MODEL_DIR/aligned";
$___ALIGNMENT_FILE = $_ALIGNMENT_FILE if $_ALIGNMENT_FILE;
my $___EXTRACT_FILE = $___MODEL_DIR."/extract";
$___EXTRACT_FILE = $_EXTRACT_FILE if $_EXTRACT_FILE;

my $___CONFIG = $___MODEL_DIR."/moses.ini";
$___CONFIG = $_CONFIG if $_CONFIG;

# Honour --dont-zip.  The previous code assigned in the wrong direction
# ($_DONT_ZIP = $___DONT_ZIP unless $___DONT_ZIP), which reset the user's
# option to 0 unconditionally.  Both variables are kept defined and equal,
# as they were before, so code reading either one keeps working.
my $___DONT_ZIP = 0;
$___DONT_ZIP = 1 if $_DONT_ZIP;
$_DONT_ZIP = $___DONT_ZIP;

my $___MAX_PHRASE_LENGTH = 7;
my $___LEXICAL_WEIGHTING = 1;
my $___LEXICAL_FILE = $___MODEL_DIR."/lex";
$___MAX_PHRASE_LENGTH = $_MAX_PHRASE_LENGTH if $_MAX_PHRASE_LENGTH;
$___LEXICAL_WEIGHTING = 0 if $_NO_LEXICAL_WEIGHTING;
$___LEXICAL_FILE = $_LEXICAL_FILE if $_LEXICAL_FILE;

my $___VERBOSE = 0;
my $___FIRST_STEP = 1;
my $___LAST_STEP = 9;
$___VERBOSE = $_VERBOSE if $_VERBOSE;
$___FIRST_STEP = $_FIRST_STEP if $_FIRST_STEP;
$___LAST_STEP = $_LAST_STEP if $_LAST_STEP;

# Language models are only validated when step 9 (config file) will run.
# Each --lm value is factor:order:filename[:type]; entries are stored as
# [ factor, order, filename, type ].
my @___LM = ();
if ($___LAST_STEP == 9) {
    die "use --lm factor:order:filename to specify at least one language model"
        if scalar @_LM == 0;
    foreach my $lm (@_LM) {
        my $type = 0; # default to srilm
        my ($f, $order, $filename);
        ($f, $order, $filename, $type) = split /:/, $lm, 4;
        $type = 0 unless $type;
        # Test definedness before matching so a short spec such as "0"
        # yields the format error without "uninitialized" warnings.
        die "Wrong format of --lm. Expected: --lm factor:order:filename"
            if !defined $f || !defined $order || !defined $filename
               || $f !~ /^[0-9]+$/ || $order !~ /^[0-9]+$/;
        die "Language model file not found or empty: $filename" if ! -s $filename;
        push @___LM, [ $f, $order, $filename, $type ];
    }
}

my $___PARTS = 1;
$___PARTS = $_PARTS if $_PARTS;

my $___DIRECTION = 0;
$___DIRECTION = $_DIRECTION if $_DIRECTION;

# don't fork
my $___NOFORK = !defined $_PARALLEL;

my $___ONLY_PRINT_GIZA = 0;
$___ONLY_PRINT_GIZA = 1 if $_ONLY_PRINT_GIZA;

# Reordering model (esp. lexicalized)
my $___REORDERING = "distance";
$___REORDERING = $_REORDERING if $_REORDERING;
my $___REORDERING_SMOOTH = 0.5;
$___REORDERING_SMOOTH = $_REORDERING_SMOOTH if $_REORDERING_SMOOTH;
my %REORDERING_MODEL;
my $REORDERING_LEXICAL = 0; # flag for building lexicalized reordering models
foreach my $r (split(/,/,$___REORDERING)) {
    $r =~ s/orientation/msd/;  # "orientation" is accepted as a synonym for "msd"
    if (!( $r eq "msd-f" ||
           $r eq "msd-fe" ||
           $r eq "msd-bidirectional-f" ||
           $r eq "msd-bidirectional-fe" ||
           $r eq "monotonicity-f" ||
           $r eq "monotonicity-fe" ||
           $r eq "monotonicity-bidirectional-f" ||
           $r eq "monotonicity-bidirectional-fe" ||
           $r eq "distance")) {
        print STDERR "unknown reordering type: $r";
        exit(1);
    }
    if ($r ne "distance") { $REORDERING_LEXICAL = 1; }
    $REORDERING_MODEL{$r}++;
    # also record which conditioning ("f" or "fe") is in use
    if ($r =~ /-f$/) { $REORDERING_MODEL{"f"}++; }
    if ($r =~ /-fe$/) { $REORDERING_MODEL{"fe"}++; }
}
my ($mono_previous_f,$swap_previous_f,$other_previous_f);
my ($mono_previous_fe,$swap_previous_fe,$other_previous_fe);
my ($mono_following_f,$swap_following_f,$other_following_f);
my ($mono_following_fe,$swap_following_fe,$other_following_fe);
my ($f_current,$e_current);

### Factored translation models
my $___ALIGNMENT_FACTORS = "0-0";
$___ALIGNMENT_FACTORS = $_ALIGNMENT_FACTORS if defined($_ALIGNMENT_FACTORS);
die("format for alignment factors is \"0-0\" or \"0,1,2-0,1\", you provided $___ALIGNMENT_FACTORS\n") if $___ALIGNMENT_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*$/;

my $___TRANSLATION_FACTORS = undef;
$___TRANSLATION_FACTORS = "0-0" unless defined($_DECODING_STEPS); # single factor default
$___TRANSLATION_FACTORS = $_TRANSLATION_FACTORS if defined($_TRANSLATION_FACTORS);
die("format for translation factors is \"0-0\" or \"0-0+1-1\" or \"0-0+0,1-0,1\", you provided $___TRANSLATION_FACTORS\n") if defined $___TRANSLATION_FACTORS && $___TRANSLATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;

my $___REORDERING_FACTORS = undef;
$___REORDERING_FACTORS = "0-0" if defined($_REORDERING) && ! defined($_DECODING_STEPS); # single factor default
$___REORDERING_FACTORS = $_REORDERING_FACTORS if defined($_REORDERING_FACTORS);
die("format for reordering factors is \"0-0\" or \"0-0+1-1\" or \"0-0+0,1-0,1\", you provided $___REORDERING_FACTORS\n") if defined $___REORDERING_FACTORS && $___REORDERING_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;

my $___GENERATION_FACTORS = undef;
$___GENERATION_FACTORS = $_GENERATION_FACTORS if defined($_GENERATION_FACTORS);
die("format for generation factors is \"0-1\" or \"0-1+0-2\" or \"0-1+0,1-1,2\", you provided $___GENERATION_FACTORS\n") if defined $___GENERATION_FACTORS && $___GENERATION_FACTORS !~ /^\d+(\,\d+)*\-\d+(\,\d+)*(\+\d+(\,\d+)*\-\d+(\,\d+)*)*$/;

my $___DECODING_STEPS = "t0";
$___DECODING_STEPS = $_DECODING_STEPS if defined($_DECODING_STEPS);
die("format for decoding steps is \"t0,g0,t1,g1:t2\", you provided $___DECODING_STEPS\n") if defined $_DECODING_STEPS && $_DECODING_STEPS !~ /^[tg]\d+(,[tg]\d+)*(:[tg]\d+(,[tg]\d+)*)*$/;

my ($factor,$factor_e,$factor_f);

my $alignment_id;

### MAIN
# Run every step in the inclusive range --first-step .. --last-step.

&prepare()                 if                       $___FIRST_STEP==1;
&run_giza()                if $___FIRST_STEP<=2 && $___LAST_STEP>=2;
&word_align()              if $___FIRST_STEP<=3 && $___LAST_STEP>=3;
&get_lexical_factored()    if $___FIRST_STEP<=4 && $___LAST_STEP>=4;
&extract_phrase_factored() if $___FIRST_STEP<=5 && $___LAST_STEP>=5;
&score_phrase_factored()   if $___FIRST_STEP<=6 && $___LAST_STEP>=6;
&get_reordering_factored() if $___FIRST_STEP<=7 && $___LAST_STEP>=7;
&get_generation_factored() if $___FIRST_STEP<=8 && $___LAST_STEP>=8;
&create_ini()              if                       $___LAST_STEP==9;

### (1) PREPARE CORPUS

# Step 1: create the corpus directory, reduce both sides of the corpus to
# the alignment factors, then build word classes, vocabularies and the
# numberized sentence files for both directions.  With --parallel the two
# reduce_factors calls and the two make_classes calls run in forked
# children; otherwise everything runs sequentially in this process.
sub prepare {
    print STDERR "(1) preparing corpus @ ".`date`;
    safesystem("mkdir -p $___CORPUS_DIR") or die;

    print STDERR "(1.0) selecting factors @ ".`date`;
    my ($factor_f,$factor_e) = split(/\-/,$___ALIGNMENT_FACTORS);
    my $corpus = $___CORPUS.".".$___ALIGNMENT_FACTORS;
    if ($___NOFORK) {
        &reduce_factors($___CORPUS.".".$___F.$___CORPUS_COMPRESSION,$corpus.".".$___F,$factor_f);
        &reduce_factors($___CORPUS.".".$___E.$___CORPUS_COMPRESSION,$corpus.".".$___E,$factor_e);

        &make_classes($corpus.".".$___F,$___VCB_F.".classes");
        &make_classes($corpus.".".$___E,$___VCB_E.".classes");

        my $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
        my $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);

        &numberize_txt_file($VCB_F,$corpus.".".$___F,
                            $VCB_E,$corpus.".".$___E,
                            $___CORPUS_DIR."/$___F-$___E-int-train.snt");

        &numberize_txt_file($VCB_E,$corpus.".".$___E,
                            $VCB_F,$corpus.".".$___F,
                            $___CORPUS_DIR."/$___E-$___F-int-train.snt");
    } else {
        print "Forking...\n";
        # child reduces the foreign side while the parent reduces English
        my $pid = fork();
        die "couldn't fork" unless defined $pid;
        if (!$pid) {
            &reduce_factors($___CORPUS.".".$___F.$___CORPUS_COMPRESSION,$corpus.".".$___F,$factor_f);
            exit 0;
        } else {
            &reduce_factors($___CORPUS.".".$___E.$___CORPUS_COMPRESSION,$corpus.".".$___E,$factor_e);
        }
        printf "Waiting for second reduce_factors process...\n";
        waitpid($pid, 0);

        # two children build the word classes while the parent builds the
        # vocabularies and numberized corpora
        my $pid2 = 0;
        $pid = fork();
        die "couldn't fork" unless defined $pid;
        if (!$pid) {
            &make_classes($corpus.".".$___F,$___VCB_F.".classes");
            exit 0;
        } # parent
        $pid2 = fork();
        die "couldn't fork again" unless defined $pid2;
        if (!$pid2) { #child
            &make_classes($corpus.".".$___E,$___VCB_E.".classes");
            exit 0;
        }

        my $VCB_F = &get_vocabulary($corpus.".".$___F,$___VCB_F);
        my $VCB_E = &get_vocabulary($corpus.".".$___E,$___VCB_E);

        &numberize_txt_file($VCB_F,$corpus.".".$___F,
                            $VCB_E,$corpus.".".$___E,
                            $___CORPUS_DIR."/$___F-$___E-int-train.snt");

        &numberize_txt_file($VCB_E,$corpus.".".$___E,
                            $VCB_F,$corpus.".".$___F,
                            $___CORPUS_DIR."/$___E-$___F-int-train.snt");
        printf "Waiting for mkcls processes to finish...\n";
        waitpid($pid2, 0);
        waitpid($pid, 0);
    }
}

# Write to $reduced a copy of corpus file $full in which every token keeps
# only the factors listed (comma-separated indices) in $factors.
# Waits while "$reduced.lock" exists, and returns early when $reduced is
# already present.  Input ending in .bz2 / .gz is read through bzcat / zcat.
sub reduce_factors {
    my ($full,$reduced,$factors) = @_;

    print STDERR "(1.0.5) reducing factors to produce $reduced @ ".`date`;
    while(-e $reduced.".lock") {
        sleep(10);
    }
    if (-e $reduced) {
        print STDERR " $reduced in place, reusing\n";
        return;
    }
    `touch $reduced.lock`;
    # my %INCLUDE;
    # foreach my $factor (split(/,/,$factors)) {
    #   $INCLUDE{$factor} = 1;
    # }
    my @INCLUDE = sort {$a <=> $b} split(/,/,$factors);
    my $read = $full;
    if ($full =~ /\.bz2$/) {
        $read = "$BZCAT $full|";
    } elsif ($full =~ /\.gz$/) {
        $read = "$ZCAT $full|";
    }
    open(IN,$read) or die "Can't read $full ($read)";
    open(OUT,">".$reduced) or die "Can't write $reduced";
    my $nr = 0;
    while(<IN>) {
        $nr++;
        # progress indicator on STDERR
        print STDERR "." if $nr % 10000 == 0;
        print STDERR "($nr)" if $nr % 100000 == 0;
        chomp;
        # normalise spacing before splitting into tokens
        s/ +/ /g;
        s/^ //;
        s/ $//;
        my $first = 1;
        foreach (split) {
            my @FACTOR = split /\Q$___FACTOR_DELIMITER/;
            # \Q causes to disable metacharacters in regex
            print OUT " " unless $first;
            $first = 0;
            my $first_factor = 1;
            foreach my $outfactor (@INCLUDE) {
                print OUT "|" unless $first_factor;
                $first_factor = 0;
                # NOTE(review): the visible source ends here; the remainder of
                # reduce_factors is not shown and is deliberately not reconstructed.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -