📄 filter-and-binarize-model-given-input.pl.svn-base
字号:
#!/usr/bin/perl -w

# $Id$
# Given a moses.ini file and an input text prepare minimized translation
# tables and a new moses.ini, so that loading of tables is much faster.

# original code by Philipp Koehn
# changes by Ondrej Bojar and others

use strict;
use warnings;

# Consider phrases in input up to this length.
# In other words, all phrase-tables will be truncated at least to 10 words
# per phrase.
my $MAX_LENGTH = 10;

my $binarizer = shift; # use mosesdecoder/misc/processPhraseTable for that
my $dir       = shift;
my $config    = shift;
my $input     = shift;

if (!defined $dir || !defined $config || !defined $input) {
    print STDERR "usage: filter-and-binarize-model-given-input.pl binarizer targetdir moses.ini input.text\n";
    print STDERR "...use mosesdecoder/misc/processPhraseTable as the binarizer\n";
    exit 1;
}
$dir = ensure_full_path($dir);

# buggy directory in place?
if (-d $dir && ! -e "$dir/info") {
    print STDERR "The directory $dir exists but does not belong to me. Delete $dir!\n";
    exit(1);
}

# already filtered? check if it can be re-used
if (-d $dir) {
    open(my $info_in, '<', "$dir/info") or die "Can't read $dir/info: $!";
    my @INFO = <$info_in>;
    close($info_in);
    # chomp, not chop: only strip a trailing newline, never a real character.
    chomp(@INFO);
    # A truncated info file counts as a mismatch rather than raising warnings.
    my $old_config = defined $INFO[0] ? $INFO[0] : '';
    my $old_input  = defined $INFO[1] ? $INFO[1] : '';
    if ($old_config ne $config
        || ($old_input ne $input && $old_input.".tagged" ne $input)) {
        print STDERR "WARNING: directory exists but does not match parameters:\n";
        print STDERR " ($old_config ne $config || $old_input ne $input)\n";
        exit 1;
    }
    print STDERR "The filtered model was ready in $dir, not doing anything.\n";
    exit 0;
}

# filter the translation and distortion tables
# (list form: the directory name is not re-parsed by a shell)
safesystem("mkdir", "-p", $dir) or die "Can't mkdir $dir";

# get tables to be filtered (and modify config file)
# The four @TABLE* arrays are parallel: index i describes the i-th table.
my (@TABLE, @TABLE_WEIGHTS, @TABLE_FACTORS, @TABLE_NEW_NAME);
# %CONSIDER_FACTORS: set of source factor specs (e.g. "0" or "0,1") in use.
# %BINARIZABLE: indices into @TABLE of phrase tables (not distortion tables).
my (%CONSIDER_FACTORS, %BINARIZABLE);

open(my $ini_out, '>', "$dir/moses.ini") or die "Can't write $dir/moses.ini: $!";
open(my $ini_in, '<', $config) or die "Can't read $config: $!";
my $ttable_iterator = 0;
# Which table list we are currently inside: 'ttable', 'distortion' or ''.
my $section = '';
while (my $line = <$ini_in>) {
    if ($section eq 'ttable'
        && $line =~ /^([\d\,\-]+) ([\d\,\-]+) (\d+) (\S+)$/) {
        my ($source_factor, $t, $weights, $file) = ($1, $2, $3, $4);
        push @TABLE, $file;
        push @TABLE_WEIGHTS, $weights;
        $BINARIZABLE{$#TABLE}++;
        my $new_name = "$dir/phrase-table.$source_factor-$t-$ttable_iterator";
        print {$ini_out} "$source_factor $t $weights $new_name\n";
        push @TABLE_NEW_NAME, $new_name;
        $CONSIDER_FACTORS{$source_factor} = 1;
        print STDERR "Considering factor $source_factor\n";
        push @TABLE_FACTORS, $source_factor;
        $ttable_iterator++;
        next;
    }
    if ($section eq 'distortion'
        && $line =~ /^([\d\,\-]+) (\S+) (\d+) (\S+)$/) {
        my ($factors, $t, $weights, $file) = ($1, $2, $3, $4);
        # "src-tgt" factor spec: only the source side matters for filtering.
        my $source_factor = $factors;
        $source_factor =~ s/\-\d+$//;
        push @TABLE, $file;
        push @TABLE_WEIGHTS, $weights;
        # The filtered copy lives in $dir under the original base name,
        # uncompressed. The .gz suffix is stripped from the base name only,
        # so a target directory containing ".gz" is left intact.
        my $base_name = $file;
        $base_name =~ s/^.*\/+([^\/]+)/$1/g;
        $base_name =~ s/\.gz$//;
        my $new_name = "$dir/$base_name";
        print {$ini_out} "$factors $t $weights $new_name\n";
        push @TABLE_NEW_NAME, $new_name;
        $CONSIDER_FACTORS{$source_factor} = 1;
        print STDERR "Considering factor $source_factor\n";
        push @TABLE_FACTORS, $source_factor;
        next;
    }

    # Any other line ends the current table list and is copied verbatim.
    # It is also checked for a section header, so a header that directly
    # follows a table list (no blank line in between) is still recognised.
    $section = '';
    print {$ini_out} $line;
    if ($line =~ /ttable-file\]/) {
        $section = 'ttable';
    }
    elsif ($line =~ /distortion-file/) {
        $section = 'distortion';
    }
}
close($ini_in);
close($ini_out) or die "Can't finish writing $dir/moses.ini: $!";

# get the phrase pairs appearing in the input text, up to the $MAX_LENGTH
# $PHRASE_USED{factor spec}{phrase} is set for every input n-gram.
my %PHRASE_USED;
my @factor_sets = map { [ $_, [ split(/,/, $_) ] ] } keys %CONSIDER_FACTORS;
open(my $input_fh, '<', $input) or die "Can't read $input: $!";
while (my $line = <$input_fh>) {
    chomp($line);
    my @words = split(/ +/, $line);
    foreach my $factor_set (@factor_sets) {
        my ($factor_key, $indices) = @$factor_set;

        # Project every word onto the selected factors once per line,
        # instead of re-splitting it for every n-gram that contains it.
        my @tokens;
        foreach my $word (@words) {
            my @word_factors = split(/\|/, $word);
            push @tokens, join('|',
                map { defined $word_factors[$_] ? $word_factors[$_] : '' }
                    @$indices);
        }

        for my $start (0 .. $#tokens) {
            my $last = $start + $MAX_LENGTH - 1;
            $last = $#tokens if $last > $#tokens;
            my $phrase;
            for my $end ($start .. $last) {
                $phrase = defined $phrase
                    ? "$phrase $tokens[$end]"
                    : $tokens[$end];
                $PHRASE_USED{$factor_key}{$phrase}++;
            }
        }
    }
}
close($input_fh);

# filter files
for my $i (0 .. $#TABLE) {
    my ($used, $total) = (0, 0);
    my $file     = $TABLE[$i];
    my $factors  = $TABLE_FACTORS[$i];
    my $new_file = $TABLE_NEW_NAME[$i];
    print STDERR "filtering $file -> $new_file...\n";

    my $table_in = open_table($file);
    open(my $table_out, '>', $new_file) or die "Can't write $new_file: $!";
    while (my $entry = <$table_in>) {
        # The source phrase is everything before the first " ||| ".
        my ($foreign) = split(/ \|\|\| /, $entry, 2);
        $foreign =~ s/ $//;
        if (defined($PHRASE_USED{$factors}{$foreign})) {
            print {$table_out} $entry;
            $used++;
        }
        $total++;
    }
    # For a zcat pipe, close() reports a failed decompression; stop rather
    # than continue with a silently truncated table.
    close($table_in)
        or die "Error while reading $file (status $?): $!";
    close($table_out) or die "Can't finish writing $new_file: $!";
    die "No phrases found in $file!" if $total == 0;
    printf STDERR "$used of $total phrases pairs used (%.2f%s) - note: max length $MAX_LENGTH\n",(100*$used/$total),'%';

    if ($BINARIZABLE{$i}) {
        print STDERR "binarizing...";
        my $cmd = "cat $new_file | LC_ALL=C sort -T $dir | $binarizer -ttable 0 0 - -nscores $TABLE_WEIGHTS[$i] -out $new_file";
        print STDERR $cmd."\n";
        print STDERR `$cmd`;
        # $? holds the status of the last pipeline stage (the binarizer).
        # Stop before the info file is written, so a failed run is never
        # mistaken for a finished model on the next invocation.
        die "Binarizing $new_file failed (status $?)" if $? != 0;
    }
}

# The info file marks the directory as complete; it is written last.
open(my $info_out, '>', "$dir/info") or die "Can't write $dir/info: $!";
print {$info_out} "$config\n$input\n";
close($info_out) or die "Can't finish writing $dir/info: $!";

print "To run the decoder, please call: moses -f $dir/moses.ini < $input\n";

# Open a table for reading and return the filehandle.
# Prefers "$file.gz" (through zcat) when $file itself has no .gz suffix and
# the compressed sibling exists; reads a *.gz file through zcat; otherwise
# opens the plain file. Dies if the open fails.
sub open_table {
    my ($file) = @_;
    my $fh;
    if ($file !~ /\.gz$/ && -e "$file.gz") {
        open($fh, '-|', 'zcat', "$file.gz")
            or die "Can't open 'zcat $file.gz |': $!";
    }
    elsif ($file =~ /\.gz$/) {
        open($fh, '-|', 'zcat', $file)
            or die "Can't open 'zcat $file |': $!";
    }
    else {
        open($fh, '<', $file) or die "Can't open '< $file': $!";
    }
    return $fh;
}

# Run a command via system(), logging it to STDERR.
# Exits the script with status 1 if the command could not be started or was
# killed by a signal. Otherwise returns true when the exit code was 0 and
# false (after logging the exit code) when it was non-zero.
sub safesystem {
    print STDERR "Executing: @_\n";
    system(@_);
    if ($? == -1) {
        print STDERR "Failed to execute: @_\n $!\n";
        exit(1);
    }
    elsif ($? & 127) {
        # The command is passed as an argument, not interpolated into the
        # format, so a '%' inside it cannot corrupt the message.
        printf STDERR "Execution of: %s\n died with signal %d, %s coredump\n",
            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    else {
        my $exitcode = $? >> 8;
        print STDERR "Exit code: $exitcode\n" if $exitcode;
        return ! $exitcode;
    }
}

# Turn a possibly relative path into a cleaned-up absolute one.
# Absolute paths are returned unchanged. Relative paths are prefixed with the
# output of `pawd` (falling back to `pwd`), then "/./", repeated slashes,
# "dir/.." pairs and trailing slashes are removed textually.
sub ensure_full_path {
    my $PATH = shift;
    return $PATH if $PATH =~ /^\//;
    my $cwd = `pawd 2>/dev/null`;
    if (!$cwd) { $cwd = `pwd`; }
    chomp $cwd;
    $PATH = $cwd."/".$PATH;
    $PATH =~ s/[\r\n]//g;
    $PATH =~ s/\/\.\//\//g;
    $PATH =~ s/\/+/\//g;
    # Bounded loop: collapse "x/../" pairs, giving up after 10 rounds.
    my $sanity = 0;
    while ($PATH =~ /\/\.\.\// && $sanity++ < 10) {
        $PATH =~ s/\/+/\//g;
        $PATH =~ s/\/[^\/]+\/\.\.\//\//g;
    }
    $PATH =~ s/\/[^\/]+\/\.\.$//;
    $PATH =~ s/\/+$//;
    return $PATH;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -