📄 detokenizer.perl.svn-base

📁 moses开源的机器翻译系统
💻 SVN-BASE
字号:
#!/usr/bin/perl -w

# Sample De-Tokenizer
# written by Josh Schroeder, based on code by Philipp Koehn
#
# Reads tokenized text (one sentence per line) from STDIN and writes the
# detokenized text to STDOUT.  Lines that look like XML/HTML tags and blank
# lines are passed through untouched.
#
# Options:
#   -l LANG   language whose rules should be applied (cs, en or fr; default en)
#   -q        quiet: do not print the version/language banner to STDERR
#   -h        print usage and exit

use strict;
use warnings;

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");

my $language = "en";
my $QUIET    = 0;
my $HELP     = 0;

# Minimal hand-rolled option parsing; unknown arguments are silently ignored.
while (@ARGV) {
    my $arg = shift @ARGV;
    if ($arg =~ /^-l$/) {
        $language = shift @ARGV;
    }
    elsif ($arg =~ /^-q$/) {
        $QUIET = 1;
    }
    elsif ($arg =~ /^-h$/) {
        $HELP = 1;
    }
}

if ($HELP) {
    print "Usage ./detokenizer.perl (-l [en|de|...]) < tokenizedfile > detokenizedfile\n";
    exit;
}

die "No built-in rules for language $language, claim en for default behaviour."
    if $language !~ /^(cs|en|fr)$/;

if (!$QUIET) {
    # '$Revision$' is single-quoted on purpose: it is an SVN keyword, not a Perl variable.
    print STDERR "Detokenizer Version ".'$Revision$'."\n";
    print STDERR "Language: $language\n";
}

while (<STDIN>) {
    if (/^<.+>$/ || /^\s*$/) {
        # don't try to detokenize XML/HTML tag lines (or blank lines)
        print $_;
    }
    else {
        print detokenize($_);
    }
}

# detokenize($text [, $lang])
#
# Re-attaches punctuation, currency symbols, quotes and language-specific
# contractions in a space-separated token string.
#
#   $text  one line of tokenized text (a trailing newline is optional)
#   $lang  optional language code; defaults to the language selected with -l
#
# Returns the detokenized line, always terminated by exactly one "\n".
sub detokenize {
    my ($text, $lang) = @_;
    $lang = $language unless defined $lang;

    chomp($text);
    $text = " $text ";

    # The leading space yields an empty first token, so real tokens start at
    # index 1; split drops trailing empty fields, so the last token is real.
    my @words = split(/ /, $text);
    $text = "";

    # Number of "opening-style" uses seen so far for each quote character;
    # an even count means the next quote opens, an odd count means it closes.
    my %quoteCount   = ("'" => 0, "\"" => 0);
    my $prependSpace = " ";

    for (my $i = 0; $i < scalar(@words); $i++) {
        my $word = $words[$i];

        if ($word =~ /^[\p{IsSc}\(\[\{\x{BF}\x{A1}]+$/) {
            # perform right shift on currency and other opening punctuation
            # (\x{BF} / \x{A1} are the inverted question / exclamation marks,
            # written as escapes so the match does not depend on the source
            # file's encoding)
            $text .= $prependSpace . $word;
            $prependSpace = "";
        }
        elsif ($word =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/) {
            # perform left shift on punctuation items
            $text .= $word;
            $prependSpace = " ";
        }
        elsif (($lang eq "en") && ($i > 0)
                && ($word =~ /^[\'][\p{IsAlpha}]/)
                && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
            # left-shift the contraction for English
            $text .= $word;
            $prependSpace = " ";
        }
        elsif (($lang eq "fr") && ($i + 1 <= $#words)
                && ($word =~ /[\p{IsAlpha}][\']$/)
                && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
            # right-shift the contraction for French; the bound only needs
            # one following token to exist
            $text .= $prependSpace . $word;
            $prependSpace = "";
        }
        elsif (($lang eq "cs") && ($i + 2 <= $#words)
                && ($word =~ /[\p{IsAlpha}]$/)
                && ($words[$i+1] =~ /^[-\x{2013}]$/)
                && ($words[$i+2] =~ /^li$/i)) {
            # right-shift "-li" in Czech (hyphen or en dash, U+2013)
            $text .= $prependSpace . $word . $words[$i+1];
            $i++;    # advance over the dash
            $prependSpace = "";
        }
        elsif ($word =~ /^[\'\"]+$/) {
            # combine punctuation smartly
            # multi-character quote tokens (e.g. '') have no preset counter
            $quoteCount{$word} ||= 0;

            if (($quoteCount{$word} % 2) == 0) {
                if (($lang eq "en") && ($word eq "'") && ($i > 0)
                        && ($words[$i-1] =~ /[s]$/)) {
                    # single quote for possessives ending in s... "The Jones' house"
                    # left shift (deliberately not counted as a quote)
                    $text .= $word;
                    $prependSpace = " ";
                }
                else {
                    # opening quote: right shift
                    $text .= $prependSpace . $word;
                    $prependSpace = "";
                    $quoteCount{$word}++;
                }
            }
            else {
                # closing quote: left shift
                $text .= $word;
                $prependSpace = " ";
                $quoteCount{$word}++;
            }
        }
        else {
            # ordinary token
            $text .= $prependSpace . $word;
            $prependSpace = " ";
        }
    }

    # clean up spaces at head and tail of each line as well as any double-spacing
    $text =~ s/ +/ /g;
    $text =~ s/\n /\n/g;
    $text =~ s/ \n/\n/g;
    $text =~ s/^ //g;
    $text =~ s/ $//g;

    # add trailing break
    $text .= "\n" unless $text =~ /\n$/;

    return $text;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -