📄 eng_morph.pl
字号:
#!/usr/bin/env perl
##
## Purpose: Morph English sentences
##
## Usage: $0 input_file eng_morph_table > output
##
## Input: each line of input_file is either a word sequence
##        or a word/pos sequence.
##
##        POS tags come from PTB.
##
## Output: the distinct stems found in the input, sorted, one per line
##         (progress messages go to STDERR).
##
use strict;
use warnings;

main();

1;

##############
## Entry point: reads the morph table, stems every token of the input
## file and prints the sorted set of distinct stems to STDOUT.
##
## Dies when fewer than two arguments are given or when either file
## cannot be opened.
##
sub main {
    if (@ARGV < 2) {
        die "usage: input_file eng_morph_table > output\n";
    }

    my ($input_file, $morph_table_file) = @ARGV;

    ######### step 0: open input file
    open(my $input_fp, '<', $input_file)
        or die "cannot open $input_file\n";

    ######### step 1: read morph table
    my %eng_morph_table = ();
    open(my $morph_table_fp, '<', $morph_table_file)
        or die "cannot open $morph_table_file\n";

    my ($entry_num, $morph_line)
        = read_morph_table($morph_table_fp, \%eng_morph_table);
    close($morph_table_fp);

    print STDERR "finish reading $morph_line lines and $entry_num entries from $morph_table_file\n";

    ##### step 2: morph every sentence
    my %keywords = ();
    while (my $input_line = <$input_fp>) {
        chomp($input_line);

        # take care of special cases
        $input_line =~ s/n't/ not/g;    # n't problem

        # Blank out all punctuation except "-", "*", "^" and "/".
        # "/" has to survive because it separates a word from its POS
        # tag; "^" is kept only because earlier versions kept it.
        $input_line =~ s{[^\w\^*/-]}{ }g;

        # A slash that lost its word or its tag to the step above
        # (e.g. "./." became " / ") is not a word/tag separator.
        $input_line =~ s{(?<!\S)/+|/+(?!\S)}{ }g;

        # split ' ' ignores leading whitespace, so no empty token is
        # produced for lines that start with blanked-out punctuation.
        foreach my $part (split(' ', $input_line)) {
            my $res;
            if ($part =~ m{^(.+)/(.+)$}) {
                my $word = $1;
                my $tag  = $2;
                $res = morph_word_w_pos_tag($word, $tag, \%eng_morph_table);
            }
            else {
                $res = morph_word($part, \%eng_morph_table);
            }
            $keywords{$res} += 1;
        }
    }
    close($input_fp);

    print STDERR "All done\n";

    foreach my $keyword (sort keys %keywords) {
        print $keyword . "\n";
    }
}

###### the following is copied from eng_morph.pm

#### Morph a word whose PTB POS tag is known.
####
####   $word            - the surface form
####   $tag             - its POS tag; only tags containing VBD, VBN,
####                      VBZ, VBG or NNS are morphed
####   $morph_table_ptr - hash ref mapping a lower-cased word to its root
####
#### Returns the root from the table when the lower-cased word is
#### listed there, otherwise the result of the suffix rules below
#### (the word itself when no rule applies).
sub morph_word_w_pos_tag {
    my ($word, $tag, $morph_table_ptr) = @_;

    my $res  = $word;
    my $cons = 'b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z';

    if ($tag !~ /(?:VBD|VBN|VBZ|VBG|NNS)/) {
        return $res;
    }

    my $lc_word = $word;
    $lc_word =~ tr/A-Z/a-z/;

    #### check the morph table
    my $tmp = $morph_table_ptr->{$lc_word};
    if (defined($tmp)) {
        return $tmp;
    }

    # clitics are left alone
    if ($word =~ /^'(?:d|s|ll)$/) {
        return $res;
    }

    if ($tag =~ /(?:VBZ|NNS)/) {
        ### 3rd person or plural
        $res =~ s/ies$/y/i;                     ## tries   => try
        $res =~ s/($cons)ves$/$1f/i;            ## shelves => shelf
        $res =~ s/(ch|sh|s|z|x)es$/$1/i;        ## teaches => teach
        $res =~ s/($cons)(o)es$/$1$2/i;         ## goes    => go
        $res =~ s/(.{2,})s$/$1/i;
        return $res;
    }

    if ($tag =~ /VBG/) {
        # Anchored at the end of the word so that only a real "-ing"
        # suffix is removed, as in morph_word.
        $res =~ s/(.{2,})ing$/$1/i;
        if ($res =~ /(.)\1$/) {
            $res =~ s/.$//;                     ## hitting => hit
        }
        return $res;
    }

    if ($tag =~ /(?:VBD|VBN)/) {
        $res =~ s/($cons)ied$/$1y/i;
        $res =~ s/(.{2,})ed$/$1/i;
        if ($res =~ /(.)\1$/) {
            $res =~ s/.$//;                     ## planned => plan
        }
        return $res;
    }

    return $res;
}

#### Morph a word without knowing the POS tag.
####
####   $word            - the surface form
####   $morph_table_ptr - hash ref mapping a lower-cased word to its root
####
#### Returns the root from the table when the lower-cased word is
#### listed there; words shorter than four characters are otherwise
#### returned unchanged; everything else goes through the suffix rules.
sub morph_word {
    my ($word, $morph_table_ptr) = @_;

    my $res  = $word;
    my $cons = 'b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z';

    my $lc_word = $word;
    $lc_word =~ tr/A-Z/a-z/;

    #### check the morph table
    my $tmp = $morph_table_ptr->{$lc_word};
    if (defined($tmp)) {
        return $tmp;
    }

    ## do not morph very short words with the rules
    if (length($word) < 4) {
        return $res;
    }

    # Clitics are shorter than four characters, so the guard above
    # already returns them; the test is kept to mirror
    # morph_word_w_pos_tag.
    if ($word =~ /^'(?:d|s|ll)$/) {
        return $res;
    }

    ### 3rd person or plural
    $res =~ s/ies$/y/i;                         ## tries   => try
    $res =~ s/($cons)ves$/$1f/i;                ## shelves => shelf
    $res =~ s/(ch|sh|s|z|x)es$/$1/i;            ## teaches => teach
    $res =~ s/($cons)(o)es$/$1$2/i;             ## goes    => go

    ### those rules are very risky, so each one only fires when
    ### nothing has changed the word yet
    if ($res eq $word) {
        $res =~ s/(.{4,})s$/$1/i;               ## breaks  => break
    }
    if ($res eq $word) {
        $res =~ s/(.{4,})ing$/$1/i;             ## waiting => wait
    }

    $res =~ s/($cons)ied$/$1y/i;                ## worried => worry
    $res =~ s/(.{2,})ed$/$1/i;                  ## waited  => wait

    # undo consonant doubling, but only if a suffix was really removed
    if (($res ne $word) && ($res =~ /($cons)\1$/i)) {
        $res =~ s/($cons)$//;                   ## planned => plan, hitting => hit
    }

    return $res;
}

## Read the morph table from an open file handle into a hash.
##
##   $fp       - file handle to read; every non-blank line holds a root
##               followed by whitespace-separated word forms
##   $hash_ptr - hash ref that receives form => root
##
## When a form is listed under two different roots, the first root is
## kept and a message is written to STDERR.
##
## Returns (number of entries added to the hash,
##          number of non-blank lines read).
sub read_morph_table {
    my ($fp, $hash_ptr) = @_;

    my $cnt      = 0;
    my $line_num = 0;    # counts non-blank lines only

    while (my $line = <$fp>) {
        chomp($line);
        next if ($line =~ /^\s*$/);

        # split ' ' discards leading and trailing whitespace
        my ($root, @forms) = split(' ', $line);
        $line_num++;

        foreach my $wd (@forms) {
            my $tmp = $hash_ptr->{$wd};
            if (defined($tmp)) {
                if ($tmp ne $root) {
                    print STDERR "$wd has two roots: $tmp and $root: keep $tmp only\n";
                }
            }
            else {
                $hash_ptr->{$wd} = $root;
                $cnt++;
            }
        }
    }

    return ($cnt, $line_num);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -