⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 eng_morph.pl

📁 一个基于PERL的语言处理工具包. 其中包含文本处理,词汇过滤及一个英语词汇词法处理工具(STEMMER),适用于处理一些基本的.TXT文件.
💻 PL
字号:
#!/usr/bin/env perl
##
## Purpose: Morph (stem) English words.
##
## Usage:   $0 input_file eng_morph_table > output
##
## Input:   input_file holds one sentence per line, either as a plain word
##          sequence or as a word/pos sequence (POS tags come from PTB).
##          eng_morph_table holds one entry per line: the root followed by
##          the surface forms that map to it, separated by whitespace.
##
## Output:  the distinct stems found in the input, sorted, one per line on
##          STDOUT.  Progress messages go to STDERR.
##
use strict;

# Run only when executed as a script, so the subs below can also be loaded
# from another file (e.g. for testing) without triggering the usage check.
main() unless caller;

1;

##############
## Entry point: read the morph table, stem every token of the input file and
## print the sorted set of resulting stems.
## Dies with a usage message when fewer than two arguments are given, and
## with the system error when either file cannot be opened.
sub main {
    if (@ARGV < 2) {
        die "usage: input_file eng_morph_table > output\n";
    }
    my ($input_file, $morph_table_file) = @ARGV;

    ######### step 0: open the input file
    # Three-argument open with an explicit mode: the file name can never be
    # interpreted as a pipe or mode string, and failures are reported.
    open(my $input_fp, '<', $input_file)
        or die "cannot open $input_file: $!\n";

    ######### step 1: read morph table
    my %eng_morph_table = ();
    open(my $morph_table_fp, '<', $morph_table_file)
        or die "cannot open $morph_table_file: $!\n";
    my ($entry_num, $morph_line) =
        read_morph_table($morph_table_fp, \%eng_morph_table);
    close($morph_table_fp);

    print STDERR "finish reading $morph_line lines and $entry_num entries from    $morph_table_file\n";

    ##### step 2: morph every sentence, collecting stem => occurrence count
    my %keywords = ();
    while (my $input_line = <$input_fp>) {
        chomp($input_line);

        # take care of special cases
        $input_line =~ s/n't/ not/g;    # n't problem

        # Replace every character that is not a digit, a word character,
        # '^', '*' or '-' with a space.
        # NOTE(review): this also blanks '/' and "'", so word/TAG tokens are
        # split before they can reach the tagged branch below -- confirm
        # whether '/' should be preserved for tagged input.
        $input_line =~ s/[^\d^\w^\*^-]/ /g;

        # split ' ' (awk mode) drops the leading empty field that
        # split /\s+/ produces when the line starts with whitespace, so no
        # empty keyword is recorded.
        foreach my $part (split(' ', $input_line)) {
            my $res;
            if ($part =~ /^(.+)\/(.+)$/) {
                my ($word, $tag) = ($1, $2);
                $res = morph_word_w_pos_tag($word, $tag, \%eng_morph_table);
            } else {
                $res = morph_word($part, \%eng_morph_table);
            }
            $keywords{$res} += 1;
        }
    }
    close($input_fp);

    print STDERR "All done\n";

    foreach my $keyword (sort keys %keywords) {
        print "$keyword\n";
    }
}

###### the following is copied from eng_morph.pm

## Morph a word whose PTB POS tag is known.
##   $word            - surface form
##   $tag             - POS tag; only VBD, VBN, VBZ, VBG and NNS are morphed
##   $morph_table_ptr - hash ref mapping lower-cased surface form => root
## Returns the root from the table when present, otherwise the result of the
## suffix rules for the tag, otherwise the word unchanged.
sub morph_word_w_pos_tag {
    my ($word, $tag, $morph_table_ptr) = @_;

    my $res  = $word;
    my $cons = 'b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z';

    # Other tags carry no inflection handled here.
    if ($tag !~ /(VBD|VBN|VBZ|VBG|NNS)/) {
        return $res;
    }

    # The table lookup is done on the lower-cased word.
    my $tmp = $morph_table_ptr->{lc $word};
    if (defined($tmp)) {
        return $tmp;
    }

    # Clitics are left untouched.
    if ($word =~ /^\'(d|s|ll)$/) {
        return $res;
    }

    if ($tag =~ /(VBZ|NNS)/) {
        ### 3rd person or plural
        $res =~ s/ies$/y/i;               ## tries => try
        $res =~ s/($cons)ves$/$1f/i;      ## shelves => shelf
        $res =~ s/(ch|sh|s|z|x)es$/$1/i;  ## teaches => teach
        $res =~ s/($cons)(o)es$/$1$2/i;   ## goes => go
        $res =~ s/((.){2,})s$/$1/i;
        return $res;
    }

    if ($tag =~ /VBG/) {
        # Anchored at the end of the word, like the same rule in morph_word.
        $res =~ s/((.){2,})ing$/$1/i;
        # Undouble the final letter only when a suffix was really removed,
        # so an unchanged word such as "fell" is not truncated.
        if ($res ne $word && $res =~ /(.)\1$/) {
            $res =~ s/(.)$//;             ## hitting => hit
        }
        return $res;
    }

    if ($tag =~ /(VBD|VBN)/) {
        $res =~ s/($cons)ied$/$1y/i;
        $res =~ s/((.){2,})ed$/$1/i;
        if ($res ne $word && $res =~ /(.)\1$/) {
            $res =~ s/(.)$//;             ## planned => plan
        }
        return $res;
    }

    return $res;
}

#### Morph a word without knowing the POS tag.
##   $word            - surface form
##   $morph_table_ptr - hash ref mapping lower-cased surface form => root
## Returns the root from the table when present; words shorter than four
## characters are returned unchanged; otherwise the suffix rules are applied.
sub morph_word {
    my ($word, $morph_table_ptr) = @_;

    my $res  = $word;
    my $cons = 'b|c|d|f|g|h|j|k|l|m|n|p|q|r|s|t|v|w|x|z';

    #### check the morph table
    my $tmp = $morph_table_ptr->{lc $word};
    if (defined($tmp)) {
        return $tmp;
    }

    ## do not morph very short words with the rules
    if (length($word) < 4) {
        return $res;
    }

    if ($word =~ /^\'(d|s|ll)$/) {
        return $res;
    }

    ### 3rd person or plural
    $res =~ s/ies$/y/i;                   ## tries => try
    $res =~ s/($cons)ves$/$1f/i;          ## shelves => shelf
    $res =~ s/(ch|sh|s|z|x)es$/$1/i;      ## teaches => teach
    $res =~ s/($cons)(o)es$/$1$2/i;       ## goes => go

    if ($res eq $word) {
        ### those rules are very risky
        $res =~ s/((.){4,})s$/$1/i;       ## breaks => break
    }

    if ($res eq $word) {
        $res =~ s/((.){4,})ing$/$1/i;     ## waiting => wait
    }

    $res =~ s/($cons)ied$/$1y/i;          ## worried => worry
    $res =~ s/((.){2,})ed$/$1/i;          ## waited => wait

    # After a suffix was removed, undouble a doubled final consonant.
    if (($res ne $word) && ($res =~ /($cons)\1$/i)) {
        $res =~ s/($cons)$//;             ## planned => plan, hitting => hit
    }

    return $res;
}

## Read the morph table from an open file handle into a hash.
##   $fp       - file handle positioned at the start of the table
##   $hash_ptr - hash ref filled with surface form => root
## Each non-blank line is "root form1 form2 ...".  When a form is listed
## under two different roots the first root is kept and a warning is
## written to STDERR.
## Returns (number of entries added to the hash, number of non-blank lines).
sub read_morph_table {
    my ($fp, $hash_ptr) = @_;

    my $cnt      = 0;
    my $line_num = 0;    # was "my $line_num ++", which started the count at 1

    while (<$fp>) {
        chomp;
        next if (/^\s*$/);

        s/\s+$//;
        s/^\s+//;

        my ($root, @forms) = split(/\s+/);
        $line_num++;

        foreach my $wd (@forms) {
            my $known = $hash_ptr->{$wd};
            if (defined($known)) {
                if ($known ne $root) {
                    print STDERR "$wd has two roots: $known and $root: keep $known only\n";
                }
            } else {
                $hash_ptr->{$wd} = $root;
                $cnt++;
            }
        }
    }

    return ($cnt, $line_num);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -