⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 doc2mat

📁 用perl编写的文本转换为向量空间模型的程序
💻
📖 第 1 页 / 共 2 页
字号:
#!/usr/bin/perl## doc2mat## This file contains a simple program for creating a CLUTO-compatible # mat-file from a set of documents. # For more information on how to use it do a 'doc2mat -help'# # V1.0.0        Wed Sep 11 23:13:55 CDT 2002 #use Getopt::Long;use Pod::Usage;#use Pod::Html;#pod2html("doc2mat", "--outfile=doc2mat.html");#==============================================================================# Built-in stop list#==============================================================================%stop_list = ('different','1', 'n','1', 'necessary','1', 'need','1', 'needed','1', 'needing','1', 'newest','1',              'next','1', 'no','1', 'nobody','1', 'non','1', 'noone','1', 'not','1', 'nothing','1', 'now','1',              'nowhere','1', 'of','1', 'off','1', 'often','1', 'new','1', 'old','1', 'older','1', 'oldest','1',              'on','1', 'once','1', 'one','1', 'only','1', 'open','1', 'again','1', 'among','1', 'already','1',              'about','1', 'above','1', 'against','1', 'alone','1', 'after','1', 'also','1', 'although','1',              'along','1', 'always','1', 'an','1', 'across','1', 'b','1', 'and','1', 'another','1', 'ask','1',              'c','1', 'asking','1', 'asks','1', 'backed','1', 'away','1', 'a','1', 'should','1', 'show','1',              'came','1', 'all','1', 'almost','1', 'before','1', 'began','1', 'back','1', 'backing','1',              'be','1', 'became','1', 'because','1', 'becomes','1', 'been','1', 'at','1', 'behind','1',              'being','1', 'best','1', 'better','1', 'between','1', 'big','1', 'showed','1', 'ended','1',              'ending','1', 'both','1', 'but','1', 'by','1', 'asked','1', 'backs','1', 'can','1', 'cannot','1',              'number','1', 'numbers','1', 'o','1', 'few','1', 'find','1', 'finds','1', 'clearly','1',               'her','1', 'herself','1', 'come','1', 'could','1', 'd','1', 'did','1', 'here','1', 'beings','1',              'fact','1', 'far','1', 'felt','1', 'become','1', 'first','1', 
# NOTE(review): this file is a web scrape in which the script's original
# newlines were collapsed; each physical line above/below packs many
# statements (this one even folds the shebang, header comments, the
# Getopt::Long/Pod::Usage imports, and the start of %stop_list onto a line
# that begins with '#'). The text is NOT runnable as-is; recover the
# original line breaks before use.
# %stop_list maps each built-in English stop word to the string '1' so
# membership can be tested with a single hash lookup. The list continues
# on the next three lines.
'for','1', 'four','1', 'from','1',              'full','1', 'fully','1', 'furthers','1', 'gave','1', 'general','1', 'generally','1', 'get','1',              'gets','1', 'gives','1', 'facts','1', 'go','1', 'going','1', 'good','1', 'goods','1', 'certain','1',              'certainly','1', 'clear','1', 'great','1', 'greater','1', 'greatest','1', 'group','1', 'grouped','1',              'grouping','1', 'groups','1', 'h','1', 'got','1', 'has','1', 'g','1', 'have','1', 'having','1',              'he','1', 'further','1', 'furthered','1', 'had','1', 'furthering','1', 'itself','1', 'faces','1',              'highest','1', 'him','1', 'himself','1', 'his','1', 'how','1', 'however','1', 'i','1', 'if','1',              'important','1', 'interests','1', 'into','1', 'is','1', 'it','1', 'its','1', 'j','1', 'anyone','1',              'anything','1', 'anywhere','1', 'are','1', 'area','1', 'areas','1', 'around','1', 'as','1', 'seconds','1',              'see','1', 'seem','1', 'seemed','1', 'seeming','1', 'seems','1', 'sees','1', 'right','1', 'several','1',              'shall','1', 'she','1', 'enough','1', 'even','1', 'evenly','1', 'over','1', 'p','1', 'part','1',              'parted','1', 'parting','1', 'parts','1', 'per','1', 'down','1', 'place','1', 'places','1',              'point','1', 'pointed','1', 'pointing','1', 'points','1', 'possible','1', 'present','1', 'presented','1',              'presenting','1', 'ends','1', 'high','1', 'mrs','1', 'much','1', 'must','1', 'my','1', 'myself','1',              'presents','1', 'down','1', 'problem','1', 'problems','1', 'put','1', 'puts','1', 'q','1', 'quite','1',              'will','1', 'with','1', 'within','1', 'r','1', 'rather','1', 'really','1', 'room','1', 'rooms','1',              's','1', 'said','1', 'same','1', 'right','1', 'showing','1', 'shows','1', 'side','1', 'sides','1',              'since','1', 'small','1', 'smaller','1', 'smallest','1', 'so','1', 'some','1', 'somebody','1',              'someone','1', 'something','1', 
# (stop-word list continues; note some words appear more than once,
# e.g. 'down', 'right', 'high' — harmless duplicates in a hash literal,
# the later pair simply overwrites the earlier identical one)
'somewhere','1', 'state','1', 'states','1', 'such','1', 'sure','1',              't','1', 'take','1', 'taken','1', 'than','1', 'that','1', 'the','1', 'their','1', 'then','1',              'there','1', 'therefore','1', 'these','1', 'x','1', 'thought','1', 'thoughts','1', 'three','1',              'through','1', 'thus','1', 'to','1', 'today','1', 'together','1', 'too','1', 'took','1', 'toward','1',              'turn','1', 'turned','1', 'turning','1', 'turns','1', 'two','1', 'still','1', 'u','1', 'under','1',              'until','1', 'up','1', 'others','1', 'upon','1', 'us','1', 'use','1', 'used','1', 'uses','1',              'v','1', 'very','1', 'w','1', 'want','1', 'wanted','1', 'wanting','1', 'wants','1', 'was','1',              'way','1', 'we','1', 'well','1', 'wells','1', 'went','1', 'were','1', 'what','1', 'when','1',              'where','1', 'whether','1', 'which','1', 'while','1', 'who','1', 'whole','1', 'y','1', 'year','1',              'years','1', 'yet','1', 'you','1', 'everyone','1', 'everything','1', 'everywhere','1', 'young','1',              'younger','1', 'youngest','1', 'your','1', 'yours','1', 'z','1', 'ever','1', 'works','1', 'every','1',              'everybody','1', 'f','1', 'face','1', 'other','1', 'our','1', 'out','1', 'just','1', 'interesting','1',              'high','1', 'might','1', 'k','1', 'keep','1', 'keeps','1', 'give','1', 'given','1', 'higher','1',              'kind','1', 'knew','1', 'know','1', 'known','1', 'knows','1', 'l','1', 'large','1', 'largely','1',              'last','1', 'later','1', 'latest','1', 'least','1', 'less','1', 'needs','1', 'never','1', 'newer','1',              'let','1', 'lets','1', 'like','1', 'likely','1', 'long','1', 'high','1', 'longer','1', 'longest','1',              'm','1', 'made','1', 'make','1', 'making','1', 'man','1', 'many','1', 'may','1', 'me','1', 'member','1',              'members','1', 'men','1', 'more','1', 'in','1', 'interest','1', 'interested','1', 'most','1', 'mostly','1',              
# (final segment of the stop-word list, then option parsing begins)
'mr','1', 'opened','1', 'opening','1', 'new','1', 'opens','1', 'or','1', 'perhaps','1', 'order','1',              'ordered','1', 'ordering','1', 'orders','1', 'differ','1', 'differently','1', 'do','1', 'does','1',              'done','1', 'downed','1', 'downing','1', 'downs','1', 'they','1', 'thing','1', 'things','1', 'think','1',              'thinks','1', 'this','1', 'those','1', 'ways','1', 'why','1', 'without','1', 'work','1', 'worked','1',              'working','1', 'would','1', 'during','1', 'e','1', 'each','1', 'early','1', 'either','1', 'end','1',              'though','1', 'still','1', 'whose','1', 'saw','1', 'say','1', 'says','1', 'them','1', 'second','1',              'any','1', 'anybody','1');  #==============================================================================# Parse Command Line Arguments#==============================================================================$nostem      = 0;$nostop      = 0;$mystoplist  = '';$minwlen     = 3;$nlskip      = 0;$tokfile     = 0;$skipnumeric = 0;$help        = '';$docfile     = '';$matfile     = '';$clabelfile  = '';GetOptions('skipnumeric' => \$skipnumeric, 'tokfile' => \$tokfile, 'nostem' => \$nostem,, 'nostop' => \$nostop, 'mystoplist=s' => \$mystoplist, 'minwlen=i' => \$minwlen, 'nlskip=i' => \$nlskip, 'help|?' => \$help);pod2usage(-verbose => 2) if $help;pod2usage(-verbose => 2) if $#ARGV != 1;$docfile       = $ARGV[0];$matfile       = $ARGV[1];$clabelfile    = $matfile . ".clabel";$rlabelfile    = $matfile . ".rlabel";$tokenizedfile = $matfile . ".tokens";$tmpmatfile    = $matfile . 
# Above: option defaults, GetOptions() flag parsing (-skipnumeric,
# -tokfile, -nostem, -nostop, -mystoplist, -minwlen, -nlskip, -help),
# pod2usage on -help or wrong argument count, then the two positional
# args (doc-file, mat-file) and output filenames derived from the
# mat-file name (.clabel, .rlabel, .tokens, .tmp).
# NOTE(review): there is a stray doubled comma after '\$nostem,' in the
# GetOptions list — legal in a Perl list but worth cleaning up.
# The $tmpmatfile assignment continues on the next line.
".tmp";-e $docfile or die "***Error: Input document file ", $docfile, " does not exist.\n";if ($mystoplist) {  -e $mystoplist or die "***Error: User supplied stop list file ", $mystoplist, " does not exist.\n";}#==============================================================================# Read the user-supplied stop-list if any #==============================================================================%my_stop_list = ();if ($mystoplist) {  print "Reading user supplied stop list file...\n";  open(FPIN, "<$mystoplist");  while (<FPIN>) {    tr/A-Z/a-z/;    # change to lower case     s/^\s+//;       # remove leading spaces    y/a-z0-9/ /cs;  # retain only alpha-numeric entries    s/\s+/ /g;      # compact spaces    chop;    @tokens = split(/\s+/, $_);    foreach $token (@tokens) {      $my_stop_list{$token} = 1;    }  }  close(FPIN);  print "Done.\n";  if ($nostop) {    %stop_list = ();    $nostop = 0;  }}#==============================================================================# Setup the data-structures for the stemmer and initialize it#==============================================================================%step2list = ('ational'=>'ate', 'tional'=>'tion', 'enci'=>'ence', 'anci'=>'ance',               'izer'=>'ize', 'bli'=>'ble', 'alli'=>'al', 'entli'=>'ent', 'eli'=>'e',               'ousli'=>'ous', 'ization'=>'ize', 'ation'=>'ate', 'ator'=>'ate',               'alism'=>'al', 'iveness'=>'ive', 'fulness'=>'ful', 'ousness'=>'ous',               'aliti'=>'al', 'iviti'=>'ive', 'biliti'=>'ble', 'logi'=>'log');%step3list = ('icate'=>'ic', 'ative'=>'', 'alize'=>'al', 'iciti'=>'ic', 'ical'=>'ic',               'ful'=>'', 'ness'=>'');$c =    "[^aeiou]";          # consonant$v =    "[aeiouy]";          # vowel$C =    "${c}[^aeiouy]*";    # consonant sequence$V =    "${v}[aeiou]*";      # vowel sequence$mgr0 = "^(${C})?${V}${C}";               # [C]VC... is m>0$meq1 = "^(${C})?${V}${C}(${V})?" . 
# Above: existence checks for the input document and optional user
# stop-list files; the user stop list is read line by line (lower-cased,
# non-alphanumerics squeezed to spaces) into %my_stop_list, and giving
# both -mystoplist and -nostop clears the built-in list so only the user
# list applies. Then the Porter-stemmer suffix tables (%step2list,
# %step3list) and consonant/vowel regex fragments ($c/$v/$C/$V) with the
# measure patterns $mgr0/$meq1 are set up.
# NOTE(review): 2-arg open with a bareword handle and no error check
# (open(FPIN, "<$mystoplist")) — modern Perl would use a checked 3-arg
# open with a lexical handle. The $meq1 assignment continues on the
# next line.
'$';  # [C]VC[V] is m=1$mgr1 = "^(${C})?${V}${C}${V}${C}";       # [C]VCVC... is m>1$_v   = "^(${C})?${v}";                   # vowel in stem#==============================================================================# Get into the main text-processing part of the code#==============================================================================open(DOCFP, "<$docfile");open(MATFP, ">$tmpmatfile");if ($tokfile) {  open(TOKENFP, ">$tokenizedfile");}if ($nlskip > 0) {  open(RLABELFP, ">$rlabelfile");}%WORDID    = ();%WORDNAMES = ();$nrows  = 0;$ncols  = 0;$nnz    = 0;print "Reading document file...\n";while (<DOCFP>) {  tr/A-Z/a-z/;  y/a-z0-9/ /cs;  s/^\s+//;  s/\s+/ /g;  chop;  @tokens = split(/\s+/, $_);  # Write the skipped tokens as the row-label of the file  if ($nlskip > 0) {    for ($i=0; $i<$nlskip; $i++) {      print RLABELFP $tokens[$i], " ";    }    print RLABELFP "\n";  }  # Construct the TF-representation for this document  %TF = ();  for ($i=$nlskip; $i<=$#tokens; $i++) {    next if ($skipnumeric && ($tokens[$i] =~ /\d/));    next if (length($tokens[$i]) < $minwlen);    if ($nostop) {      if ($nostem) {        $newword = $tokens[$i];      }      else {        $newword = stem($tokens[$i]);      }      if ($tokfile) {        print TOKENFP $newword, " ";      }      $TF{$newword}++;    }    else {      if (!$stop_list{$tokens[$i]} && !$my_stop_list{$tokens[$i]}) {        if ($nostem) {          $newword = $tokens[$i];        }        else {          $newword = stem($tokens[$i]);        }        if ($tokfile) {          print TOKENFP $newword, " ";        }        $TF{$newword}++;      }    }  }  if ($tokfile) {    print TOKENFP "\n";  }  # Write out the vector for this document
# Above: output handles are opened (again unchecked 2-arg opens), then
# the main per-document loop: one document per input line, lower-cased,
# stripped to alphanumerics, tokenized on whitespace; the first $nlskip
# tokens become the row label, and the rest are filtered (numeric tokens
# with -skipnumeric, tokens shorter than -minwlen, stop words unless
# -nostop), optionally stemmed via stem(), and counted into %TF.
# NOTE(review): SOURCE is truncated here — the page says "1 of 2" and
# the while(<DOCFP>) loop (and the stem() subroutine it calls) continue
# on the missing second page. Do not treat this span as a complete,
# runnable program.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -