首页 › 资源下载 › 网络 › 嵌入式ＷＥＢ › 源码查看

indexer.pl

来自「嵌入式ＷＥＢ」· PL 代码 · 共 114 行

114 行

#!/usr/bin/perl -w
# indexer.pl - build an inverted word index of the text files below a
# directory and store it in a DB_File database.
#
# Database layout (all values are strings):
#   "!OPTION:stem"    => 1          present when the index was built with -stem
#   "!OPTION:ignore"  => 1          present when the index was built with -ignore
#   "!FILE_NAME:<id>" => path       maps a numeric file id to its path
#   "<word>"          => "id:id:.." colon-separated ids of files containing it
#
# This is not a CGI, so taint mode is not required.  -T is deliberately not
# on the #! line: File::Find dies on tainted directory names unless its
# "untaint" option is set.

use strict;

use File::Find;
use DB_File;
use Getopt::Long;
use Text::English;
use Fcntl;

use constant DB_CACHE      => 0;
use constant DEFAULT_INDEX => "/usr/local/apache/data/index.db";

my( %opts, %index, @files, $stop_words );

GetOptions( \%opts, "dir=s",
                    "cache=s",
                    "index=s",
                    "ignore",
                    "stop=s",
                    "numbers",
                    "stem" );

die usage() unless $opts{dir} && -d $opts{dir};

$opts{'index'} ||= DEFAULT_INDEX;

# The database is a DB_HASH (the DB_File default), so the cache size has to
# be set on $DB_HASH and that object handed to tie; configuring $DB_BTREE
# without passing it to tie would leave -cache without any effect.
$DB_HASH->{cachesize} = $opts{cache} || DB_CACHE;

# NOTE: the file is opened without truncation, so running against an
# existing index appends the new file ids to whatever is already stored.
tie %index, "DB_File", $opts{'index'}, O_RDWR|O_CREAT, 0640, $DB_HASH
    or die "Cannot tie database: $!\n";

# Record the indexing options only after the tie: anything assigned to
# %index before it is tied never reaches the database.
$index{"!OPTION:stem"}   = 1 if $opts{'stem'};
$index{"!OPTION:ignore"} = 1 if $opts{'ignore'};

find( sub { push @files, $File::Find::name }, $opts{dir} );
$stop_words = load_stopwords( $opts{stop} ) if $opts{stop};

process_files( \%index, \@files, \%opts, $stop_words );

untie %index;


# load_stopwords( $file )
#
# Reads a stop-word file and returns a hash ref whose keys are the
# lower-cased stop words.  Lines starting with "#" are skipped; on every
# other line the first whitespace-delimited token is taken as the word.
# Dies if the file does not exist or cannot be opened.
sub load_stopwords {
    my $file  = shift;
    my $words = {};

    die "Cannot find stop file: $file\n" unless -e $file;

    open my $info, '<', $file
        or die "Cannot open stop file: $file: $!\n";

    while ( my $line = <$info> ) {
        next if $line =~ /^#/;
        $words->{lc $1} = 1 if $line =~ /(\S+)/;
    }

    close $info;

    return $words;
}


# process_files( $index, $files, $opts, $stop_words )
#
#   $index      - hash ref (normally the tied DB_File hash) to fill
#   $files      - array ref of paths; a file's position is its file id
#   $opts       - hash ref of options; reads {ignore}, {numbers}, {stem}
#   $stop_words - hash ref of lower-cased words to skip (may be undef)
#
# Indexes every file that passes the -T (text file) test.  Each file id is
# appended to a word's entry at most once.  Dies if a text file cannot be
# opened.
sub process_files {
    my( $index, $files, $opts, $stop_words ) = @_;

    $stop_words ||= {};

    # Read blank-line separated chunks rather than single lines so that a
    # tag spanning several lines can still be stripped by one substitution.
    local $/ = "\n\n";

    for my $file_id ( 0 .. $#$files ) {
        my $file = $files->[$file_id];
        my %seen_in_file;

        next unless -T $file;

        print STDERR "Indexing $file\n";
        $index->{"!FILE_NAME:$file_id"} = $file;

        open my $fh, '<', $file
            or die "Cannot open file: $file: $!\n";

        while ( my $chunk = <$fh> ) {

            $chunk =~ tr/A-Z/a-z/ if $opts->{ignore};
            $chunk =~ s/<.+?>//gs; # Note this doesn't handle < or > in comments or js

            while ( $chunk =~ /([a-z\d]{2,})\b/gi ) {
                my $word = $1;

                next if $stop_words->{lc $word};

                # Pure numbers are indexed only when -numbers was given.
                next if $word =~ /^\d+$/ && !$opts->{numbers};

                ( $word ) = Text::English::stem( $word ) if $opts->{stem};

                # List each file only once per word.
                next if $seen_in_file{$word}++;

                $index->{$word} = exists $index->{$word}
                    ? "$index->{$word}:$file_id"
                    : "$file_id";
            }
        }

        close $fh;
    }
}


# usage()
#
# Returns the command-line help text (the caller decides whether to die
# with it or print it).
sub usage {
    my $usage = <<End_of_Usage;

Usage: $0 -dir directory [options]

The options are:
  -cache         DB_File cache size (in bytes)
  -index         Path to index, default:/usr/local/apache/data/index.db
  -ignore        Case-insensitive index
  -stop          Path to stopwords file
  -numbers       Include numbers in index
  -stem          Stem words

End_of_Usage
    return $usage;
}

indexer.pl - 源码说明

本页面展示了「嵌入式ＷＥＢ」中的 indexer.pl 源码文件，采用 PL 编程语言编写，共 114 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与嵌入式相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?