📄 extract_reuters.plx

📁 外国人写的Perl搜索引擎程序
💻 PLX
字号:
#!/usr/bin/perluse strict;use warnings;use File::Spec::Functions qw( catfile catdir );use Cwd qw( getcwd );# ensure call from correct location and with required argmy $source_dir = $ARGV[0];die "Usage: ./extract_reuters.plx /path/to/expanded/archive"    unless -d $source_dir;my $working_dir = getcwd;die "Must be run from the benchmarks/ directory"    unless ( $working_dir =~ /benchmarks\W*$/ );# create the main output directorymy $main_out_dir = 'extracted_corpus';if ( !-d $main_out_dir ) {    mkdir $main_out_dir or die "Couldn't mkdir '$main_out_dir': $!";}# get a list of the sgm filesopendir SOURCE_DIR, $source_dir or die "Couldn't open directory: $!";my @sgm_files = grep {/\.sgm$/} readdir SOURCE_DIR;closedir SOURCE_DIR or die "Couldn't close directory: $!";die "Couldn't find all the sgm files"    unless @sgm_files == 22;# track number of story docsmy $num_files = 0;for my $sgm_file (@sgm_files) {    # get the sgm file    my $sgm_filepath = catfile( $source_dir, $sgm_file );    print "Processing $sgm_filepath\n";    open( my $sgm_fh, '<', $sgm_filepath )        or die "Couldn't open file '$sgm_filepath': $!";    # prepare output directory    $sgm_file =~ /(\d+)\.sgm$/ or die "no match";    my $out_dir = catdir( $main_out_dir, "articles$1" );    if ( !-d $out_dir ) {        mkdir $out_dir or die "Couldn't create directory '$out_dir': $!";    }    my $in_body  = 0;    my $in_title = 0;    my ( $title, $body );    while (<$sgm_fh>) {        # start a new story doc        if (/<REUTERS/) {            $title = '';            $body  = '';        }        # extract title and body        if (s/.*?<TITLE>//) {            $in_title = 1;            $title    = '';        }        $title .= $_ if $in_title;        if (s/.*?<BODY>//) {            $in_body = 1;            $body    = '';        }        $body .= $_ if $in_body;        if (m#</TITLE>.*#) {            $in_title = 0;            $title =~ s#</TITLE>.*##s;        }        if (m#</BODY>.*#) {            $in_body = 0;            $body =~ s#</BODY>.*##s;        }        # write out a finished article doc        if (m#</REUTERS>#) {            die "Malformed data" if ( $in_title or $in_body );            if ( length $title and length $body ) {                my $out_filename = sprintf( "article%05d.txt", $num_files );                my $out_filepath = catfile( $out_dir, $out_filename );                open( my $out_fh, '>', $out_filepath )                    or die "Couldn't open '$out_filepath' for writing: $!";                $title =~ s/^\s*//;                $title =~ s/\s*$//;                print $out_fh "$title\n\n" or die "print failed: $!";                print $out_fh $body        or die "print failed: $!";                close $out_fh or die "Couldn't close '$out_filepath': $!";                $num_files++;            }        }    }}print "Total articles extracted: $num_files\n";__END__=head1 NAMEextract_reuters.plx - parse Reuters 21578 corpus into individual files=head1 SYNOPSIS    ./extract_reuters.plx /path/to/expanded/reuters/archive=head1 DESCRIPTIONThis script will extract TITLE and BODY for each item in the Reuters 21578corpus into individual files.  It expects to be passed the location of thedecompressed archive as a command line argument.=head1 AUTHORMarvin Humphrey E<lt> marvin at rectangular dot com E<gt>.=head1 COPYRIGHT AND LICENSECopyright 2006 Marvin HumphreyThis program is free software; you can redistribute it and/or modify itunder the same terms as Perl itself.=cut
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -