📄 extract_reuters.plx
字号:
#!/usr/bin/perluse strict;use warnings;use File::Spec::Functions qw( catfile catdir );use Cwd qw( getcwd );# ensure call from correct location and with required argmy $source_dir = $ARGV[0];die "Usage: ./extract_reuters.plx /path/to/expanded/archive" unless -d $source_dir;my $working_dir = getcwd;die "Must be run from the benchmarks/ directory" unless ( $working_dir =~ /benchmarks\W*$/ );# create the main output directorymy $main_out_dir = 'extracted_corpus';if ( !-d $main_out_dir ) { mkdir $main_out_dir or die "Couldn't mkdir '$main_out_dir': $!";}# get a list of the sgm filesopendir SOURCE_DIR, $source_dir or die "Couldn't open directory: $!";my @sgm_files = grep {/\.sgm$/} readdir SOURCE_DIR;closedir SOURCE_DIR or die "Couldn't close directory: $!";die "Couldn't find all the sgm files" unless @sgm_files == 22;# track number of story docsmy $num_files = 0;for my $sgm_file (@sgm_files) { # get the sgm file my $sgm_filepath = catfile( $source_dir, $sgm_file ); print "Processing $sgm_filepath\n"; open( my $sgm_fh, '<', $sgm_filepath ) or die "Couldn't open file '$sgm_filepath': $!"; # prepare output directory $sgm_file =~ /(\d+)\.sgm$/ or die "no match"; my $out_dir = catdir( $main_out_dir, "articles$1" ); if ( !-d $out_dir ) { mkdir $out_dir or die "Couldn't create directory '$out_dir': $!"; } my $in_body = 0; my $in_title = 0; my ( $title, $body ); while (<$sgm_fh>) { # start a new story doc if (/<REUTERS/) { $title = ''; $body = ''; } # extract title and body if (s/.*?<TITLE>//) { $in_title = 1; $title = ''; } $title .= $_ if $in_title; if (s/.*?<BODY>//) { $in_body = 1; $body = ''; } $body .= $_ if $in_body; if (m#</TITLE>.*#) { $in_title = 0; $title =~ s#</TITLE>.*##s; } if (m#</BODY>.*#) { $in_body = 0; $body =~ s#</BODY>.*##s; } # write out a finished article doc if (m#</REUTERS>#) { die "Malformed data" if ( $in_title or $in_body ); if ( length $title and length $body ) { my $out_filename = sprintf( "article%05d.txt", $num_files ); my $out_filepath = catfile( $out_dir, $out_filename ); open( my $out_fh, '>', $out_filepath ) or die "Couldn't open '$out_filepath' for writing: $!"; $title =~ s/^\s*//; $title =~ s/\s*$//; print $out_fh "$title\n\n" or die "print failed: $!"; print $out_fh $body or die "print failed: $!"; close $out_fh or die "Couldn't close '$out_filepath': $!"; $num_files++; } } }}print "Total articles extracted: $num_files\n";__END__=head1 NAMEextract_reuters.plx - parse Reuters 21578 corpus into individual files=head1 SYNOPSIS ./extract_reuters.plx /path/to/expanded/reuters/archive=head1 DESCRIPTIONThis script will extract TITLE and BODY for each item in the Reuters 21578corpus into individual files. It expects to be passed the location of thedecompressed archive as a command line argument.=head1 AUTHORMarvin Humphrey E<lt> marvin at rectangular dot com E<gt>.=head1 COPYRIGHT AND LICENSECopyright 2006 Marvin HumphreyThis program is free software; you can redistribute it and/or modify itunder the same terms as Perl itself.=cut
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -