htmlsplit.pl

来自「namazu. 虽然是日语的,也适用于文件中单词索引后全文检索.」· PL 代码 · 共 176 行

176 行

## -*- Perl -*-# $Id: htmlsplit.pl,v 1.9.4.3 2001/09/19 07:37:08 knok Exp $## Copyright (C) 2000 Namazu Project All rights reserved.#     This is free software with ABSOLUTELY NO WARRANTY.##  This program is free software; you can redistribute it and/or modify#  it under the terms of the GNU General Public License as published by#  the Free Software Foundation; either versions 2, or (at your option)#  any later version.# #  This program is distributed in the hope that it will be useful#  but WITHOUT ANY WARRANTY; without even the implied warranty of#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the#  GNU General Public License for more details.##  You should have received a copy of the GNU General Public License#  along with this program; if not, write to the Free Software#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA#  02111-1307, USA##  This file must be encoded in EUC-JP encodingpackage htmlsplit;require "util.pl";#require "html.pl"; # don't need it because it sould be already loaded by load_filtermodules()use strict;my $Header = << 'EOS';<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"        "http://www.w3.org/TR/html4/strict.dtd"><html><head><link rev=made href="mailto:${author}"><title>${subject}</title></head><body><hr>EOS    my $Footer = << 'EOS';<hr></body></html>EOSsub split ($$) {    my ($fname, $base) = @_;    my $mtime = (stat($fname))[9];    my $fh = util::efopen($fname);    my $cont   = join '', <$fh>;    my %info = (		'title'    => get_title(\$cont),		'author'   => get_author(\$cont),		'anchored' => "",		'name'     => "",		'base'     => $base,		'names'    => [],		);    # <http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2>    #     # In certain cases, authors may specify the value of an attribute    # without any quotation marks. The attribute value may only contain    # letters (a-z and A-Z), digits (0-9), hyphens (ASCII decimal 45), and    # periods (ASCII decimal 46). We recommend using quotation marks even    # when it is possible to eliminate them.    my $id = 0;#    $cont =~ s/(<a\s[^>]*href=(["']))#(.+?)(\2[^>]*>)/$1$3.html$4/gi; #'    $cont =~ s#(<a[^>]*\s+)name=(["'])\2([^>]*>(.*?)</a>)#$1$4#sgi; #'    $cont =~ s {                \G(.+?)                                      # 1	        (<h([1-6])>)?\s*                             # 2, 3                <a[^>]*\s+name=([a-zA-Z0-9-\.]+|             # 4,                (["']).+?\5)[^>]*>(.*?)</a>                  # 5,6                \s*(</h\3>)?                                 # 7             } {                write_partial_file($1, $4, $6, $id++, $mtime, \%info)             }sgexi;    write_partial_file($cont, "", "", $id, $mtime, \%info);    return @{$info{'names'}};}sub get_title ($) {    my ($contref) = @_;    my $title = undef;        if ($$contref =~ s!<TITLE[^>]*>([^<]+)</TITLE>!!i) {	$title = $1;	$title =~ s/\s+/ /g;	$title =~ s/^\s+//;	$title =~ s/\s+$//;    } else {	$title = "no title";    }    return $title;}sub get_author ($) {    my ($contref) = @_;    my $author = "unknown";    # <LINK REV=MADE HREF="mailto:ccsatoru@vega.aichi-u.ac.jp">    if ($$contref =~ m!<LINK\s[^>]*?HREF=([\"\'])mailto:(.*?)\1\s*>!i) { #"	$author = $2;    } elsif ($$contref =~ m!.*<ADDRESS[^>]*>([^<]*?)</ADDRESS>!i) {	my $tmp = $1;	if ($tmp =~ /\b([\w\.\-]+\@[\w\.\-]+(?:\.[\w\.\-]+)+)\b/) {	    $author = $1;	}    }    return $author;}sub write_partial_file($$$$$$) {    my ($cont, $name, $anchored, $id, $mtime, $info_ref) = @_;    $name =~ s/^([\"\'])(.*)\1$/$2/;  # Remove quotation marks.    my $author        = $info_ref->{'author'};    my $base          = $info_ref->{'base'};    my $orig_title    = $info_ref->{'title'};    my $prev_name     = $info_ref->{'name'};    my $prev_anchored = $info_ref->{'anchored'};    $prev_name        =~ s#\n\r##sg;    $prev_name        =~ s#\n##sg;    html::remove_html_elements(\$prev_anchored);    $prev_anchored =~ s/^\s+//;    $prev_anchored =~ s/\s+$//;    my $title = $orig_title;    # FIXME: I don't know why this processing causes "Use of    # uninitialized value" warning if use $prev_anchored or    # $prev_name directly. perl's bug?    if ($prev_anchored ne "") {	$title .= ": $prev_anchored";    } elsif ($prev_name ne "") {	$title .= ": $prev_name";    }    my $fname = util::tmpnam("$base.$id");    my $fh = util::efopen(">$fname");    my $header = $Header;    $header =~ s/\$\{subject\}/$title/g;    $header =~ s/\$\{author\}/$author/g;    print $fh $header;    print $fh $cont;    my $footer = $Footer;    print $fh $footer;    push @{$info_ref->{'names'}}, $prev_name;    $info_ref->{'anchored'} = $anchored;    $info_ref->{'name'} = $name;    # FIXME: Actually we don't need this.     #        But some perl versions need this.    close($fh);    utime($mtime, $mtime, $fname);    return "";}1;

htmlsplit.pl - 源码说明

本页面展示了「namazu. 虽然是日语的,也适用于文件中单词索引后全文检索.」中的 htmlsplit.pl 源码文件，采用 PL 编程语言编写，共 176 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与namazu相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?