regexphtmllinkextractor.java

来自「Heritrix是一个开源,可扩展的web爬虫项目。Heritrix设计成严格按」· Java 代码 · 共 460 行 · 第 1/2 页
JAVA
460 行
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * * SimpleHTMLExtractor.java * Created on Jun 5, 2003 * * $Header$ */package org.archive.extractor;import java.util.ArrayList;import java.util.Iterator;import java.util.LinkedList;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import org.apache.commons.httpclient.URIException;import org.archive.crawler.extractor.Link;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.DevUtils;import org.archive.util.TextUtils;/** * Basic link-extraction, from an HTML content-body, * using regular expressions. * * ROUGH DRAFT IN PROGRESS / incomplete... untested... 
*
* @author gojomo
*/
public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor {
    private static Logger logger =
        Logger.getLogger(RegexpHTMLLinkExtractor.class.getName());

    boolean honorRobots = true;
    boolean extractInlineCss = true;
    boolean extractInlineJs = true;

    /** Links found so far; findNextLink() reports success once this is non-empty. */
    protected LinkedList<Link> next = new LinkedList<Link>();
    /** Matcher over the source content; created on the first findNextLink() call. */
    protected Matcher tags;

    /**
     * Advances the tag matcher until processing a matched tag leaves at
     * least one entry in {@link #next}.
     *
     * <p>The matcher is created lazily from {@link #RELEVANT_TAG_EXTRACTOR}
     * on the first call and reused on later calls, so each call resumes
     * where the previous one stopped.
     *
     * <p>Branch selection relies on {@code Matcher.start(group)} returning
     * -1 for a group that did not participate in the match. Every group of
     * the pattern sits after the leading {@code '<'}, so a participating
     * group always starts at index 1 or later and {@code > 0} is a
     * sufficient "group matched" test.
     *
     * <p>If the current thread has been interrupted, scanning stops and the
     * thread's interrupt status is restored before returning, so that the
     * caller can still observe the interruption.
     *
     * @return {@code true} if {@code next} is non-empty after processing a
     *         tag; {@code false} if the matcher ran out of tags or the
     *         thread was interrupted
     * @see org.archive.extractor.CharSequenceLinkExtractor#findNextLink()
     */
    protected boolean findNextLink() {
        if (tags == null) {
            tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, sourceContent);
        }
        while (tags.find()) {
            if (Thread.interrupted()) {
                // Thread.interrupted() clears the interrupt status; set it
                // again so the interruption is not silently swallowed here.
                // TODO: throw an exception, perhaps, rather than just break?
                Thread.currentThread().interrupt();
                break;
            }
            if (tags.start(8) > 0) {
                // comment match
                // for now do nothing
            } else if (tags.start(7) > 0) {
                // <meta> match: hand over the tag innards (group 5)
                int start = tags.start(5);
                int end = tags.end(5);
                processMeta(sourceContent.subSequence(start, end));
            } else if (tags.start(5) > 0) {
                // generic <whatever> match: element name (group 6) plus
                // the whole tag innards (group 5)
                int start5 = tags.start(5);
                int end5 = tags.end(5);
                int start6 = tags.start(6);
                int end6 = tags.end(6);
                processGeneralTag(sourceContent.subSequence(start6, end6),
                        sourceContent.subSequence(start5, end5));
            } else if (tags.start(1) > 0) {
                // <script> match: second argument is the length of the
                // open-tag text (group 2) within the passed subsequence
                int start = tags.start(1);
                int end = tags.end(1);
                processScript(sourceContent.subSequence(start, end),
                    tags.end(2) - start);
            } else if (tags.start(3) > 0) {
                // <style> match: second argument is the length of the
                // open-tag text (group 4) within the passed subsequence
                int start = tags.start(3);
                int end = tags.end(3);
                processStyle(sourceContent.subSequence(start, end),
                    tags.end(4) - start);
            }
            if (!next.isEmpty()) {
                // at least one link found
                return true;
            }
        }
        // no relevant tags found
        return false;
    }

    /**
     * Relevant tag extractor (regular-expression source, case-insensitive
     * and dot-matches-all).
     *
     * <p>
     * This pattern extracts either:
     * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or
     * <li> (2) &lt;style&gt;...&lt;/style&gt; or
     * <li> (3) &lt;meta ...&gt; or
     * <li> (4) any other open-tag with at least one attribute
     * (eg matches "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;") or
     * <li> (5) an HTML comment &lt;!-- ... --&gt;
     * <p>
     * groups:
     * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT
     * <li> 2: just script open tag
     * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE
     * <li> 4: just style open tag
     * <li> 5: entire other tag, without '&lt;' '&gt;'
     * <li> 6: element
     * <li> 7: META
     * <li> 8: !-- comment --
     */
    static final String RELEVANT_TAG_EXTRACTOR =
          "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>";

    // this pattern extracts attributes from any open-tag innards
    // matched by the above.
attributes known to be URIs of various    // sorts are matched specially    static final String EACH_ATTRIBUTE_EXTRACTOR =      "(?is)\\s((href)|(action)|(on\\w*)"     +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)"     +"|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))"     +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)"     +"|(value)|([-\\w]+))"     +"\\s*=\\s*"     +"(?:(?:\"(.*?)(?:\"|$))"     +"|(?:'(.*?)(?:'|$))"     +"|(\\S+))";    // groups:    // 1: attribute name    // 2: HREF - single URI relative to doc base, or occasionally javascript:    // 3: ACTION - single URI relative to doc base, or occasionally javascript:    // 4: ON[WHATEVER] - script handler    // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE,DATASRC, or FOR    //    single URI relative to doc base    // 6: CODEBASE - a single URI relative to doc base, affecting other    //    attributes    // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)    // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE    //    (if supplied)    // 9: CODE - a single URI relative to the CODEBASE (if specified).    
// 10: VALUE - often includes a uri path on forms    // 11: any other attribute    // 12: double-quote delimited attr value    // 13: single-quote delimited attr value    // 14: space-delimited attr value    // much like the javascript likely-URI extractor, but    // without requiring quotes -- this can indicate whether    // an HTML tag attribute that isn't definitionally a    // URI might be one anyway, as in form-tag VALUE attributes    static final String LIKELY_URI_PATH =     "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";    static final String ESCAPED_AMP = "&amp;";    static final String AMP ="&";    static final String WHITESPACE = "\\s";    static final String CLASSEXT =".class";    static final String APPLET = "applet";    static final String BASE = "base";    static final String LINK = "link";    protected boolean processGeneralTag(CharSequence element, CharSequence cs) {        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);        // Just in case it's an OBJECT or APPLET tag        String codebase = null;        ArrayList<String> resources = null;        long tally = next.size();        while (attr.find()) {            int valueGroup =                (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 
13 : 14;            int start = attr.start(valueGroup);            int end = attr.end(valueGroup);            CharSequence value = cs.subSequence(start, end);            if (attr.start(2) > -1) {                // HREF                CharSequence context = Link.elementContext(element, attr.group(2));                if(element.toString().equalsIgnoreCase(LINK)) {                    // <LINK> elements treated as embeds (css, ico, etc)                    processEmbed(value, context);                } else {                    if (element.toString().equalsIgnoreCase(BASE)) {                        try {                            base = UURIFactory.getInstance(value.toString());                        } catch (URIException e) {                            extractErrorListener.noteExtractError(e,source,value);                        }                    }                    // other HREFs treated as links                    processLink(value, context);                }            } else if (attr.start(3) > -1) {                // ACTION                CharSequence context = Link.elementContext(element, attr.group(3));                processLink(value, context);            } else if (attr.start(4) > -1) {                // ON____                processScriptCode(value); // TODO: context?            } else if (attr.start(5) > -1) {                // SRC etc.                CharSequence context = Link.elementContext(element, attr.group(5));                processEmbed(value, context);            } else if (attr.start(6) > -1) {                // CODEBASE                // TODO: more HTML deescaping?                codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP);                CharSequence context = Link.elementContext(element,attr.group(6));                processEmbed(codebase, context);            } else if (attr.start(7) > -1) {
regexphtmllinkextractor.java - 源码说明

本页面展示了「Heritrix是一个开源、可扩展的Web爬虫项目。Heritrix被设计为严格遵守robots.txt文件的排除指示和META robots标签。」中的 regexphtmllinkextractor.java 源码文件，采用 Java 编程语言编写，共 460 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Heritrix相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?