📄 extractorhtml.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * * SimpleHTMLExtractor.java * Created on Jun 5, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/extractor/ExtractorHTML.java,v 1.79 2006/08/22 03:21:54 stack-sf Exp $ */package org.archive.crawler.extractor;import java.io.IOException;import java.util.ArrayList;import java.util.Iterator;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import org.apache.commons.httpclient.URIException;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.RobotsHonoringPolicy;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.io.ReplayCharSequence;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.DevUtils;import org.archive.util.HttpRecorder;import org.archive.util.TextUtils;/** * Basic link-extraction, from an HTML content-body, * using regular expressions. * * @author gojomo * */public class ExtractorHTML extends Extractorimplements CoreAttributeConstants {        private static Logger logger =        Logger.getLogger(ExtractorHTML.class.getName());    /**     * Compiled relevant tag extractor.     *     * <p>     * This pattern extracts either:     * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or     * <li> (2) &lt;style&gt;...&lt;/style&gt; or     * <li> (3) &lt;meta ...&gt; or     * <li> (4) any other open-tag with at least one attribute     * (eg matches "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")     * <p>     * groups:     * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT     * <li> 2: just script open tag     * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE     * <li> 4: just style open tag     * <li> 5: entire other tag, without '<' '>'     * <li> 6: element     * <li> 7: META     * <li> 8: !-- comment --     */// version w/ less unnecessary backtracking      private static final int MAX_ELEMENT_LENGTH =          Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +              ".maxElementNameLength", "1024"));            static final String RELEVANT_TAG_EXTRACTOR =          "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2          "|((style[^>]*+)>[^<]*+</style)" + // 3, 4          "|(((meta)|(?:\\w{1,"+MAX_ELEMENT_LENGTH+"}))\\s+[^>]*+)" + // 5, 6, 7          "|(!--.*?--))>"; // 8 //    version w/ problems with unclosed script tags //    static final String RELEVANT_TAG_EXTRACTOR =//    "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>";      //    // this pattern extracts 'href' or 'src' attributes from//    // any open-tag innards matched by the above//    static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(//     "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");////    // this pattern extracts 'robots' attributes//    static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(//     "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))");      private static final int MAX_ATTR_NAME_LENGTH =          Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +              ".maxAttributeNameLength", "1024")); // 1K;             static final int MAX_ATTR_VAL_LENGTH =           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +              ".maxAttributeValueLength", "16384")); // 16K;           // TODO: perhaps cut to near MAX_URI_LENGTH        // this pattern extracts attributes from any open-tag innards    // matched by the above. attributes known to be URIs of various    // sorts are matched specially    static final String EACH_ATTRIBUTE_EXTRACTOR =      "(?is)\\s((href)|(action)|(on\\w*)" // 1, 2, 3, 4      +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...     +"|(?:usemap)|(?:profile)|(?:datasrc))" // 5     +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9     +"|(value)|(style)|([-\\w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 10, 11, 12     +"\\s*=\\s*"     +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 13     +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 14     +"|(\\S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 15    // groups:    // 1: attribute name    // 2: HREF - single URI relative to doc base, or occasionally javascript:    // 3: ACTION - single URI relative to doc base, or occasionally javascript:    // 4: ON[WHATEVER] - script handler    // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC    //    single URI relative to doc base    // 6: CODEBASE - a single URI relative to doc base, affecting other    //    attributes    // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)    // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE    //    (if supplied)    // 9: CODE - a single URI relative to the CODEBASE (is specified).    // 10: VALUE - often includes a uri path on forms    // 11: STYLE - inline attribute style info    // 12: any other attribute    // 13: double-quote delimited attr value    // 14: single-quote delimited attr value    // 15: space-delimited attr value    // much like the javascript likely-URI extractor, but    // without requiring quotes -- this can indicate whether    // an HTML tag attribute that isn't definitionally a    // URI might be one anyway, as in form-tag VALUE attributes    static final String LIKELY_URI_PATH =     "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";    static final String WHITESPACE = "\\s";    static final String CLASSEXT =".class";    static final String APPLET = "applet";    static final String BASE = "base";    static final String LINK = "link";    static final String FRAME = "frame";    static final String IFRAME = "iframe";    public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =        "treat-frames-as-embed-links";        public static final String ATTR_IGNORE_FORM_ACTION_URLS =        "ignore-form-action-urls";        public static final String ATTR_OVERLY_EAGER_LINK_DETECTION =        "overly-eager-link-detection";        public static final String ATTR_IGNORE_UNEXPECTED_HTML =         "ignore-unexpected-html";        protected long numberOfCURIsHandled = 0;    protected long numberOfLinksExtracted = 0;    public ExtractorHTML(String name) {        this(name, "HTML extractor. Extracts links from HTML documents");    }        public ExtractorHTML(String name, String description) {        super(name, description);        Type t = addElementToDefinition(            new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,            "If enabled, FRAME/IFRAME SRC-links are treated as embedded " +            "resources (IMG etc.), otherwise they are treated as " +            "navigational links", Boolean.TRUE));        t.setExpertSetting(true);        t = addElementToDefinition(            new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,            "If enabled, links appearing as the ACTION attribute in " +            "HTML FORMs are not extracted.", Boolean.FALSE));        t.setExpertSetting(true);        t = addElementToDefinition(                new SimpleType(ATTR_OVERLY_EAGER_LINK_DETECTION,                "If disabled (default is enabled), possible links will not be "+                "queued if they are placed in (somewhat) unlikely places " +                 "(Currently, just ignores URLs in HTML value attributes).",                Boolean.TRUE));        t.setExpertSetting(true);        t = addElementToDefinition(                new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,                "If enabled, html that is detected in unusual or unexpected " +                "places is not considerd for processing.", Boolean.TRUE));        t.setExpertSetting(true);    }    protected void processGeneralTag(CrawlURI curi, CharSequence element,            CharSequence cs) {        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);        // Just in case it's an OBJECT or APPLET tag        String codebase = null;        ArrayList resources = null;                final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,            ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();        final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,                ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();                final boolean overlyEagerLinkDetection = ((Boolean)getUncheckedAttribute                (curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue();                final String elementStr = element.toString();        while (attr.find()) {            int valueGroup =                (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15;            int start = attr.start(valueGroup);            int end = attr.end(valueGroup);            assert start >= 0: "Start is: " + start + ", " + curi;            assert end >= 0: "End is :" + end + ", " + curi;            CharSequence value = cs.subSequence(start, end);            value = TextUtils.unescapeHtml(value);            if (attr.start(2) > -1) {                // HREF                CharSequence context =                    Link.elementContext(element, attr.group(2));                if(elementStr.equalsIgnoreCase(LINK)) {                    // <LINK> elements treated as embeds (css, ico, etc)                    processEmbed(curi, value, context);                } else {                    // other HREFs treated as links                    processLink(curi, value, context);                }                if (elementStr.equalsIgnoreCase(BASE)) {                    try {                        curi.setBaseURI(value.toString());                    } catch (URIException e) {                        if (getController() != null) {                            // Controller can be null: e.g. when running                            // ExtractorTool.                            getController().logUriError(e, curi.getUURI(),                                value.toString());                        } else {                            logger.info("Failed set base uri: " +                                curi + ", " + value.toString() + ": " +                                e.getMessage());                        }                    }                }            } else if (attr.start(3) > -1) {                // ACTION                if (!ignoreFormActions) {                    CharSequence context = Link.elementContext(element,                        attr.group(3));                    processLink(curi, value, context);                }            } else if (attr.start(4) > -1) {                // ON____                processScriptCode(curi, value); // TODO: context?            } else if (attr.start(5) > -1) {                // SRC etc.                CharSequence context = Link.elementContext(element,                    attr.group(5));                                // true, if we expect another HTML page instead of an image etc.                final char hopType;                                if(!framesAsEmbeds                    && (elementStr.equalsIgnoreCase(FRAME) || elementStr                        .equalsIgnoreCase(IFRAME))) {                    hopType = Link.NAVLINK_HOP;                } else {                    hopType = Link.EMBED_HOP;                }                processEmbed(curi, value, context, hopType);            } else if (attr.start(6) > -1) {                // CODEBASE                codebase = (value instanceof String)?                    (String)value: value.toString();                CharSequence context = Link.elementContext(element,                    attr.group(6));                processEmbed(curi, codebase, context);            } else if (attr.start(7) > -1) {                // CLASSID, DATA                if (resources == null) {                    resources = new ArrayList();                }                resources.add(value.toString());            } else if (attr.start(8) > -1) {                // ARCHIVE                if (resources==null) {                    resources = new ArrayList();                }                String[] multi = TextUtils.split(WHITESPACE, value);                for(int i = 0; i < multi.length; i++ ) {                    resources.add(multi[i]);                }            } else if (attr.start(9) > -1) {                // CODE                if (resources==null) {                    resources = new ArrayList();                }                // If element is applet and code value does not end with                // '.class' then append '.class' to the code value.                if (elementStr.equalsIgnoreCase(APPLET) &&                        !value.toString().toLowerCase().endsWith(CLASSEXT)) {                    resources.add(value.toString() + CLASSEXT);                } else {                    resources.add(value.toString());                }            } else if (attr.start(10) > -1) {                // VALUE, with possibility of URI                if (TextUtils.matches(LIKELY_URI_PATH, value)                        && overlyEagerLinkDetection) {                    CharSequence context = Link.elementContext(element,                        attr.group(10));                    processLink(curi,value, context);                }            } else if (attr.start(11) > -1) {                // STYLE inline attribute                // then, parse for URIs                this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(                    curi, value, getController());                            } else if (attr.start(12) > -1) {                // any other attribute                // ignore for now                // could probe for path- or script-looking strings, but                // those should be vanishingly rare in other attributes,                // and/or symptomatic of page bugs            }        }        TextUtils.recycleMatcher(attr);        // handle codebase/resources        if (resources == null) {            return;        }        Iterator iter = resources.iterator();        UURI codebaseURI = null;        String res = null;        try {            if (codebase != null) {                // TODO: Pass in the charset.                codebaseURI = UURIFactory.                    getInstance(curi.getUURI(), codebase);            }            while(iter.hasNext()) {                res = iter.next().toString();                res = (String) TextUtils.unescapeHtml(res);                if (codebaseURI != null) {                    res = codebaseURI.resolve(res).toString();
12 下一页
💿 文件大小 18588 K
👤 上传用户 bonylee_java
📂 所属分类 Jsp/Servlet
🏷️ 相关标签

#工程
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -