📄 extractorhtml.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA * * SimpleHTMLExtractor.java * Created on Jun 5, 2003 * * $Header$ */package org.archive.crawler.extractor;import java.io.IOException;import java.util.ArrayList;import java.util.Iterator;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import org.apache.commons.httpclient.URIException;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.RobotsHonoringPolicy;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.io.ReplayCharSequence;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.DevUtils;import org.archive.util.HttpRecorder;import org.archive.util.TextUtils;/** * Basic link-extraction, from an HTML content-body, * using regular expressions. * * @author gojomo * */public class ExtractorHTML extends Extractorimplements CoreAttributeConstants {    private static final long serialVersionUID = 5855731422080471017L;    private static Logger logger =        Logger.getLogger(ExtractorHTML.class.getName());    /**     * Compiled relevant tag extractor.     *     * <p>     * This pattern extracts either:     * <li> (1) whole &lt;script&gt;...&lt;/script&gt; or     * <li> (2) &lt;style&gt;...&lt;/style&gt; or     * <li> (3) &lt;meta ...&gt; or     * <li> (4) any other open-tag with at least one attribute     * (eg matches "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")     * <p>     * groups:     * <li> 1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT     * <li> 2: just script open tag     * <li> 3: STYLE TYPE=moo&gt;zoo&lt;/STYLE     * <li> 4: just style open tag     * <li> 5: entire other tag, without '<' '>'     * <li> 6: element     * <li> 7: META     * <li> 8: !-- comment --     */// version w/ less unnecessary backtracking      private static final int MAX_ELEMENT_LENGTH =          Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +              ".maxElementNameLength", "1024"));            static final String RELEVANT_TAG_EXTRACTOR =          "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2          "|((style[^>]*+)>.*?</style)" + // 3, 4          "|(((meta)|(?:\\w{1,"+MAX_ELEMENT_LENGTH+"}))\\s+[^>]*+)" + // 5, 6, 7          "|(!--.*?--))>"; // 8 //    version w/ problems with unclosed script tags //    static final String RELEVANT_TAG_EXTRACTOR =//    "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>";      //    // this pattern extracts 'href' or 'src' attributes from//    // any open-tag innards matched by the above//    static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(//     "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");////    // this pattern extracts 'robots' attributes//    static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(//     "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))");      private static final int MAX_ATTR_NAME_LENGTH =          Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +              ".maxAttributeNameLength", "1024")); // 1K;             static final int MAX_ATTR_VAL_LENGTH =           Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() +              ".maxAttributeValueLength", "16384")); // 16K;           // TODO: perhaps cut to near MAX_URI_LENGTH        // this pattern extracts attributes from any open-tag innards    // matched by the above. attributes known to be URIs of various    // sorts are matched specially    static final String EACH_ATTRIBUTE_EXTRACTOR =      "(?is)\\b((href)|(action)|(on\\w*)" // 1, 2, 3, 4      +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ...     +"|(?:usemap)|(?:profile)|(?:datasrc))" // 5     +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9     +"|(value)|(style)|(method)" // 10, 11, 12     +"|([-\\w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 13     +"\\s*=\\s*"     +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 14     +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 15     +"|(\\S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 16    // groups:    // 1: attribute name    // 2: HREF - single URI relative to doc base, or occasionally javascript:    // 3: ACTION - single URI relative to doc base, or occasionally javascript:    // 4: ON[WHATEVER] - script handler    // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC    //    single URI relative to doc base    // 6: CODEBASE - a single URI relative to doc base, affecting other    //    attributes    // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)    // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE    //    (if supplied)    // 9: CODE - a single URI relative to the CODEBASE (is specified).    // 10: VALUE - often includes a uri path on forms    // 11: STYLE - inline attribute style info    // 12: METHOD - form GET/POST    // 13: any other attribute    // 14: double-quote delimited attr value    // 15: single-quote delimited attr value    // 16: space-delimited attr value    // much like the javascript likely-URI extractor, but    // without requiring quotes -- this can indicate whether    // an HTML tag attribute that isn't definitionally a    // URI might be one anyway, as in form-tag VALUE attributes    static final String LIKELY_URI_PATH =     "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";    static final String WHITESPACE = "\\s";    static final String CLASSEXT =".class";    static final String APPLET = "applet";    static final String BASE = "base";    static final String LINK = "link";    static final String FRAME = "frame";    static final String IFRAME = "iframe";    public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS =        "treat-frames-as-embed-links";        public static final String ATTR_IGNORE_FORM_ACTION_URLS =        "ignore-form-action-urls";    public static final String ATTR_EXTRACT_ONLY_FORM_GETS =        "extract-only-form-gets";    /** whether to try finding links in Javscript; default true */    public static final String ATTR_EXTRACT_JAVASCRIPT =        "extract-javascript";    public static final String EXTRACT_VALUE_ATTRIBUTES =        "extract-value-attributes";        public static final String ATTR_IGNORE_UNEXPECTED_HTML =         "ignore-unexpected-html";        protected long numberOfCURIsHandled = 0;    protected long numberOfLinksExtracted = 0;    public ExtractorHTML(String name) {        this(name, "HTML extractor. Extracts links from HTML documents");    }        public ExtractorHTML(String name, String description) {        super(name, description);        Type t = addElementToDefinition(            new SimpleType(ATTR_EXTRACT_JAVASCRIPT,            "If true, in-page Javascript is scanned for strings that " +            "appear likely to be URIs. This typically finds both valid " +            "and invalid URIs, and attempts to fetch the invalid URIs " +            "sometimes generates webmaster concerns over odd crawler " +            "behavior. Default is true.",            Boolean.TRUE));        t.setExpertSetting(true);        t = addElementToDefinition(            new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS,            "If true, FRAME/IFRAME SRC-links are treated as embedded " +            "resources (like IMG, 'E' hop-type), otherwise they are " +            "treated as navigational links. Default is true.", Boolean.TRUE));        t.setExpertSetting(true);        t = addElementToDefinition(            new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS,            "If true, URIs appearing as the ACTION attribute in " +            "HTML FORMs are ignored. Default is false.", Boolean.FALSE));        t.setExpertSetting(true);        t = addElementToDefinition(                new SimpleType(ATTR_EXTRACT_ONLY_FORM_GETS,                "If true, only HTML FORM ACTIONs associated with the GET "+                 "method are extracted. (Form ACTIONs with method POST "+                "will be ignored. Default is true", Boolean.TRUE));        t.setExpertSetting(true);        t = addElementToDefinition(            new SimpleType(EXTRACT_VALUE_ATTRIBUTES,            "If true, strings that look like URIs found in element VALUE " +            "attributes (which are sometimes used as URIs by in-page " +            "Javascript or server-side redirects) will be extracted. " +            "This typically finds both valid and invalid URIs, and " +            "attempts to fetch the invalid URIs sometimes generate " +            "webmaster concerns over odd crawler behavior. Default " +            "is true.",            Boolean.TRUE));        t.setExpertSetting(true);        t = addElementToDefinition(            new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML,            "If true, URIs which end in typical non-HTML extensions " +            "(such as .gif) will not be scanned as if it were HTML. " +            "Default is true.", Boolean.TRUE));        t.setExpertSetting(true);    }    protected void processGeneralTag(CrawlURI curi, CharSequence element,            CharSequence cs) {        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);        // Just in case it's an OBJECT or APPLET tag        String codebase = null;        ArrayList<String> resources = null;                // Just in case it's a FORM        CharSequence action = null;        CharSequence actionContext = null;        CharSequence method = null;                 final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi,            ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue();        final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi,                ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();                final boolean extractValueAttributes = ((Boolean)getUncheckedAttribute                (curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -