📄 extractorhtml.java
字号:
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * SimpleHTMLExtractor.java * Created on Jun 5, 2003 * * $Header$ */package org.archive.crawler.extractor;import java.io.IOException;import java.util.ArrayList;import java.util.Iterator;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import org.apache.commons.httpclient.URIException;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.RobotsHonoringPolicy;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.io.ReplayCharSequence;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.DevUtils;import org.archive.util.HttpRecorder;import org.archive.util.TextUtils;/** * Basic link-extraction, from an HTML content-body, * using regular expressions. * * @author gojomo * */public class ExtractorHTML extends Extractorimplements CoreAttributeConstants { private static final long serialVersionUID = 5855731422080471017L; private static Logger logger = Logger.getLogger(ExtractorHTML.class.getName()); /** * Compiled relevant tag extractor. * * <p> * This pattern extracts either: * <li> (1) whole <script>...</script> or * <li> (2) <style>...</style> or * <li> (3) <meta ...> or * <li> (4) any other open-tag with at least one attribute * (eg matches "<a href='boo'>" but not "</a>" or "<br>") * <p> * groups: * <li> 1: SCRIPT SRC=foo>boo</SCRIPT * <li> 2: just script open tag * <li> 3: STYLE TYPE=moo>zoo</STYLE * <li> 4: just style open tag * <li> 5: entire other tag, without '<' '>' * <li> 6: element * <li> 7: META * <li> 8: !-- comment -- */// version w/ less unnecessary backtracking private static final int MAX_ELEMENT_LENGTH = Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() + ".maxElementNameLength", "1024")); static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2 "|((style[^>]*+)>.*?</style)" + // 3, 4 "|(((meta)|(?:\\w{1,"+MAX_ELEMENT_LENGTH+"}))\\s+[^>]*+)" + // 5, 6, 7 "|(!--.*?--))>"; // 8 // version w/ problems with unclosed script tags // static final String RELEVANT_TAG_EXTRACTOR =// "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>"; // // this pattern extracts 'href' or 'src' attributes from// // any open-tag innards matched by the above// static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(// "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");//// // this pattern extracts 'robots' attributes// static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(// "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))"); private static final int MAX_ATTR_NAME_LENGTH = Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() + ".maxAttributeNameLength", "1024")); // 1K; static final int MAX_ATTR_VAL_LENGTH = Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() + ".maxAttributeValueLength", "16384")); // 16K; // TODO: perhaps cut to near MAX_URI_LENGTH // this pattern extracts attributes from any open-tag innards // matched by the above. attributes known to be URIs of various // sorts are matched specially static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\b((href)|(action)|(on\\w*)" // 1, 2, 3, 4 +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ... +"|(?:usemap)|(?:profile)|(?:datasrc))" // 5 +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9 +"|(value)|(style)|(method)" // 10, 11, 12 +"|([-\\w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 13 +"\\s*=\\s*" +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 14 +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 15 +"|(\\S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 16 // groups: // 1: attribute name // 2: HREF - single URI relative to doc base, or occasionally javascript: // 3: ACTION - single URI relative to doc base, or occasionally javascript: // 4: ON[WHATEVER] - script handler // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC // single URI relative to doc base // 6: CODEBASE - a single URI relative to doc base, affecting other // attributes // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied) // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE // (if supplied) // 9: CODE - a single URI relative to the CODEBASE (is specified). // 10: VALUE - often includes a uri path on forms // 11: STYLE - inline attribute style info // 12: METHOD - form GET/POST // 13: any other attribute // 14: double-quote delimited attr value // 15: single-quote delimited attr value // 16: space-delimited attr value // much like the javascript likely-URI extractor, but // without requiring quotes -- this can indicate whether // an HTML tag attribute that isn't definitionally a // URI might be one anyway, as in form-tag VALUE attributes static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)"; static final String WHITESPACE = "\\s"; static final String CLASSEXT =".class"; static final String APPLET = "applet"; static final String BASE = "base"; static final String LINK = "link"; static final String FRAME = "frame"; static final String IFRAME = "iframe"; public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS = "treat-frames-as-embed-links"; public static final String ATTR_IGNORE_FORM_ACTION_URLS = "ignore-form-action-urls"; public static final String ATTR_EXTRACT_ONLY_FORM_GETS = "extract-only-form-gets"; /** whether to try finding links in Javscript; default true */ public static final String ATTR_EXTRACT_JAVASCRIPT = "extract-javascript"; public static final String EXTRACT_VALUE_ATTRIBUTES = "extract-value-attributes"; public static final String ATTR_IGNORE_UNEXPECTED_HTML = "ignore-unexpected-html"; protected long numberOfCURIsHandled = 0; protected long numberOfLinksExtracted = 0; public ExtractorHTML(String name) { this(name, "HTML extractor. Extracts links from HTML documents"); } public ExtractorHTML(String name, String description) { super(name, description); Type t = addElementToDefinition( new SimpleType(ATTR_EXTRACT_JAVASCRIPT, "If true, in-page Javascript is scanned for strings that " + "appear likely to be URIs. This typically finds both valid " + "and invalid URIs, and attempts to fetch the invalid URIs " + "sometimes generates webmaster concerns over odd crawler " + "behavior. Default is true.", Boolean.TRUE)); t.setExpertSetting(true); t = addElementToDefinition( new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS, "If true, FRAME/IFRAME SRC-links are treated as embedded " + "resources (like IMG, 'E' hop-type), otherwise they are " + "treated as navigational links. Default is true.", Boolean.TRUE)); t.setExpertSetting(true); t = addElementToDefinition( new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS, "If true, URIs appearing as the ACTION attribute in " + "HTML FORMs are ignored. Default is false.", Boolean.FALSE)); t.setExpertSetting(true); t = addElementToDefinition( new SimpleType(ATTR_EXTRACT_ONLY_FORM_GETS, "If true, only HTML FORM ACTIONs associated with the GET "+ "method are extracted. (Form ACTIONs with method POST "+ "will be ignored. Default is true", Boolean.TRUE)); t.setExpertSetting(true); t = addElementToDefinition( new SimpleType(EXTRACT_VALUE_ATTRIBUTES, "If true, strings that look like URIs found in element VALUE " + "attributes (which are sometimes used as URIs by in-page " + "Javascript or server-side redirects) will be extracted. " + "This typically finds both valid and invalid URIs, and " + "attempts to fetch the invalid URIs sometimes generate " + "webmaster concerns over odd crawler behavior. Default " + "is true.", Boolean.TRUE)); t.setExpertSetting(true); t = addElementToDefinition( new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML, "If true, URIs which end in typical non-HTML extensions " + "(such as .gif) will not be scanned as if it were HTML. " + "Default is true.", Boolean.TRUE)); t.setExpertSetting(true); } protected void processGeneralTag(CrawlURI curi, CharSequence element, CharSequence cs) { Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); // Just in case it's an OBJECT or APPLET tag String codebase = null; ArrayList<String> resources = null; // Just in case it's a FORM CharSequence action = null; CharSequence actionContext = null; CharSequence method = null; final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi, ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue(); final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue(); final boolean extractValueAttributes = ((Boolean)getUncheckedAttribute (curi, EXTRACT_VALUE_ATTRIBUTES)).booleanValue();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -