📄 extractorhtml.java
字号:
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * SimpleHTMLExtractor.java * Created on Jun 5, 2003 * * $Header: /cvsroot/archive-crawler/ArchiveOpenCrawler/src/java/org/archive/crawler/extractor/ExtractorHTML.java,v 1.79 2006/08/22 03:21:54 stack-sf Exp $ */package org.archive.crawler.extractor;import java.io.IOException;import java.util.ArrayList;import java.util.Iterator;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import org.apache.commons.httpclient.URIException;import org.archive.crawler.datamodel.CoreAttributeConstants;import org.archive.crawler.datamodel.CrawlURI;import org.archive.crawler.datamodel.RobotsHonoringPolicy;import org.archive.crawler.settings.SimpleType;import org.archive.crawler.settings.Type;import org.archive.io.ReplayCharSequence;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.DevUtils;import org.archive.util.HttpRecorder;import org.archive.util.TextUtils;/** * Basic link-extraction, from an HTML content-body, * using regular expressions. * * @author gojomo * */public class ExtractorHTML extends Extractorimplements CoreAttributeConstants { private static Logger logger = Logger.getLogger(ExtractorHTML.class.getName()); /** * Compiled relevant tag extractor. * * <p> * This pattern extracts either: * <li> (1) whole <script>...</script> or * <li> (2) <style>...</style> or * <li> (3) <meta ...> or * <li> (4) any other open-tag with at least one attribute * (eg matches "<a href='boo'>" but not "</a>" or "<br>") * <p> * groups: * <li> 1: SCRIPT SRC=foo>boo</SCRIPT * <li> 2: just script open tag * <li> 3: STYLE TYPE=moo>zoo</STYLE * <li> 4: just style open tag * <li> 5: entire other tag, without '<' '>' * <li> 6: element * <li> 7: META * <li> 8: !-- comment -- */// version w/ less unnecessary backtracking private static final int MAX_ELEMENT_LENGTH = Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() + ".maxElementNameLength", "1024")); static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2 "|((style[^>]*+)>[^<]*+</style)" + // 3, 4 "|(((meta)|(?:\\w{1,"+MAX_ELEMENT_LENGTH+"}))\\s+[^>]*+)" + // 5, 6, 7 "|(!--.*?--))>"; // 8 // version w/ problems with unclosed script tags // static final String RELEVANT_TAG_EXTRACTOR =// "(?is)<(?:((script.*?)>.*?</script)|((style.*?)>.*?</style)|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>"; // // this pattern extracts 'href' or 'src' attributes from// // any open-tag innards matched by the above// static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(// "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");//// // this pattern extracts 'robots' attributes// static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(// "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))"); private static final int MAX_ATTR_NAME_LENGTH = Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() + ".maxAttributeNameLength", "1024")); // 1K; static final int MAX_ATTR_VAL_LENGTH = Integer.parseInt(System.getProperty(ExtractorHTML.class.getName() + ".maxAttributeValueLength", "16384")); // 16K; // TODO: perhaps cut to near MAX_URI_LENGTH // this pattern extracts attributes from any open-tag innards // matched by the above. attributes known to be URIs of various // sorts are matched specially static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s((href)|(action)|(on\\w*)" // 1, 2, 3, 4 +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" // ... +"|(?:usemap)|(?:profile)|(?:datasrc))" // 5 +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9 +"|(value)|(style)|([-\\w]{1,"+MAX_ATTR_NAME_LENGTH+"}))" // 10, 11, 12 +"\\s*=\\s*" +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:\"|$))" // 13 +"|(?:'(.{0,"+MAX_ATTR_VAL_LENGTH+"}?)(?:'|$))" // 14 +"|(\\S{1,"+MAX_ATTR_VAL_LENGTH+"}))"; // 15 // groups: // 1: attribute name // 2: HREF - single URI relative to doc base, or occasionally javascript: // 3: ACTION - single URI relative to doc base, or occasionally javascript: // 4: ON[WHATEVER] - script handler // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC // single URI relative to doc base // 6: CODEBASE - a single URI relative to doc base, affecting other // attributes // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied) // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE // (if supplied) // 9: CODE - a single URI relative to the CODEBASE (is specified). // 10: VALUE - often includes a uri path on forms // 11: STYLE - inline attribute style info // 12: any other attribute // 13: double-quote delimited attr value // 14: single-quote delimited attr value // 15: space-delimited attr value // much like the javascript likely-URI extractor, but // without requiring quotes -- this can indicate whether // an HTML tag attribute that isn't definitionally a // URI might be one anyway, as in form-tag VALUE attributes static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)"; static final String WHITESPACE = "\\s"; static final String CLASSEXT =".class"; static final String APPLET = "applet"; static final String BASE = "base"; static final String LINK = "link"; static final String FRAME = "frame"; static final String IFRAME = "iframe"; public static final String ATTR_TREAT_FRAMES_AS_EMBED_LINKS = "treat-frames-as-embed-links"; public static final String ATTR_IGNORE_FORM_ACTION_URLS = "ignore-form-action-urls"; public static final String ATTR_OVERLY_EAGER_LINK_DETECTION = "overly-eager-link-detection"; public static final String ATTR_IGNORE_UNEXPECTED_HTML = "ignore-unexpected-html"; protected long numberOfCURIsHandled = 0; protected long numberOfLinksExtracted = 0; public ExtractorHTML(String name) { this(name, "HTML extractor. Extracts links from HTML documents"); } public ExtractorHTML(String name, String description) { super(name, description); Type t = addElementToDefinition( new SimpleType(ATTR_TREAT_FRAMES_AS_EMBED_LINKS, "If enabled, FRAME/IFRAME SRC-links are treated as embedded " + "resources (IMG etc.), otherwise they are treated as " + "navigational links", Boolean.TRUE)); t.setExpertSetting(true); t = addElementToDefinition( new SimpleType(ATTR_IGNORE_FORM_ACTION_URLS, "If enabled, links appearing as the ACTION attribute in " + "HTML FORMs are not extracted.", Boolean.FALSE)); t.setExpertSetting(true); t = addElementToDefinition( new SimpleType(ATTR_OVERLY_EAGER_LINK_DETECTION, "If disabled (default is enabled), possible links will not be "+ "queued if they are placed in (somewhat) unlikely places " + "(Currently, just ignores URLs in HTML value attributes).", Boolean.TRUE)); t.setExpertSetting(true); t = addElementToDefinition( new SimpleType(ATTR_IGNORE_UNEXPECTED_HTML, "If enabled, html that is detected in unusual or unexpected " + "places is not considerd for processing.", Boolean.TRUE)); t.setExpertSetting(true); } protected void processGeneralTag(CrawlURI curi, CharSequence element, CharSequence cs) { Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); // Just in case it's an OBJECT or APPLET tag String codebase = null; ArrayList resources = null; final boolean framesAsEmbeds = ((Boolean)getUncheckedAttribute(curi, ATTR_TREAT_FRAMES_AS_EMBED_LINKS)).booleanValue(); final boolean ignoreFormActions = ((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue(); final boolean overlyEagerLinkDetection = ((Boolean)getUncheckedAttribute (curi, ATTR_OVERLY_EAGER_LINK_DETECTION)).booleanValue(); final String elementStr = element.toString(); while (attr.find()) { int valueGroup = (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15; int start = attr.start(valueGroup); int end = attr.end(valueGroup); assert start >= 0: "Start is: " + start + ", " + curi; assert end >= 0: "End is :" + end + ", " + curi; CharSequence value = cs.subSequence(start, end); value = TextUtils.unescapeHtml(value); if (attr.start(2) > -1) { // HREF CharSequence context = Link.elementContext(element, attr.group(2)); if(elementStr.equalsIgnoreCase(LINK)) { // <LINK> elements treated as embeds (css, ico, etc) processEmbed(curi, value, context); } else { // other HREFs treated as links processLink(curi, value, context); } if (elementStr.equalsIgnoreCase(BASE)) { try { curi.setBaseURI(value.toString()); } catch (URIException e) { if (getController() != null) { // Controller can be null: e.g. when running // ExtractorTool. getController().logUriError(e, curi.getUURI(), value.toString()); } else { logger.info("Failed set base uri: " + curi + ", " + value.toString() + ": " + e.getMessage()); } } } } else if (attr.start(3) > -1) { // ACTION if (!ignoreFormActions) { CharSequence context = Link.elementContext(element, attr.group(3)); processLink(curi, value, context); } } else if (attr.start(4) > -1) { // ON____ processScriptCode(curi, value); // TODO: context? } else if (attr.start(5) > -1) { // SRC etc. CharSequence context = Link.elementContext(element, attr.group(5)); // true, if we expect another HTML page instead of an image etc. final char hopType; if(!framesAsEmbeds && (elementStr.equalsIgnoreCase(FRAME) || elementStr .equalsIgnoreCase(IFRAME))) { hopType = Link.NAVLINK_HOP; } else { hopType = Link.EMBED_HOP; } processEmbed(curi, value, context, hopType); } else if (attr.start(6) > -1) { // CODEBASE codebase = (value instanceof String)? (String)value: value.toString(); CharSequence context = Link.elementContext(element, attr.group(6)); processEmbed(curi, codebase, context); } else if (attr.start(7) > -1) { // CLASSID, DATA if (resources == null) { resources = new ArrayList(); } resources.add(value.toString()); } else if (attr.start(8) > -1) { // ARCHIVE if (resources==null) { resources = new ArrayList(); } String[] multi = TextUtils.split(WHITESPACE, value); for(int i = 0; i < multi.length; i++ ) { resources.add(multi[i]); } } else if (attr.start(9) > -1) { // CODE if (resources==null) { resources = new ArrayList(); } // If element is applet and code value does not end with // '.class' then append '.class' to the code value. if (elementStr.equalsIgnoreCase(APPLET) && !value.toString().toLowerCase().endsWith(CLASSEXT)) { resources.add(value.toString() + CLASSEXT); } else { resources.add(value.toString()); } } else if (attr.start(10) > -1) { // VALUE, with possibility of URI if (TextUtils.matches(LIKELY_URI_PATH, value) && overlyEagerLinkDetection) { CharSequence context = Link.elementContext(element, attr.group(10)); processLink(curi,value, context); } } else if (attr.start(11) > -1) { // STYLE inline attribute // then, parse for URIs this.numberOfLinksExtracted += ExtractorCSS.processStyleCode( curi, value, getController()); } else if (attr.start(12) > -1) { // any other attribute // ignore for now // could probe for path- or script-looking strings, but // those should be vanishingly rare in other attributes, // and/or symptomatic of page bugs } } TextUtils.recycleMatcher(attr); // handle codebase/resources if (resources == null) { return; } Iterator iter = resources.iterator(); UURI codebaseURI = null; String res = null; try { if (codebase != null) { // TODO: Pass in the charset. codebaseURI = UURIFactory. getInstance(curi.getUURI(), codebase); } while(iter.hasNext()) { res = iter.next().toString(); res = (String) TextUtils.unescapeHtml(res); if (codebaseURI != null) { res = codebaseURI.resolve(res).toString();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -