📄 regexphtmllinkextractor.java
字号:
/* Copyright (C) 2003 Internet Archive. * * This file is part of the Heritrix web crawler (crawler.archive.org). * * Heritrix is free software; you can redistribute it and/or modify * it under the terms of the GNU Lesser Public License as published by * the Free Software Foundation; either version 2.1 of the License, or * any later version. * * Heritrix is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Lesser Public License for more details. * * You should have received a copy of the GNU Lesser Public License * along with Heritrix; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * SimpleHTMLExtractor.java * Created on Jun 5, 2003 * * $Header$ */package org.archive.extractor;import java.util.ArrayList;import java.util.Iterator;import java.util.LinkedList;import java.util.logging.Level;import java.util.logging.Logger;import java.util.regex.Matcher;import org.apache.commons.httpclient.URIException;import org.archive.crawler.extractor.Link;import org.archive.net.UURI;import org.archive.net.UURIFactory;import org.archive.util.DevUtils;import org.archive.util.TextUtils;/** * Basic link-extraction, from an HTML content-body, * using regular expressions. * * ROUGH DRAFT IN PROGRESS / incomplete... untested... * * @author gojomo */public class RegexpHTMLLinkExtractor extends CharSequenceLinkExtractor { private static Logger logger = Logger.getLogger(RegexpHTMLLinkExtractor.class.getName()); boolean honorRobots = true; boolean extractInlineCss = true; boolean extractInlineJs = true; protected LinkedList<Link> next = new LinkedList<Link>(); protected Matcher tags; /* (non-Javadoc) * @see org.archive.extractor.CharSequenceLinkExtractor#findNextLink() */ protected boolean findNextLink() { if (tags == null) { tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, sourceContent); } while(tags.find()) { if(Thread.interrupted()){ // TODO: throw an exception, perhaps, rather than just clear & break? break; } if (tags.start(8) > 0) { // comment match // for now do nothing } else if (tags.start(7) > 0) { // <meta> match int start = tags.start(5); int end = tags.end(5); processMeta(sourceContent.subSequence(start, end)); } else if (tags.start(5) > 0) { // generic <whatever> match int start5 = tags.start(5); int end5 = tags.end(5); int start6 = tags.start(6); int end6 = tags.end(6); processGeneralTag(sourceContent.subSequence(start6, end6), sourceContent.subSequence(start5, end5)); } else if (tags.start(1) > 0) { // <script> match int start = tags.start(1); int end = tags.end(1); processScript(sourceContent.subSequence(start, end), tags.end(2) - start); } else if (tags.start(3) > 0){ // <style... match int start = tags.start(3); int end = tags.end(3); processStyle(sourceContent.subSequence(start, end), tags.end(4) - start); } if(!next.isEmpty()) { // at least one link found return true; } } // no relevant tags found return false; } /** * Compiled relevant tag extractor. * * <p> * This pattern extracts either: * <li> (1) whole <script>...</script> or * <li> (2) <style>...</style> or * <li> (3) <meta ...> or * <li> (4) any other open-tag with at least one attribute * (eg matches "<a href='boo'>" but not "</a>" or "<br>") * <p> * groups: * <li> 1: SCRIPT SRC=foo>boo</SCRIPT * <li> 2: just script open tag * <li> 3: STYLE TYPE=moo>zoo</STYLE * <li> 4: just style open tag * <li> 5: entire other tag, without '<' '>' * <li> 6: element * <li> 7: META * <li> 8: !-- comment -- */ static final String RELEVANT_TAG_EXTRACTOR = "(?is)<(?:((script[^>]*+)>.*?</script)|((style[^>]*+)>[^<]*+</style)|(((meta)|(?:\\w+))\\s+[^>]*+)|(!--.*?--))>"; // this pattern extracts attributes from any open-tag innards // matched by the above. attributes known to be URIs of various // sorts are matched specially static final String EACH_ATTRIBUTE_EXTRACTOR = "(?is)\\s((href)|(action)|(on\\w*)" +"|((?:src)|(?:lowsrc)|(?:background)|(?:cite)|(?:longdesc)" +"|(?:usemap)|(?:profile)|(?:datasrc)|(?:for))" +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" +"|(value)|([-\\w]+))" +"\\s*=\\s*" +"(?:(?:\"(.*?)(?:\"|$))" +"|(?:'(.*?)(?:'|$))" +"|(\\S+))"; // groups: // 1: attribute name // 2: HREF - single URI relative to doc base, or occasionally javascript: // 3: ACTION - single URI relative to doc base, or occasionally javascript: // 4: ON[WHATEVER] - script handler // 5: SRC,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE,DATASRC, or FOR // single URI relative to doc base // 6: CODEBASE - a single URI relative to doc base, affecting other // attributes // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied) // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE // (if supplied) // 9: CODE - a single URI relative to the CODEBASE (is specified). // 10: VALUE - often includes a uri path on forms // 11: any other attribute // 12: double-quote delimited attr value // 13: single-quote delimited attr value // 14: space-delimited attr value // much like the javascript likely-URI extractor, but // without requiring quotes -- this can indicate whether // an HTML tag attribute that isn't definitionally a // URI might be one anyway, as in form-tag VALUE attributes static final String LIKELY_URI_PATH = "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)"; static final String ESCAPED_AMP = "&"; static final String AMP ="&"; static final String WHITESPACE = "\\s"; static final String CLASSEXT =".class"; static final String APPLET = "applet"; static final String BASE = "base"; static final String LINK = "link"; protected boolean processGeneralTag(CharSequence element, CharSequence cs) { Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); // Just in case it's an OBJECT or APPLET tag String codebase = null; ArrayList<String> resources = null; long tally = next.size(); while (attr.find()) { int valueGroup = (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14; int start = attr.start(valueGroup); int end = attr.end(valueGroup); CharSequence value = cs.subSequence(start, end); if (attr.start(2) > -1) { // HREF CharSequence context = Link.elementContext(element, attr.group(2)); if(element.toString().equalsIgnoreCase(LINK)) { // <LINK> elements treated as embeds (css, ico, etc) processEmbed(value, context); } else { if (element.toString().equalsIgnoreCase(BASE)) { try { base = UURIFactory.getInstance(value.toString()); } catch (URIException e) { extractErrorListener.noteExtractError(e,source,value); } } // other HREFs treated as links processLink(value, context); } } else if (attr.start(3) > -1) { // ACTION CharSequence context = Link.elementContext(element, attr.group(3)); processLink(value, context); } else if (attr.start(4) > -1) { // ON____ processScriptCode(value); // TODO: context? } else if (attr.start(5) > -1) { // SRC etc. CharSequence context = Link.elementContext(element, attr.group(5)); processEmbed(value, context); } else if (attr.start(6) > -1) { // CODEBASE // TODO: more HTML deescaping? codebase = TextUtils.replaceAll(ESCAPED_AMP, value, AMP); CharSequence context = Link.elementContext(element,attr.group(6)); processEmbed(codebase, context); } else if (attr.start(7) > -1) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -