📄 extractorhtml.java
字号:
final String elementStr = element.toString(); while (attr.find()) { int valueGroup = (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16; int start = attr.start(valueGroup); int end = attr.end(valueGroup); assert start >= 0: "Start is: " + start + ", " + curi; assert end >= 0: "End is :" + end + ", " + curi; CharSequence value = cs.subSequence(start, end); value = TextUtils.unescapeHtml(value); if (attr.start(2) > -1) { // HREF CharSequence context = Link.elementContext(element, attr.group(2)); if(elementStr.equalsIgnoreCase(LINK)) { // <LINK> elements treated as embeds (css, ico, etc) processEmbed(curi, value, context); } else { // other HREFs treated as links processLink(curi, value, context); } if (elementStr.equalsIgnoreCase(BASE)) { try { curi.setBaseURI(value.toString()); } catch (URIException e) { if (getController() != null) { // Controller can be null: e.g. when running // ExtractorTool. getController().logUriError(e, curi.getUURI(), value.toString()); } else { logger.info("Failed set base uri: " + curi + ", " + value.toString() + ": " + e.getMessage()); } } } } else if (attr.start(3) > -1) { // ACTION if (!ignoreFormActions) { action = value; actionContext = Link.elementContext(element, attr.group(3)); // handling finished only at end (after METHOD also collected) } } else if (attr.start(4) > -1) { // ON____ processScriptCode(curi, value); // TODO: context? } else if (attr.start(5) > -1) { // SRC etc. CharSequence context = Link.elementContext(element, attr.group(5)); // true, if we expect another HTML page instead of an image etc. final char hopType; if(!framesAsEmbeds && (elementStr.equalsIgnoreCase(FRAME) || elementStr .equalsIgnoreCase(IFRAME))) { hopType = Link.NAVLINK_HOP; } else { hopType = Link.EMBED_HOP; } processEmbed(curi, value, context, hopType); } else if (attr.start(6) > -1) { // CODEBASE codebase = (value instanceof String)? (String)value: value.toString(); CharSequence context = Link.elementContext(element, attr.group(6)); processEmbed(curi, codebase, context); } else if (attr.start(7) > -1) { // CLASSID, DATA if (resources == null) { resources = new ArrayList<String>(); } resources.add(value.toString()); } else if (attr.start(8) > -1) { // ARCHIVE if (resources==null) { resources = new ArrayList<String>(); } String[] multi = TextUtils.split(WHITESPACE, value); for(int i = 0; i < multi.length; i++ ) { resources.add(multi[i]); } } else if (attr.start(9) > -1) { // CODE if (resources==null) { resources = new ArrayList<String>(); } // If element is applet and code value does not end with // '.class' then append '.class' to the code value. if (elementStr.equalsIgnoreCase(APPLET) && !value.toString().toLowerCase().endsWith(CLASSEXT)) { resources.add(value.toString() + CLASSEXT); } else { resources.add(value.toString()); } } else if (attr.start(10) > -1) { // VALUE, with possibility of URI if (extractValueAttributes && TextUtils.matches(LIKELY_URI_PATH, value)) { CharSequence context = Link.elementContext(element, attr.group(10)); processLink(curi,value, context); } } else if (attr.start(11) > -1) { // STYLE inline attribute // then, parse for URIs this.numberOfLinksExtracted += ExtractorCSS.processStyleCode( curi, value, getController()); } else if (attr.start(12) > -1) { // METHOD method = value; // form processing finished at end (after ACTION also collected) } else if (attr.start(13) > -1) { // any other attribute // ignore for now // could probe for path- or script-looking strings, but // those should be vanishingly rare in other attributes, // and/or symptomatic of page bugs } } TextUtils.recycleMatcher(attr); // finish handling codebase/resources now that all available if (resources != null) { Iterator iter = resources.iterator(); UURI codebaseURI = null; String res = null; try { if (codebase != null) { // TODO: Pass in the charset. codebaseURI = UURIFactory. getInstance(curi.getUURI(), codebase); } while(iter.hasNext()) { res = iter.next().toString(); res = (String) TextUtils.unescapeHtml(res); if (codebaseURI != null) { res = codebaseURI.resolve(res).toString(); } processEmbed(curi, res, element); // TODO: include attribute too } } catch (URIException e) { curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase); } catch (IllegalArgumentException e) { DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e); } } // finish handling form action, now method is available if(action != null) { if(method == null || "GET".equalsIgnoreCase(method.toString()) || ! ((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) { processLink(curi, action, actionContext); } } } /** * Extract the (java)script source in the given CharSequence. * * @param curi source CrawlURI * @param cs CharSequence of javascript code */ protected void processScriptCode(CrawlURI curi, CharSequence cs) { if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) { this.numberOfLinksExtracted += ExtractorJS.considerStrings(curi, cs, getController(), false); } // else do nothing } static final String JAVASCRIPT = "(?i)^javascript:.*"; /** * Handle generic HREF cases. * * @param curi * @param value * @param context */ protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) { if (TextUtils.matches(JAVASCRIPT, value)) { processScriptCode(curi, value. subSequence(11, value.length())); } else { if (logger.isLoggable(Level.FINEST)) { logger.finest("link: " + value.toString() + " from " + curi); } addLinkFromString(curi, (value instanceof String)? (String)value: value.toString(), context, Link.NAVLINK_HOP); this.numberOfLinksExtracted++; } } private void addLinkFromString(CrawlURI curi, String uri, CharSequence context, char hopType) { try { // We do a 'toString' on context because its a sequence from // the underlying ReplayCharSequence and the link its about // to become a part of is expected to outlive the current // ReplayCharSequence. curi.createAndAddLinkRelativeToBase(uri, context.toString(), hopType); } catch (URIException e) { if (getController() != null) { getController().logUriError(e, curi.getUURI(), uri); } else { logger.info("Failed createAndAddLinkRelativeToBase " + curi + ", " + uri + ", " + context + ", " + hopType + ": " + e); } } } protected final void processEmbed(CrawlURI curi, CharSequence value, CharSequence context) { processEmbed(curi, value, context, Link.EMBED_HOP); } protected void processEmbed(CrawlURI curi, final CharSequence value, CharSequence context, char hopType) { if (logger.isLoggable(Level.FINEST)) { logger.finest("embed (" + hopType + "): " + value.toString() + " from " + curi); } addLinkFromString(curi, (value instanceof String)? (String)value: value.toString(), context, hopType); this.numberOfLinksExtracted++; } public void extract(CrawlURI curi) { if (!isHttpTransactionContentToProcess(curi) || ! (isExpectedMimeType(curi.getContentType(), "text/html") || isExpectedMimeType(curi.getContentType(), "application/xhtml") || isExpectedMimeType(curi.getContentType(), "text/vnd.wap.wml") || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.wml") || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.xhtml"))) { return; } final boolean ignoreUnexpectedHTML = ((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue(); if (ignoreUnexpectedHTML) { try { if(!isHtmlExpectedHere(curi)) { // HTML was not expected (eg a GIF was expected) so ignore // (as if a soft 404) return; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -