📄 extractorhtml.java
字号:
} processEmbed(curi, res, element); // TODO: include attribute too } } catch (URIException e) { curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase); } catch (IllegalArgumentException e) { DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e); } } // finds strings in javascript likely to be URIs/paths // guessing based on '.' in string, so if highly likely to // get gifs/etc, unable to get many other paths // will find false positives // TODO: add '/' check, suppress strings being concatenated via '+'? static final String JAVASCRIPT_LIKELY_URI_EXTRACTOR = "(\\\\{0,8}+\"|\\\\{0,8}+\')(\\.{0,2}[^+\\.\\n\\r\\s\"\']+[^\\.\\n\\r\\s\"\']*(\\.[^\\.\\n\\r\\s\"\']+)+)(\\1)"; /** * @param curi * @param cs */ protected void processScriptCode(CrawlURI curi, CharSequence cs) { this.numberOfLinksExtracted += ExtractorJS.considerStrings(curi, cs, getController(), false); } static final String JAVASCRIPT = "(?i)^javascript:.*"; /** * Handle generic HREF cases. * * @param curi * @param value * @param context */ protected void processLink(CrawlURI curi, final CharSequence value, CharSequence context) { if (TextUtils.matches(JAVASCRIPT, value)) { processScriptCode(curi, value. subSequence(11, value.length())); } else { if (logger.isLoggable(Level.FINEST)) { logger.finest("link: " + value.toString() + " from " + curi); } addLinkFromString(curi, (value instanceof String)? (String)value: value.toString(), context, Link.NAVLINK_HOP); this.numberOfLinksExtracted++; } } private void addLinkFromString(CrawlURI curi, String uri, CharSequence context, char hopType) { try { // We do a 'toString' on context because its a sequence from // the underlying ReplayCharSequence and the link its about // to become a part of is expected to outlive the current // ReplayCharSequence. curi.createAndAddLinkRelativeToBase(uri, context.toString(), hopType); } catch (URIException e) { if (getController() != null) { getController().logUriError(e, curi.getUURI(), uri); } else { logger.info("Failed createAndAddLinkRelativeToBase " + curi + ", " + uri + ", " + context + ", " + hopType + ": " + e); } } } protected final void processEmbed(CrawlURI curi, CharSequence value, CharSequence context) { processEmbed(curi, value, context, Link.EMBED_HOP); } protected void processEmbed(CrawlURI curi, final CharSequence value, CharSequence context, char hopType) { if (logger.isLoggable(Level.FINEST)) { logger.finest("embed (" + hopType + "): " + value.toString() + " from " + curi); } addLinkFromString(curi, (value instanceof String)? (String)value: value.toString(), context, hopType); this.numberOfLinksExtracted++; } public void extract(CrawlURI curi) { if (!isHttpTransactionContentToProcess(curi) || ! (isExpectedMimeType(curi.getContentType(), "text/html") || isExpectedMimeType(curi.getContentType(), "application/xhtml"))) { return; } final boolean ignoreUnexpectedHTML = ((Boolean)getUncheckedAttribute(curi, ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue(); if (ignoreUnexpectedHTML) { try { if(!isHtmlExpectedHere(curi)) { // HTML was not expected (eg a GIF was expected) so ignore // (as if a soft 404) return; } } catch (URIException e) { logger.severe("Failed expectedHTML test: " + e.getMessage()); } } this.numberOfCURIsHandled++; ReplayCharSequence cs = null; try { HttpRecorder hr = curi.getHttpRecorder(); if (hr == null) { throw new IOException("Why is recorder null here?"); } cs = hr.getReplayCharSequence(); } catch (IOException e) { curi.addLocalizedError(this.getName(), e, "Failed get of replay char sequence " + curi.toString() + " " + e.getMessage()); logger.log(Level.SEVERE,"Failed get of replay char sequence in " + Thread.currentThread().getName(), e); } if (cs == null) { return; } // We have a ReplayCharSequence open. Wrap all in finally so we // for sure close it before we leave. try { // Extract all links from the charsequence extract(curi, cs); // Set flag to indicate that link extraction is completed. curi.linkExtractorFinished(); } finally { if (cs != null) { try { cs.close(); } catch (IOException ioe) { logger.warning(TextUtils.exceptionToString( "Failed close of ReplayCharSequence.", ioe)); } } } } /** * Run extractor. * This method is package visible to ease testing. * @param curi CrawlURI we're processing. * @param cs Sequence from underlying ReplayCharSequence. This * is TRANSIENT data. Make a copy if you want the data to live outside * of this extractors' lifetime. */ void extract(CrawlURI curi, CharSequence cs) { Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs); while(tags.find()) { if(Thread.interrupted()){ break; } if (tags.start(8) > 0) { // comment match // for now do nothing } else if (tags.start(7) > 0) { // <meta> match int start = tags.start(5); int end = tags.end(5); assert start >= 0: "Start is: " + start + ", " + curi; assert end >= 0: "End is :" + end + ", " + curi; if (processMeta(curi, cs.subSequence(start, end))) { // meta tag included NOFOLLOW; abort processing break; } } else if (tags.start(5) > 0) { // generic <whatever> match int start5 = tags.start(5); int end5 = tags.end(5); assert start5 >= 0: "Start is: " + start5 + ", " + curi; assert end5 >= 0: "End is :" + end5 + ", " + curi; int start6 = tags.start(6); int end6 = tags.end(6); assert start6 >= 0: "Start is: " + start6 + ", " + curi; assert end6 >= 0: "End is :" + end6 + ", " + curi; processGeneralTag(curi, cs.subSequence(start6, end6), cs.subSequence(start5, end5)); } else if (tags.start(1) > 0) { // <script> match int start = tags.start(1); int end = tags.end(1); assert start >= 0: "Start is: " + start + ", " + curi; assert end >= 0: "End is :" + end + ", " + curi; assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) + ", " + curi; processScript(curi, cs.subSequence(start, end), tags.end(2) - start); } else if (tags.start(3) > 0){ // <style... match int start = tags.start(3); int end = tags.end(3); assert start >= 0: "Start is: " + start + ", " + curi; assert end >= 0: "End is :" + end + ", " + curi; assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) + ", " + curi; processStyle(curi, cs.subSequence(start, end), tags.end(4) - start); } } TextUtils.recycleMatcher(tags); } static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+ "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)"; /** * Test whether this HTML is so unexpected (eg in place of a GIF URI) * that it shouldn't be scanned for links. * * @param curi CrawlURI to examine. * @return True if HTML is acceptable/expected here * @throws URIException */ protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException { String path = curi.getUURI().getPath(); if(path==null) { // no path extension, HTML is fine return true; } int dot = path.lastIndexOf('.'); if (dot < 0) { // no path extension, HTML is fine return true; } if(dot<(path.length()-5)) { // extension too long to recognize, HTML is fine return true; } String ext = path.substring(dot+1); return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext); } protected void processScript(CrawlURI curi, CharSequence sequence, int endOfOpenTag) { // for now, do nothing // TODO: best effort extraction of strings // first, get attributes of script-open tag // as per any other tag processGeneralTag(curi,sequence.subSequence(0,6), sequence.subSequence(0,endOfOpenTag)); // then, apply best-effort string-analysis heuristics // against any code present (false positives are OK) processScriptCode( curi, sequence.subSequence(endOfOpenTag, sequence.length())); } /** * Process metadata tags. * @param curi CrawlURI we're processing. * @param cs Sequence from underlying ReplayCharSequence. This * is TRANSIENT data. Make a copy if you want the data to live outside * of this extractors' lifetime. * @return True robots exclusion metatag. */ protected boolean processMeta(CrawlURI curi, CharSequence cs) { Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); String name = null; String httpEquiv = null; String content = null; while (attr.find()) { int valueGroup = (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15; CharSequence value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup)); if (attr.group(1).equalsIgnoreCase("name")) { name = value.toString(); } else if (attr.group(1).equalsIgnoreCase("http-equiv")) { httpEquiv = value.toString(); } else if (attr.group(1).equalsIgnoreCase("content")) { content = value.toString(); } // TODO: handle other stuff } TextUtils.recycleMatcher(attr); // Look for the 'robots' meta-tag if("robots".equalsIgnoreCase(name) && content != null ) { curi.putString(A_META_ROBOTS, content); RobotsHonoringPolicy policy = getSettingsHandler().getOrder().getRobotsHonoringPolicy(); String contentLower = content.toLowerCase(); if ((policy == null || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE) && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM))) && (contentLower.indexOf("nofollow") >= 0 || contentLower.indexOf("none") >= 0)) { // if 'nofollow' or 'none' is specified and the // honoring policy is not IGNORE or CUSTOM, end html extraction logger.fine("HTML extraction skipped due to robots meta-tag for: " + curi.toString()); return true; } } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) { String refreshUri = content.substring(content.indexOf("=") + 1); try { curi.createAndAddLinkRelativeToBase(refreshUri, "meta", Link.REFER_HOP); } catch (URIException e) { if (getController() != null) { getController().logUriError(e, curi.getUURI(), refreshUri); } else { logger.info("Failed createAndAddLinkRelativeToBase " + curi + ", " + cs + ", " + refreshUri + ": " + e); } } } return false; } /** * Process style text. * @param curi CrawlURI we're processing. * @param sequence Sequence from underlying ReplayCharSequence. This * is TRANSIENT data. Make a copy if you want the data to live outside * of this extractors' lifetime. * @param endOfOpenTag */ protected void processStyle(CrawlURI curi, CharSequence sequence, int endOfOpenTag) { // First, get attributes of script-open tag as per any other tag. processGeneralTag(curi, sequence.subSequence(0,6), sequence.subSequence(0,endOfOpenTag)); // then, parse for URIs this.numberOfLinksExtracted += ExtractorCSS.processStyleCode( curi, sequence.subSequence(endOfOpenTag,sequence.length()), getController()); } /* (non-Javadoc) * @see org.archive.crawler.framework.Processor#report() */ public String report() { StringBuffer ret = new StringBuffer(); ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n"); ret.append(" Function: Link extraction on HTML documents\n");
ret.append(" CrawlURIs handled: " + this.numberOfCURIsHandled + "\n"); ret.append(" Links extracted: " + this.numberOfLinksExtracted + "\n\n"); return ret.toString(); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -