📄 regexphtmllinkextractor.java
字号:
// CLASSID, DATA if (resources == null) { resources = new ArrayList<String>(); } resources.add(value.toString()); } else if (attr.start(8) > -1) { // ARCHIVE if (resources==null) { resources = new ArrayList<String>(); } String[] multi = TextUtils.split(WHITESPACE, value); for(int i = 0; i < multi.length; i++ ) { resources.add(multi[i]); } } else if (attr.start(9) > -1) { // CODE if (resources==null) { resources = new ArrayList<String>(); } // If element is applet and code value does not end with // '.class' then append '.class' to the code value. if (element.toString().toLowerCase().equals(APPLET) && !value.toString().toLowerCase().endsWith(CLASSEXT)) { resources.add(value.toString() + CLASSEXT); } else { resources.add(value.toString()); } } else if (attr.start(10) > -1) { // VALUE if(TextUtils.matches(LIKELY_URI_PATH, value)) { CharSequence context = Link.elementContext(element, attr.group(10)); processLink(value, context); } } else if (attr.start(11) > -1) { // any other attribute // ignore for now // could probe for path- or script-looking strings, but // those should be vanishingly rare in other attributes, // and/or symptomatic of page bugs } } TextUtils.recycleMatcher(attr); // handle codebase/resources if (resources == null) { return (tally-next.size())>0; } Iterator iter = resources.iterator(); UURI codebaseURI = null; String res = null; try { if (codebase != null) { // TODO: Pass in the charset. codebaseURI = UURIFactory.getInstance(base, codebase); } while(iter.hasNext()) { res = iter.next().toString(); // TODO: more HTML deescaping? res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP); if (codebaseURI != null) { res = codebaseURI.resolve(res).toString(); } processEmbed(res, element); // TODO: include attribute too } } catch (URIException e) { extractErrorListener.noteExtractError(e,source,codebase); } catch (IllegalArgumentException e) { DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" + "codebase=" + codebase + " res=" + res + "\n" + DevUtils.extraInfo(), e); } return (tally-next.size())>0; } /** * @param cs */ protected void processScriptCode(CharSequence cs) { RegexpJSLinkExtractor.extract(cs, source, base, next, extractErrorListener); } static final String JAVASCRIPT = "(?i)^javascript:.*"; /** * @param value * @param context */ protected void processLink(CharSequence value, CharSequence context) { String link = TextUtils.replaceAll(ESCAPED_AMP, value, "&"); if(TextUtils.matches(JAVASCRIPT, link)) { processScriptCode(value.subSequence(11, value.length())); } else { addLinkFromString(link, context,Link.NAVLINK_HOP); } } /** * @param uri * @param context */ private void addLinkFromString(String uri, CharSequence context, char hopType) { try { Link link = new Link(source, UURIFactory.getInstance( base, uri), context, hopType); next.addLast(link); } catch (URIException e) { extractErrorListener.noteExtractError(e,source,uri); } } protected long processEmbed(CharSequence value, CharSequence context) { String embed = TextUtils.replaceAll(ESCAPED_AMP, value, "&"); addLinkFromString(embed, context,Link.EMBED_HOP); return 1; } static final String NON_HTML_PATH_EXTENSION = "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+ "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)"; protected void processScript(CharSequence sequence, int endOfOpenTag) { // first, get attributes of script-open tag // as per any other tag processGeneralTag(sequence.subSequence(0,6), sequence.subSequence(0,endOfOpenTag)); // then, apply best-effort string-analysis heuristics // against any code present (false positives are OK) processScriptCode(sequence.subSequence(endOfOpenTag, sequence.length())); } protected void processMeta(CharSequence cs) { Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs); String name = null; String httpEquiv = null; String content = null; while (attr.find()) { int valueGroup = (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14; CharSequence value = cs.subSequence(attr.start(valueGroup), attr.end(valueGroup)); if (attr.group(1).equalsIgnoreCase("name")) { name = value.toString(); } else if (attr.group(1).equalsIgnoreCase("http-equiv")) { httpEquiv = value.toString(); } else if (attr.group(1).equalsIgnoreCase("content")) { content = value.toString(); } // TODO: handle other stuff } TextUtils.recycleMatcher(attr); // Look for the 'robots' meta-tag if("robots".equalsIgnoreCase(name) && content != null ) { if (getHonorRobots()) { String contentLower = content.toLowerCase(); if ((contentLower.indexOf("nofollow") >= 0 || contentLower.indexOf("none") >= 0)) { // if 'nofollow' or 'none' is specified and we // are honoring robots, end html extraction logger.fine("HTML extraction skipped due to robots meta-tag for: " + source); cancelFurtherExtraction(); return; } } } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) { String refreshUri = content.substring(content.indexOf("=") + 1); try { Link refreshLink = new Link(source, UURIFactory.getInstance(base,refreshUri), Link.elementContext("meta",httpEquiv),Link.REFER_HOP); next.addLast(refreshLink); } catch (URIException e) { extractErrorListener.noteExtractError(e,source,refreshUri); } } } /** * @return whether to honor internal robots directives (eg meta robots) */ private boolean getHonorRobots() { return honorRobots; } /** * Ensure no further Links are extracted (by setting matcher up to fail) */ private void cancelFurtherExtraction() { // java 1.5 only: // tags.region(tags.regionEnd(),tags.regionEnd()); tags.reset(""); } /** * @param sequence * @param endOfOpenTag */ protected void processStyle(CharSequence sequence, int endOfOpenTag) { // First, get attributes of script-open tag as per any other tag. processGeneralTag(sequence.subSequence(0,6), sequence.subSequence(0,endOfOpenTag)); // then, parse for URIs RegexpCSSLinkExtractor.extract(sequence.subSequence(endOfOpenTag, sequence.length()), source, base, next, extractErrorListener); } /** * Discard all state. Another setup() is required to use again. */ public void reset() { super.reset(); TextUtils.recycleMatcher(tags); tags = null; } protected static CharSequenceLinkExtractor newDefaultInstance() { return new RegexpHTMLLinkExtractor(); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -