📄 regexphtmllinkextractor.java

📁 Heritrix是一个开源,可扩展的web爬虫项目。Heritrix设计成严格按照robots.txt文件的排除指示和META robots标签。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
                // CLASSID, DATA
                if (resources == null) {
                    resources = new ArrayList<String>();
                }
                resources.add(value.toString());
            } else if (attr.start(8) > -1) {
                // ARCHIVE
                if (resources==null) {
                    resources = new ArrayList<String>();
                }
                String[] multi = TextUtils.split(WHITESPACE, value);
                for(int i = 0; i < multi.length; i++ ) {
                    resources.add(multi[i]);
                }
            } else if (attr.start(9) > -1) {
                // CODE
                if (resources==null) {
                    resources = new ArrayList<String>();
                }
                // If element is applet and code value does not end with
                // '.class' then append '.class' to the code value.
                if (element.toString().toLowerCase().equals(APPLET) &&
                        !value.toString().toLowerCase().endsWith(CLASSEXT)) {
                    resources.add(value.toString() + CLASSEXT);
                } else {
                    resources.add(value.toString());
                }
            } else if (attr.start(10) > -1) {
                // VALUE
                if(TextUtils.matches(LIKELY_URI_PATH, value)) {
                    CharSequence context = Link.elementContext(element, attr.group(10));
                    processLink(value, context);
                }
            } else if (attr.start(11) > -1) {
                // any other attribute
                // ignore for now
                // could probe for path- or script-looking strings, but
                // those should be vanishingly rare in other attributes,
                // and/or symptomatic of page bugs
            }
        }
        TextUtils.recycleMatcher(attr);

        // handle codebase/resources
        if (resources == null) {
            return (tally-next.size())>0;
        }
        Iterator iter = resources.iterator();
        UURI codebaseURI = null;
        String res = null;
        try {
            if (codebase != null) {
                // TODO: Pass in the charset.
                codebaseURI = UURIFactory.getInstance(base, codebase);
            }
            while(iter.hasNext()) {
                res = iter.next().toString();
                // TODO: more HTML deescaping?
                res = TextUtils.replaceAll(ESCAPED_AMP, res, AMP);
                if (codebaseURI != null) {
                    res = codebaseURI.resolve(res).toString();
                }
                processEmbed(res, element); // TODO: include attribute too
            }
        } catch (URIException e) {
            extractErrorListener.noteExtractError(e,source,codebase);
        } catch (IllegalArgumentException e) {
            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
                "codebase=" + codebase + " res=" + res + "\n" +
                DevUtils.extraInfo(), e);
        }
        return (tally-next.size())>0;
    }

    /**
     * Hands a piece of script text to {@link RegexpJSLinkExtractor}, which
     * receives this extractor's source, base, pending-link list and error
     * listener.
     *
     * @param cs script text to scan
     */
    protected void processScriptCode(CharSequence cs) {
        RegexpJSLinkExtractor.extract(cs, source, base, next,
                extractErrorListener);
    }

    /** Pattern (case-insensitive) for values that begin with "javascript:". */
    static final String JAVASCRIPT = "(?i)^javascript:.*";

    /**
     * Handles an attribute value that is expected to hold a link.
     *
     * Escaped ampersands are replaced with "&amp;" unescaped. If the result
     * matches {@link #JAVASCRIPT}, the remainder of the value after the
     * scheme prefix is scanned as script code; otherwise the unescaped
     * string is queued as a navigation link.
     *
     * @param value raw attribute value
     * @param context context recorded on the resulting Link
     */
    protected void processLink(CharSequence value, CharSequence context) {
        String link = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
        if(TextUtils.matches(JAVASCRIPT, link)) {
            // 11 == "javascript:".length(); skip the scheme prefix.
            processScriptCode(value.subSequence(11, value.length()));
        } else {
            addLinkFromString(link, context,Link.NAVLINK_HOP);
        }
    }

    /**
     * Builds a Link from the given URI string, resolved against the current
     * base, and appends it to the pending-link list. A URIException is
     * reported to the extract-error listener instead of being propagated.
     *
     * @param uri URI string to resolve against the base
     * @param context context recorded on the resulting Link
     * @param hopType hop type recorded on the resulting Link
     */
    private void addLinkFromString(String uri, CharSequence context, char hopType) {
        try {
            Link link = new Link(source, UURIFactory.getInstance(
                    base, uri), context, hopType);
            next.addLast(link);
        } catch (URIException e) {
            extractErrorListener.noteExtractError(e,source,uri);
        }
    }

    /**
     * Queues an embedded-resource link after unescaping ampersands in the
     * value.
     *
     * @param value raw URI text of the embedded resource
     * @param context context recorded on the resulting Link
     * @return always 1
     */
    protected long processEmbed(CharSequence value, CharSequence context) {
        String embed = TextUtils.replaceAll(ESCAPED_AMP, value, "&");
        addLinkFromString(embed, context,Link.EMBED_HOP);
        return 1;
    }

    /** Case-insensitive alternation of common media file extensions. */
    static final String NON_HTML_PATH_EXTENSION =
        "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+
        "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";

    /**
     * Processes a script element: first its open tag as a general tag, then
     * everything after the open tag as script code.
     *
     * @param sequence text of the script element; its first 6 characters
     *        are passed on as the element name
     * @param endOfOpenTag index in sequence where the open tag ends
     */
    protected void processScript(CharSequence sequence, int endOfOpenTag) {
        // first, get attributes of script-open tag
        // as per any other tag
        processGeneralTag(sequence.subSequence(0,6),
            sequence.subSequence(0,endOfOpenTag));
        // then, apply best-effort string-analysis heuristics
        // against any code present (false positives are OK)
        processScriptCode(sequence.subSequence(endOfOpenTag, sequence.length()));
    }

    /**
     * Processes a meta tag. Collects its name, http-equiv and content
     * attributes, then either honors a robots directive or queues the
     * target of a refresh.
     *
     * @param cs text of the meta tag to scan for attributes
     */
    protected void processMeta(CharSequence cs) {
        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);
        String name = null;
        String httpEquiv = null;
        String content = null;
        while (attr.find()) {
            // The value is held by whichever of groups 12/13 participated
            // in the match, falling back to group 14.
            int valueGroup =
                (attr.start(12) > -1) ? 12 : (attr.start(13) > -1) ? 13 : 14;
            CharSequence value =
                cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));
            if (attr.group(1).equalsIgnoreCase("name")) {
                name = value.toString();
            } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {
                httpEquiv = value.toString();
            } else if (attr.group(1).equalsIgnoreCase("content")) {
                content = value.toString();
            }
            // TODO: handle other stuff
        }
        TextUtils.recycleMatcher(attr);

        // Look for the 'robots' meta-tag
        if("robots".equalsIgnoreCase(name) && content != null ) {
            if (getHonorRobots())  {
                String contentLower = content.toLowerCase();
                if ((contentLower.indexOf("nofollow") >= 0
                        || contentLower.indexOf("none") >= 0)) {
                    // if 'nofollow' or 'none' is specified and we
                    // are honoring robots, end html extraction
                    logger.fine("HTML extraction skipped due to robots meta-tag for: "
                                    + source);
                    cancelFurtherExtraction();
                    return;
                }
            }
        } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {
            // The refresh target is whatever follows the first '='. Content
            // without any '=' (e.g. a bare delay such as "5") names no URI,
            // so no link is queued for it; without this guard indexOf's -1
            // would make substring(0) treat the whole content as a URI.
            int urlIndex = content.indexOf("=") + 1;
            if (urlIndex > 0) {
                String refreshUri = content.substring(urlIndex);
                try {
                    Link refreshLink = new Link(source,
                            UURIFactory.getInstance(base,refreshUri),
                            Link.elementContext("meta",httpEquiv),
                            Link.REFER_HOP);
                    next.addLast(refreshLink);
                } catch (URIException e) {
                    extractErrorListener.noteExtractError(e,source,refreshUri);
                }
            }
        }
    }

    /**
     * @return whether to honor internal robots directives (eg meta robots)
     */
    private boolean getHonorRobots() {
        return honorRobots;
    }

    /**
     * Ensure no further Links are extracted (by setting matcher up to fail)
     */
    private void cancelFurtherExtraction() {
        // java 1.5 only:
        // tags.region(tags.regionEnd(),tags.regionEnd());
        tags.reset("");
    }

    /**
     * Processes a style element: first its open tag as a general tag, then
     * everything after the open tag is scanned by
     * {@link RegexpCSSLinkExtractor}.
     *
     * @param sequence text of the style element
     * @param endOfOpenTag index in sequence where the open tag ends
     */
    protected void processStyle(CharSequence sequence,
            int endOfOpenTag)
    {
        // First, get attributes of style-open tag as per any other tag.
        // NOTE(review): 6 is the length of "script"; "style" has only 5
        // characters, so the element name passed here presumably carries one
        // trailing character -- confirm against the caller before changing.
        processGeneralTag(sequence.subSequence(0,6),
            sequence.subSequence(0,endOfOpenTag));

        // then, parse for URIs
        RegexpCSSLinkExtractor.extract(sequence.subSequence(endOfOpenTag,
                sequence.length()), source, base, next, extractErrorListener);
    }

    /**
     * Discard all state. Another setup() is required to use again.
     */
    public void reset() {
        super.reset();
        TextUtils.recycleMatcher(tags);
        tags = null;
    }

    /**
     * @return a new RegexpHTMLLinkExtractor
     */
    protected static CharSequenceLinkExtractor newDefaultInstance() {
        return new RegexpHTMLLinkExtractor();
    }
}
上一页 12
💿 文件大小 10016 K
👤 上传用户 qqpp2q
📂 所属分类数值算法/人工智能
🏷️ 相关标签

#Heritrix #robots #META #web
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -