📄 extractorhtml.java

📁 最强的爬虫工程
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
                }                processEmbed(curi, res, element); // TODO: include attribute too            }        } catch (URIException e) {            curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);        } catch (IllegalArgumentException e) {            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +                "codebase=" + codebase + " res=" + res + "\n" +                DevUtils.extraInfo(), e);        }    }    // finds strings in javascript likely to be URIs/paths    // guessing based on '.' in string, so if highly likely to    // get gifs/etc, unable to get many other paths    // will find false positives    // TODO: add '/' check, suppress strings being concatenated via '+'?    static final String JAVASCRIPT_LIKELY_URI_EXTRACTOR =     "(\\\\{0,8}+\"|\\\\{0,8}+\')(\\.{0,2}[^+\\.\\n\\r\\s\"\']+[^\\.\\n\\r\\s\"\']*(\\.[^\\.\\n\\r\\s\"\']+)+)(\\1)";    /**     * @param curi     * @param cs     */    protected void processScriptCode(CrawlURI curi, CharSequence cs) {        this.numberOfLinksExtracted +=            ExtractorJS.considerStrings(curi, cs, getController(), false);    }    static final String JAVASCRIPT = "(?i)^javascript:.*";    /**     * Handle generic HREF cases.     *      * @param curi     * @param value     * @param context     */    protected void processLink(CrawlURI curi, final CharSequence value,            CharSequence context) {        if (TextUtils.matches(JAVASCRIPT, value)) {            processScriptCode(curi, value. subSequence(11, value.length()));        } else {                if (logger.isLoggable(Level.FINEST)) {                logger.finest("link: " + value.toString() + " from " + curi);            }            addLinkFromString(curi,                (value instanceof String)?                    (String)value: value.toString(),                context, Link.NAVLINK_HOP);            this.numberOfLinksExtracted++;        }    }    private void addLinkFromString(CrawlURI curi, String uri,            CharSequence context, char hopType) {        try {            // We do a 'toString' on context because its a sequence from            // the underlying ReplayCharSequence and the link its about            // to become a part of is expected to outlive the current            // ReplayCharSequence.            curi.createAndAddLinkRelativeToBase(uri, context.toString(),                hopType);        } catch (URIException e) {            if (getController() != null) {                getController().logUriError(e, curi.getUURI(), uri);            } else {                logger.info("Failed createAndAddLinkRelativeToBase " +                    curi + ", " + uri + ", " + context + ", " + hopType +                    ": " + e);            }        }    }    protected final void processEmbed(CrawlURI curi, CharSequence value,            CharSequence context) {        processEmbed(curi, value, context, Link.EMBED_HOP);    }    protected void processEmbed(CrawlURI curi, final CharSequence value,            CharSequence context, char hopType) {        if (logger.isLoggable(Level.FINEST)) {            logger.finest("embed (" + hopType + "): " + value.toString() +                " from " + curi);        }        addLinkFromString(curi,            (value instanceof String)?                (String)value: value.toString(),            context, hopType);        this.numberOfLinksExtracted++;    }    public void extract(CrawlURI curi) {        if (!isHttpTransactionContentToProcess(curi) ||                ! (isExpectedMimeType(curi.getContentType(), "text/html")                   || isExpectedMimeType(curi.getContentType(), "application/xhtml"))) {            return;        }        final boolean ignoreUnexpectedHTML =             ((Boolean)getUncheckedAttribute(curi,                  ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();                if (ignoreUnexpectedHTML) {            try {                if(!isHtmlExpectedHere(curi)) {                    // HTML was not expected (eg a GIF was expected) so ignore                    // (as if a soft 404)                    return;                }            } catch (URIException e) {                logger.severe("Failed expectedHTML test: " + e.getMessage());            }        }        this.numberOfCURIsHandled++;        ReplayCharSequence cs = null;                try {           HttpRecorder hr = curi.getHttpRecorder();           if (hr == null) {               throw new IOException("Why is recorder null here?");           }           cs = hr.getReplayCharSequence();        } catch (IOException e) {            curi.addLocalizedError(this.getName(), e,                "Failed get of replay char sequence " + curi.toString() +                    " " + e.getMessage());            logger.log(Level.SEVERE,"Failed get of replay char sequence in " +                Thread.currentThread().getName(), e);        }                if (cs == null) {            return;        }        // We have a ReplayCharSequence open.  Wrap all in finally so we        // for sure close it before we leave.        try {            // Extract all links from the charsequence            extract(curi, cs);            // Set flag to indicate that link extraction is completed.            curi.linkExtractorFinished();        } finally {            if (cs != null) {                try {                    cs.close();                } catch (IOException ioe) {                    logger.warning(TextUtils.exceptionToString(                        "Failed close of ReplayCharSequence.", ioe));                }            }        }    }    /**     * Run extractor.     * This method is package visible to ease testing.     * @param curi CrawlURI we're processing.     * @param cs Sequence from underlying ReplayCharSequence. This     * is TRANSIENT data. Make a copy if you want the data to live outside     * of this extractors' lifetime.     */    void extract(CrawlURI curi, CharSequence cs) {        Matcher tags = TextUtils.getMatcher(RELEVANT_TAG_EXTRACTOR, cs);        while(tags.find()) {            if(Thread.interrupted()){                break;            }            if (tags.start(8) > 0) {                // comment match                // for now do nothing            } else if (tags.start(7) > 0) {                // <meta> match                int start = tags.start(5);                int end = tags.end(5);                assert start >= 0: "Start is: " + start + ", " + curi;                assert end >= 0: "End is :" + end + ", " + curi;                if (processMeta(curi,                    cs.subSequence(start, end))) {                    // meta tag included NOFOLLOW; abort processing                    break;                }            } else if (tags.start(5) > 0) {                // generic <whatever> match                int start5 = tags.start(5);                int end5 = tags.end(5);                assert start5 >= 0: "Start is: " + start5 + ", " + curi;                assert end5 >= 0: "End is :" + end5 + ", " + curi;                int start6 = tags.start(6);                int end6 = tags.end(6);                assert start6 >= 0: "Start is: " + start6 + ", " + curi;                assert end6 >= 0: "End is :" + end6 + ", " + curi;                processGeneralTag(curi,                    cs.subSequence(start6, end6),                    cs.subSequence(start5, end5));            } else if (tags.start(1) > 0) {                // <script> match                int start = tags.start(1);                int end = tags.end(1);                assert start >= 0: "Start is: " + start + ", " + curi;                assert end >= 0: "End is :" + end + ", " + curi;                assert tags.end(2) >= 0: "Tags.end(2) illegal " + tags.end(2) +                    ", " + curi;                processScript(curi, cs.subSequence(start, end),                    tags.end(2) - start);            } else if (tags.start(3) > 0){                // <style... match                int start = tags.start(3);                int end = tags.end(3);                assert start >= 0: "Start is: " + start + ", " + curi;                assert end >= 0: "End is :" + end + ", " + curi;                assert tags.end(4) >= 0: "Tags.end(4) illegal " + tags.end(4) +                    ", " + curi;                processStyle(curi, cs.subSequence(start, end),                    tags.end(4) - start);            }        }        TextUtils.recycleMatcher(tags);    }    static final String NON_HTML_PATH_EXTENSION =        "(?i)(gif)|(jp(e)?g)|(png)|(tif(f)?)|(bmp)|(avi)|(mov)|(mp(e)?g)"+        "|(mp3)|(mp4)|(swf)|(wav)|(au)|(aiff)|(mid)";    /**     * Test whether this HTML is so unexpected (eg in place of a GIF URI)     * that it shouldn't be scanned for links.     *     * @param curi CrawlURI to examine.     * @return True if HTML is acceptable/expected here     * @throws URIException     */    protected boolean isHtmlExpectedHere(CrawlURI curi) throws URIException {        String path = curi.getUURI().getPath();        if(path==null) {            // no path extension, HTML is fine            return true;        }        int dot = path.lastIndexOf('.');        if (dot < 0) {            // no path extension, HTML is fine            return true;        }        if(dot<(path.length()-5)) {            // extension too long to recognize, HTML is fine            return true;        }        String ext = path.substring(dot+1);        return ! TextUtils.matches(NON_HTML_PATH_EXTENSION, ext);    }    protected void processScript(CrawlURI curi, CharSequence sequence,            int endOfOpenTag) {        // for now, do nothing        // TODO: best effort extraction of strings        // first, get attributes of script-open tag        // as per any other tag        processGeneralTag(curi,sequence.subSequence(0,6),            sequence.subSequence(0,endOfOpenTag));        // then, apply best-effort string-analysis heuristics        // against any code present (false positives are OK)        processScriptCode(            curi, sequence.subSequence(endOfOpenTag, sequence.length()));    }    /**     * Process metadata tags.     * @param curi CrawlURI we're processing.     * @param cs Sequence from underlying ReplayCharSequence. This     * is TRANSIENT data. Make a copy if you want the data to live outside     * of this extractors' lifetime.     * @return True robots exclusion metatag.     */    protected boolean processMeta(CrawlURI curi, CharSequence cs) {        Matcher attr = TextUtils.getMatcher(EACH_ATTRIBUTE_EXTRACTOR, cs);        String name = null;        String httpEquiv = null;        String content = null;        while (attr.find()) {            int valueGroup =                (attr.start(13) > -1) ? 13 : (attr.start(14) > -1) ? 14 : 15;            CharSequence value =                cs.subSequence(attr.start(valueGroup), attr.end(valueGroup));            if (attr.group(1).equalsIgnoreCase("name")) {                name = value.toString();            } else if (attr.group(1).equalsIgnoreCase("http-equiv")) {                httpEquiv = value.toString();            } else if (attr.group(1).equalsIgnoreCase("content")) {                content = value.toString();            }            // TODO: handle other stuff        }        TextUtils.recycleMatcher(attr);        // Look for the 'robots' meta-tag        if("robots".equalsIgnoreCase(name) && content != null ) {            curi.putString(A_META_ROBOTS, content);            RobotsHonoringPolicy policy =                getSettingsHandler().getOrder().getRobotsHonoringPolicy();            String contentLower = content.toLowerCase();            if ((policy == null                || (!policy.isType(curi, RobotsHonoringPolicy.IGNORE)                    && !policy.isType(curi, RobotsHonoringPolicy.CUSTOM)))                && (contentLower.indexOf("nofollow") >= 0                    || contentLower.indexOf("none") >= 0)) {                // if 'nofollow' or 'none' is specified and the                // honoring policy is not IGNORE or CUSTOM, end html extraction                logger.fine("HTML extraction skipped due to robots meta-tag for: "                                + curi.toString());                return true;            }        } else if ("refresh".equalsIgnoreCase(httpEquiv) && content != null) {            String refreshUri = content.substring(content.indexOf("=") + 1);            try {                curi.createAndAddLinkRelativeToBase(refreshUri, "meta",                    Link.REFER_HOP);            } catch (URIException e) {                if (getController() != null) {                    getController().logUriError(e, curi.getUURI(), refreshUri);                } else {                    logger.info("Failed createAndAddLinkRelativeToBase " +                        curi + ", " + cs + ", " + refreshUri + ": " + e);                }            }        }        return false;    }    /**     * Process style text.     * @param curi CrawlURI we're processing.     * @param sequence Sequence from underlying ReplayCharSequence. This     * is TRANSIENT data. Make a copy if you want the data to live outside     * of this extractors' lifetime.     * @param endOfOpenTag     */    protected void processStyle(CrawlURI curi, CharSequence sequence,            int endOfOpenTag) {        // First, get attributes of script-open tag as per any other tag.        processGeneralTag(curi, sequence.subSequence(0,6),            sequence.subSequence(0,endOfOpenTag));        // then, parse for URIs        this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(            curi, sequence.subSequence(endOfOpenTag,sequence.length()),                getController());    }        /* (non-Javadoc)     * @see org.archive.crawler.framework.Processor#report()     */    public String report() {        StringBuffer ret = new StringBuffer();        ret.append("Processor: org.archive.crawler.extractor.ExtractorHTML\n");        ret.append("  Function:          Link extraction on HTML documents\n");
        ret.append("  CrawlURIs handled: " + this.numberOfCURIsHandled + "\n");        ret.append("  Links extracted:   " + this.numberOfLinksExtracted +            "\n\n");        return ret.toString();    }}
上一页 12
💿 文件大小 18588 K
👤 上传用户 bonylee_java
📂 所属分类 Jsp/Servlet
🏷️ 相关标签

#工程
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -