extractorhtml.java

来自「这是个爬虫和Lucene相结合最好了」· Java 代码 · 共 755 行 · 第 1/3 页

JAVA
755
字号
            int start = attr.start(valueGroup);            int end = attr.end(valueGroup);            assert start >= 0: "Start is: " + start + ", " + curi;            assert end >= 0: "End is :" + end + ", " + curi;            CharSequence value = cs.subSequence(start, end);            value = TextUtils.unescapeHtml(value);            if (attr.start(2) > -1) {                // HREF                CharSequence context =                    Link.elementContext(element, attr.group(2));                if(elementStr.equalsIgnoreCase(LINK)) {                    // <LINK> elements treated as embeds (css, ico, etc)                    processEmbed(curi, value, context);                } else {                    // other HREFs treated as links                    processLink(curi, value, context);                }                if (elementStr.equalsIgnoreCase(BASE)) {                    try {                        curi.setBaseURI(value.toString());                    } catch (URIException e) {                        if (getController() != null) {                            // Controller can be null: e.g. when running                            // ExtractorTool.                            
getController().logUriError(e, curi.getUURI(),                                value.toString());                        } else {                            logger.info("Failed set base uri: " +                                curi + ", " + value.toString() + ": " +                                e.getMessage());                        }                    }                }            } else if (attr.start(3) > -1) {                // ACTION                if (!ignoreFormActions) {                    CharSequence context = Link.elementContext(element,                        attr.group(3));                    processLink(curi, value, context);                }            } else if (attr.start(4) > -1) {                // ON____                processScriptCode(curi, value); // TODO: context?            } else if (attr.start(5) > -1) {                // SRC etc.                CharSequence context = Link.elementContext(element,                    attr.group(5));                                // true, if we expect another HTML page instead of an image etc.                final char hopType;                                if(!framesAsEmbeds                    && (elementStr.equalsIgnoreCase(FRAME) || elementStr                        .equalsIgnoreCase(IFRAME))) {                    hopType = Link.NAVLINK_HOP;                } else {                    hopType = Link.EMBED_HOP;                }                processEmbed(curi, value, context, hopType);            } else if (attr.start(6) > -1) {                // CODEBASE                codebase = (value instanceof String)?                    
(String)value: value.toString();                CharSequence context = Link.elementContext(element,                    attr.group(6));                processEmbed(curi, codebase, context);            } else if (attr.start(7) > -1) {                // CLASSID, DATA                if (resources == null) {                    resources = new ArrayList<String>();                }                resources.add(value.toString());            } else if (attr.start(8) > -1) {                // ARCHIVE                if (resources==null) {                    resources = new ArrayList<String>();                }                String[] multi = TextUtils.split(WHITESPACE, value);                for(int i = 0; i < multi.length; i++ ) {                    resources.add(multi[i]);                }            } else if (attr.start(9) > -1) {                // CODE                if (resources==null) {                    resources = new ArrayList<String>();                }                // If element is applet and code value does not end with                // '.class' then append '.class' to the code value.                
if (elementStr.equalsIgnoreCase(APPLET) &&                        !value.toString().toLowerCase().endsWith(CLASSEXT)) {                    resources.add(value.toString() + CLASSEXT);                } else {                    resources.add(value.toString());                }            } else if (attr.start(10) > -1) {                // VALUE, with possibility of URI                if (overlyEagerLinkDetection                         && TextUtils.matches(LIKELY_URI_PATH, value)) {                    CharSequence context = Link.elementContext(element,                        attr.group(10));                    processLink(curi,value, context);                }            } else if (attr.start(11) > -1) {                // STYLE inline attribute                // then, parse for URIs                this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(                    curi, value, getController());                            } else if (attr.start(12) > -1) {                // any other attribute                // ignore for now                // could probe for path- or script-looking strings, but                // those should be vanishingly rare in other attributes,                // and/or symptomatic of page bugs            }        }        TextUtils.recycleMatcher(attr);        // handle codebase/resources        if (resources == null) {            return;        }        Iterator iter = resources.iterator();        UURI codebaseURI = null;        String res = null;        try {            if (codebase != null) {                // TODO: Pass in the charset.                codebaseURI = UURIFactory.                    
getInstance(curi.getUURI(), codebase);            }            while(iter.hasNext()) {                res = iter.next().toString();                res = (String) TextUtils.unescapeHtml(res);                if (codebaseURI != null) {                    res = codebaseURI.resolve(res).toString();                }                processEmbed(curi, res, element); // TODO: include attribute too            }        } catch (URIException e) {            curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);        } catch (IllegalArgumentException e) {            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +                "codebase=" + codebase + " res=" + res + "\n" +                DevUtils.extraInfo(), e);        }    }    /**     * Extract the (java)script source in the given CharSequence.      *      * @param curi source CrawlURI     * @param cs CharSequence of javascript code     */    protected void processScriptCode(CrawlURI curi, CharSequence cs) {        if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {            this.numberOfLinksExtracted +=                ExtractorJS.considerStrings(curi, cs, getController(), false);        } // else do nothing    }    static final String JAVASCRIPT = "(?i)^javascript:.*";    /**     * Handle generic HREF cases.     *      * @param curi     * @param value     * @param context     */    protected void processLink(CrawlURI curi, final CharSequence value,            CharSequence context) {        if (TextUtils.matches(JAVASCRIPT, value)) {            processScriptCode(curi, value. subSequence(11, value.length()));        } else {                if (logger.isLoggable(Level.FINEST)) {                logger.finest("link: " + value.toString() + " from " + curi);            }            addLinkFromString(curi,                (value instanceof String)?                    
(String)value: value.toString(),                context, Link.NAVLINK_HOP);            this.numberOfLinksExtracted++;        }    }    private void addLinkFromString(CrawlURI curi, String uri,            CharSequence context, char hopType) {        try {            // We do a 'toString' on context because its a sequence from            // the underlying ReplayCharSequence and the link its about            // to become a part of is expected to outlive the current            // ReplayCharSequence.            curi.createAndAddLinkRelativeToBase(uri, context.toString(),                hopType);        } catch (URIException e) {            if (getController() != null) {                getController().logUriError(e, curi.getUURI(), uri);            } else {                logger.info("Failed createAndAddLinkRelativeToBase " +                    curi + ", " + uri + ", " + context + ", " + hopType +                    ": " + e);            }        }    }    protected final void processEmbed(CrawlURI curi, CharSequence value,            CharSequence context) {        processEmbed(curi, value, context, Link.EMBED_HOP);    }    protected void processEmbed(CrawlURI curi, final CharSequence value,            CharSequence context, char hopType) {        if (logger.isLoggable(Level.FINEST)) {            logger.finest("embed (" + hopType + "): " + value.toString() +                " from " + curi);        }        addLinkFromString(curi,            (value instanceof String)?                (String)value: value.toString(),            context, hopType);        this.numberOfLinksExtracted++;    }    public void extract(CrawlURI curi) {        if (!isHttpTransactionContentToProcess(curi) ||                ! 
(isExpectedMimeType(curi.getContentType(), "text/html")                   || isExpectedMimeType(curi.getContentType(), "application/xhtml"))) {            return;        }        final boolean ignoreUnexpectedHTML =             ((Boolean)getUncheckedAttribute(curi,                  ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();                if (ignoreUnexpectedHTML) {            try {                if(!isHtmlExpectedHere(curi)) {                    // HTML was not expected (eg a GIF was expected) so ignore                    // (as if a soft 404)                    return;                }            } catch (URIException e) {                logger.severe("Failed expectedHTML test: " + e.getMessage());            }        }        this.numberOfCURIsHandled++;        ReplayCharSequence cs = null;        

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?