jerichoextractorhtml.java

来自「Heritrix是一个开源,可扩展的web爬虫项目。Heritrix设计成严格按照robots.txt文件的排除指示和META robots标签。」· Java 代码 · 共 464 行 · 第 1/2 页
JAVA
464 行
                resources = new ArrayList<String>();            // If element is applet and code value does not end with            // '.class' then append '.class' to the code value.            if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {                resources.add(attrValue + CLASSEXT);            } else {                resources.add(attrValue);            }        }        // VALUE        else if (((attr = attributes.get("value")) != null) &&                 ((attrValue = attr.getValue()) != null)) {            if (TextUtils.matches(LIKELY_URI_PATH, attrValue)                    && overlyEagerLinkDetection) {                CharSequence context = Link.elementContext(elementName, attr                        .getKey());                processLink(curi, attrValue, context);            }        }        // STYLE        else if (((attr = attributes.get("style")) != null) &&                 ((attrValue = attr.getValue()) != null)) {            // STYLE inline attribute            // then, parse for URIs            this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,                    attrValue, getController());        }        // handle codebase/resources        if (resources == null)            return;        Iterator<String> iter = resources.iterator();        UURI codebaseURI = null;        String res = null;        try {            if (codebase != null) {                // TODO: Pass in the charset.                
codebaseURI = UURIFactory.getInstance(curi.getUURI(), codebase);            }            while (iter.hasNext()) {                res = iter.next();                res = StringEscapeUtils.unescapeHtml(res);                if (codebaseURI != null) {                    res = codebaseURI.resolve(res).toString();                }                processEmbed(curi, res, element); // TODO: include attribute                                                    // too            }        } catch (URIException e) {            curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);        } catch (IllegalArgumentException e) {            DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n"                    + "codebase=" + codebase + " res=" + res + "\n"                    + DevUtils.extraInfo(), e);        }    }    protected boolean processMeta(CrawlURI curi, Element element) {        String name = element.getAttributeValue("name");        String httpEquiv = element.getAttributeValue("http-equiv");        String content = element.getAttributeValue("content");        if ("robots".equals(name) && content != null) {            curi.putString(A_META_ROBOTS, content);            RobotsHonoringPolicy policy = getSettingsHandler().getOrder()                    .getRobotsHonoringPolicy();            String contentLower = content.toLowerCase();            if ((policy == null || (!policy.isType(curi,                    RobotsHonoringPolicy.IGNORE) && !policy.isType(curi,                    RobotsHonoringPolicy.CUSTOM)))                    && (contentLower.indexOf("nofollow") >= 0 || contentLower                            .indexOf("none") >= 0)) {                // if 'nofollow' or 'none' is specified and the                // honoring policy is not IGNORE or CUSTOM, end html extraction                logger.fine("HTML extraction skipped due to robots meta-tag " +                    "for: " + curi.toString());                return true;            }        }        if 
("refresh".equals(httpEquiv) && content != null) {            String refreshUri = content.substring(content.indexOf("=") + 1);            try {                curi.createAndAddLinkRelativeToBase(refreshUri, "meta",                        Link.REFER_HOP);            } catch (URIException e) {                if (getController() != null) {                    getController().logUriError(e, curi.getUURI(), refreshUri);                } else {                    logger.info("Failed createAndAddLinkRelativeToBase " + curi                            + ", " + element.toString() + ", " + refreshUri                            + ": " + e);                }            }        }        return false;    }    protected void processScript(CrawlURI curi, Element element) {        // first, get attributes of script-open tag        // as per any other tag        processGeneralTag(curi, element, element.getAttributes());        // then, apply best-effort string-analysis heuristics        // against any code present (false positives are OK)        processScriptCode(curi, element.getContent());    }    protected void processStyle(CrawlURI curi, Element element) {        // First, get attributes of script-open tag as per any other tag.        
processGeneralTag(curi, element, element.getAttributes());        // then, parse for URIs        this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(curi,                element.getContent(), getController());    }    protected void processForm(CrawlURI curi, Element element) {        String action = element.getAttributeValue("action");        String name = element.getAttributeValue("name");        String queryURL = "";        final boolean ignoreFormActions = ((Boolean) getUncheckedAttribute(                curi, ATTR_IGNORE_FORM_ACTION_URLS)).booleanValue();        if (ignoreFormActions)            return;        numberOfFormsProcessed++;        // get all form fields        FormFields formFields = element.findFormFields();        for (Iterator fieldsIter = formFields.iterator(); fieldsIter.hasNext();) {            // for each form field            FormField formField = (FormField) fieldsIter.next();            // for each form control            for (Iterator controlIter = formField.getFormControls().iterator();                controlIter.hasNext();) {                FormControl formControl = (FormControl) controlIter.next();                // get name of control element (and URLEncode it)                String controlName = formControl.getName();                // retrieve list of values - submit needs special handling                Collection controlValues;                if (!(formControl.getFormControlType() ==                        FormControlType.SUBMIT)) {                    controlValues = formControl.getValues();                } else {                    controlValues = formControl.getPredefinedValues();                }                if (controlValues.size() > 0) {                    // for each value set                    for (Iterator valueIter = controlValues.iterator();                            valueIter.hasNext();) {                        String value = (String) valueIter.next();                        queryURL += "&" + 
controlName + "=" + value;                    }                } else {                    queryURL += "&" + controlName + "=";                }            }        }        // clean up url        if (action == null) {            queryURL = queryURL.replaceFirst("&", "?");        } else {            if (!action.contains("?"))                queryURL = queryURL.replaceFirst("&", "?");            queryURL = action + queryURL;        }        CharSequence context = Link.elementContext(element.getName(),            "name=" + name);        processLink(curi, queryURL, context);    }    /**     * Run extractor. This method is package visible to ease testing.     *      * @param curi     *            CrawlURI we're processing.     * @param cs     *            Sequence from underlying ReplayCharSequence.     */    void extract(CrawlURI curi, CharSequence cs) {        Source source = new Source(cs);        List elements = source.findAllElements(StartTagType.NORMAL);        for (Iterator elementIter = elements.iterator();                elementIter.hasNext();) {            Element element = (Element) elementIter.next();            String elementName = element.getName();            Attributes attributes;            if (elementName.equals(HTMLElementName.META)) {                if (processMeta(curi, element)) {                    // meta tag included NOFOLLOW; abort processing                    break;                }            } else if (elementName.equals(HTMLElementName.SCRIPT)) {                processScript(curi, element);            } else if (elementName.equals(HTMLElementName.STYLE)) {                processStyle(curi, element);            } else if (elementName.equals(HTMLElementName.FORM)) {                processForm(curi, element);            } else if (!(attributes = element.getAttributes()).isEmpty()) {                processGeneralTag(curi, element, attributes);            }        }    }    /*     * (non-Javadoc)     *      * @see 
org.archive.crawler.framework.Processor#report()
     */
    // Builds a plain-text summary of this processor: a fixed header followed
    // by the CrawlURIs-handled, forms-processed and links-extracted counters.
    public String report() {
        StringBuilder summary = new StringBuilder();
        summary.append("Processor: org.archive.crawler.extractor.JerichoExtractorHTML\n")
                .append("  Function:          Link extraction on HTML documents\n")
                .append("  CrawlURIs handled: ")
                .append(this.numberOfCURIsHandled)
                .append("\n")
                .append("  Forms processed:   ")
                .append(this.numberOfFormsProcessed)
                .append("\n")
                .append("  Links extracted:   ")
                .append(this.numberOfLinksExtracted)
                .append("\n\n");
        return summary.toString();
    }
}
jerichoextractorhtml.java - 源码说明

本页面展示了「Heritrix是一个开源,可扩展的web爬虫项目。Heritrix设计成严格按照robots.txt文件的排除指示和META robots标签。」中的 jerichoextractorhtml.java 源码文件，采用 Java 编程语言编写，共 464 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与Heritrix相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?