📄 extractorhtml.java

📁 高性能分词算法
💻 JAVA
📖 第 1 页 / 共 3 页
字号:
        final String elementStr = element.toString();        while (attr.find()) {            int valueGroup =                (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;            int start = attr.start(valueGroup);            int end = attr.end(valueGroup);            assert start >= 0: "Start is: " + start + ", " + curi;            assert end >= 0: "End is :" + end + ", " + curi;            CharSequence value = cs.subSequence(start, end);            value = TextUtils.unescapeHtml(value);            if (attr.start(2) > -1) {                // HREF                CharSequence context =                    Link.elementContext(element, attr.group(2));                if(elementStr.equalsIgnoreCase(LINK)) {                    // <LINK> elements treated as embeds (css, ico, etc)                    processEmbed(curi, value, context);                } else {                    // other HREFs treated as links                    processLink(curi, value, context);                }                if (elementStr.equalsIgnoreCase(BASE)) {                    try {                        curi.setBaseURI(value.toString());                    } catch (URIException e) {                        if (getController() != null) {                            // Controller can be null: e.g. when running                            // ExtractorTool.                            getController().logUriError(e, curi.getUURI(),                                value.toString());                        } else {                            logger.info("Failed set base uri: " +                                curi + ", " + value.toString() + ": " +                                e.getMessage());                        }                    }                }            } else if (attr.start(3) > -1) {                // ACTION                if (!ignoreFormActions) {                    action = value;                     actionContext = Link.elementContext(element,                        attr.group(3));                    // handling finished only at end (after METHOD also collected)                }            } else if (attr.start(4) > -1) {                // ON____                processScriptCode(curi, value); // TODO: context?            } else if (attr.start(5) > -1) {                // SRC etc.                CharSequence context = Link.elementContext(element,                    attr.group(5));                                // true, if we expect another HTML page instead of an image etc.                final char hopType;                                if(!framesAsEmbeds                    && (elementStr.equalsIgnoreCase(FRAME) || elementStr                        .equalsIgnoreCase(IFRAME))) {                    hopType = Link.NAVLINK_HOP;                } else {                    hopType = Link.EMBED_HOP;                }                processEmbed(curi, value, context, hopType);            } else if (attr.start(6) > -1) {                // CODEBASE                codebase = (value instanceof String)?                    (String)value: value.toString();                CharSequence context = Link.elementContext(element,                    attr.group(6));                processEmbed(curi, codebase, context);            } else if (attr.start(7) > -1) {                // CLASSID, DATA                if (resources == null) {                    resources = new ArrayList<String>();                }                resources.add(value.toString());            } else if (attr.start(8) > -1) {                // ARCHIVE                if (resources==null) {                    resources = new ArrayList<String>();                }                String[] multi = TextUtils.split(WHITESPACE, value);                for(int i = 0; i < multi.length; i++ ) {                    resources.add(multi[i]);                }            } else if (attr.start(9) > -1) {                // CODE                if (resources==null) {                    resources = new ArrayList<String>();                }                // If element is applet and code value does not end with                // '.class' then append '.class' to the code value.                if (elementStr.equalsIgnoreCase(APPLET) &&                        !value.toString().toLowerCase().endsWith(CLASSEXT)) {                    resources.add(value.toString() + CLASSEXT);                } else {                    resources.add(value.toString());                }            } else if (attr.start(10) > -1) {                // VALUE, with possibility of URI                if (extractValueAttributes                         && TextUtils.matches(LIKELY_URI_PATH, value)) {                    CharSequence context = Link.elementContext(element,                        attr.group(10));                    processLink(curi,value, context);                }            } else if (attr.start(11) > -1) {                // STYLE inline attribute                // then, parse for URIs                this.numberOfLinksExtracted += ExtractorCSS.processStyleCode(                    curi, value, getController());                            } else if (attr.start(12) > -1) {                // METHOD                method = value;                // form processing finished at end (after ACTION also collected)            } else if (attr.start(13) > -1) {                // any other attribute                // ignore for now                // could probe for path- or script-looking strings, but                // those should be vanishingly rare in other attributes,                // and/or symptomatic of page bugs            }        }        TextUtils.recycleMatcher(attr);        // finish handling codebase/resources now that all available        if (resources != null) {            Iterator iter = resources.iterator();            UURI codebaseURI = null;            String res = null;            try {                if (codebase != null) {                    // TODO: Pass in the charset.                    codebaseURI = UURIFactory.                        getInstance(curi.getUURI(), codebase);                }                while(iter.hasNext()) {                    res = iter.next().toString();                    res = (String) TextUtils.unescapeHtml(res);                    if (codebaseURI != null) {                        res = codebaseURI.resolve(res).toString();                    }                    processEmbed(curi, res, element); // TODO: include attribute too                }            } catch (URIException e) {                curi.addLocalizedError(getName(), e, "BAD CODEBASE " + codebase);            } catch (IllegalArgumentException e) {                DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +                    "codebase=" + codebase + " res=" + res + "\n" +                    DevUtils.extraInfo(), e);            }        }                // finish handling form action, now method is available        if(action != null) {            if(method == null || "GET".equalsIgnoreCase(method.toString())                     || ! ((Boolean)getUncheckedAttribute(curi,                            ATTR_EXTRACT_ONLY_FORM_GETS)).booleanValue()) {                processLink(curi, action, actionContext);            }        }    }    /**     * Extract the (java)script source in the given CharSequence.      *      * @param curi source CrawlURI     * @param cs CharSequence of javascript code     */    protected void processScriptCode(CrawlURI curi, CharSequence cs) {        if((Boolean)getUncheckedAttribute(curi, ATTR_EXTRACT_JAVASCRIPT)) {            this.numberOfLinksExtracted +=                ExtractorJS.considerStrings(curi, cs, getController(), false);        } // else do nothing    }    static final String JAVASCRIPT = "(?i)^javascript:.*";    /**     * Handle generic HREF cases.     *      * @param curi     * @param value     * @param context     */    protected void processLink(CrawlURI curi, final CharSequence value,            CharSequence context) {        if (TextUtils.matches(JAVASCRIPT, value)) {            processScriptCode(curi, value. subSequence(11, value.length()));        } else {                if (logger.isLoggable(Level.FINEST)) {                logger.finest("link: " + value.toString() + " from " + curi);            }            addLinkFromString(curi,                (value instanceof String)?                    (String)value: value.toString(),                context, Link.NAVLINK_HOP);            this.numberOfLinksExtracted++;        }    }    private void addLinkFromString(CrawlURI curi, String uri,            CharSequence context, char hopType) {        try {            // We do a 'toString' on context because its a sequence from            // the underlying ReplayCharSequence and the link its about            // to become a part of is expected to outlive the current            // ReplayCharSequence.            curi.createAndAddLinkRelativeToBase(uri, context.toString(),                hopType);        } catch (URIException e) {            if (getController() != null) {                getController().logUriError(e, curi.getUURI(), uri);            } else {                logger.info("Failed createAndAddLinkRelativeToBase " +                    curi + ", " + uri + ", " + context + ", " + hopType +                    ": " + e);            }        }    }    protected final void processEmbed(CrawlURI curi, CharSequence value,            CharSequence context) {        processEmbed(curi, value, context, Link.EMBED_HOP);    }    protected void processEmbed(CrawlURI curi, final CharSequence value,            CharSequence context, char hopType) {        if (logger.isLoggable(Level.FINEST)) {            logger.finest("embed (" + hopType + "): " + value.toString() +                " from " + curi);        }        addLinkFromString(curi,            (value instanceof String)?                (String)value: value.toString(),            context, hopType);        this.numberOfLinksExtracted++;    }    public void extract(CrawlURI curi) {        if (!isHttpTransactionContentToProcess(curi) ||                ! (isExpectedMimeType(curi.getContentType(), "text/html")                   || isExpectedMimeType(curi.getContentType(), "application/xhtml")                   || isExpectedMimeType(curi.getContentType(), "text/vnd.wap.wml")                   || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.wml")                   || isExpectedMimeType(curi.getContentType(), "application/vnd.wap.xhtml"))) {            return;        }        final boolean ignoreUnexpectedHTML =             ((Boolean)getUncheckedAttribute(curi,                  ATTR_IGNORE_UNEXPECTED_HTML)).booleanValue();                if (ignoreUnexpectedHTML) {            try {                if(!isHtmlExpectedHere(curi)) {                    // HTML was not expected (eg a GIF was expected) so ignore                    // (as if a soft 404)                    return;                }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -