⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetchhttp.java

📁 这是个爬虫和lucene相结合最好了
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
            "Disable cookie-handling.", DEFAULT_IGNORE_COOKIES));        e.setOverrideable(true);        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_BDB_COOKIES,                "Store cookies in BDB-backed map.", DEFAULT_BDB_COOKIES));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_LOAD_COOKIES,            "File to preload cookies from", ""));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SAVE_COOKIES,            "When crawl finishes save cookies to this file", ""));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_TRUST,            "SSL certificate trust level.  Range is from the default 'open'"            + " (trust all certs including expired, selfsigned, and those for"            + " which we do not have a CA) through 'loose' (trust all valid"            + " certificates including selfsigned), 'normal' (all valid"            + " certificates not including selfsigned) to 'strict' (Cert is"            + " valid and DN must match servername)",            ConfigurableX509TrustManager.DEFAULT,            ConfigurableX509TrustManager.LEVELS_AS_ARRAY));        e.setOverrideable(false);        e.setExpertSetting(true);        e = addElementToDefinition(new StringList(ATTR_ACCEPT_HEADERS,            "Accept Headers to include in each request. 
Each must be the"            + " complete header, e.g., 'Accept-Language: en'"));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_HOST,            "Proxy host IP (set only if needed).", ""));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_HTTP_PROXY_PORT,            "Proxy port (set only if needed)", ""));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_DEFAULT_ENCODING,            "The character encoding to use for files that do not have one" +            " specified in the HTTP response headers.  Default: " +            DEFAULT_CONTENT_CHARSET + ".",            DEFAULT_CONTENT_CHARSET));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_DIGEST_CONTENT,                "Whether or not to perform an on-the-fly digest hash of" +                " retrieved content-bodies.",                DEFAULT_DIGEST_CONTENT));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_DIGEST_ALGORITHM,                "Which algorithm (for example MD5 or SHA-1) to use to perform an on-the-fly digest" +                " hash of retrieved content-bodies.",                DEFAULT_DIGEST_ALGORITHM, DIGEST_ALGORITHMS));        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_MODIFIED_SINCE,                "Send 'If-Modified-Since' header, if previous 'Last-Modified' " +                "fetch history information is available in URI history.",                 new Boolean(true)));        e.setOverrideable(true);        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SEND_IF_NONE_MATCH,                "Send 'If-None-Match' header, if previous 'Etag' fetch " +                "history information is available in URI history.",                 new Boolean(true)));        e.setOverrideable(true);        e.setExpertSetting(true);       
 e = addElementToDefinition(new SimpleType(ATTR_SEND_CONNECTION_CLOSE,            "Send 'Connection: close' header with every request.",             new Boolean(true)));        e.setOverrideable(true);        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SEND_REFERER,             "Send 'Referer' header with every request.\n" +             "The 'Referer' header contans the location the crawler came " +             " from, " +             "the page the current URI was discovered in. The 'Referer' " +             "usually is " +             "logged on the remote server and can be of assistance to " +             "webmasters trying to figure how a crawler got to a " +             "particular area on a site.",             new Boolean(true)));        e.setOverrideable(true);        e.setExpertSetting(true);        e = addElementToDefinition(new SimpleType(ATTR_SEND_RANGE,              "Send 'Range' header when a limit (" + ATTR_MAX_LENGTH_BYTES +              ") on document size.\n" +              "Be polite to the HTTP servers and send the 'Range' header," +              "stating that you are only interested in the first n bytes. " +              "Only pertinent if " + ATTR_MAX_LENGTH_BYTES + " > 0. " +              "Sending the 'Range' header results in a " +              "'206 Partial Content' status response, which is better than " +              "just cutting the response mid-download. On rare occasion, " +              " sending 'Range' will " +              "generate '416 Request Range Not Satisfiable' response.",              new Boolean(false)));           e.setOverrideable(true);           e.setExpertSetting(true);           e = addElementToDefinition(new SimpleType(ATTR_LOCAL_ADDRESS,               "Local IP address or hostname to use when making connections " +               "(binding sockets). 
When not specified, uses default local" +
               "address(es).", ""));
           e.setExpertSetting(true);
    }

    /**
     * Fetches the given URI over HTTP and records the outcome on the
     * CrawlURI.
     *
     * <p>Sequence, as performed below:
     * <ol>
     * <li>Return immediately if {@code canFetch(curi)} is false.</li>
     * <li>Count the URI, stamp the fetch-begin time, and configure (or
     *     clear) the content digest on the thread's HttpRecorder.</li>
     * <li>Build a POST or GET recorder method whose
     *     {@code readResponseBody} hook consults
     *     {@link #checkMidfetchAbort} right before the body is read.</li>
     * <li>Apply size/time/rate limits to the recorded input and execute
     *     the method.</li>
     * <li>Unless aborted, read the response through to the end so that
     *     socket hangs surface here rather than in later modules.</li>
     * <li>Always (in {@code finally}): close recorders, release the
     *     connection if not aborted, stamp the completion time, and set
     *     encoding and sizes.</li>
     * <li>Store the digest, log the result, then either promote
     *     credentials (success with credentials added) or handle a 401.</li>
     * <li>As a second chance, close the recorded input stream if it is
     *     still open.</li>
     * </ol>
     *
     * @param curi the URI to fetch; results are written back onto it
     * @throws InterruptedException declared by the signature; no statement
     *         in this method throws it directly (presumably raised by a
     *         callee or required by the overridden contract -- confirm)
     */
    protected void innerProcess(final CrawlURI curi)
    throws InterruptedException {
        if (!canFetch(curi)) {
            // Cannot fetch this, due to protocol, retries, or other problems
            return;
        }

        this.curisHandled++;

        // Note begin time
        curi.putLong(A_FETCH_BEGAN_TIME, System.currentTimeMillis());

        // Get a reference to the HttpRecorder that is set into this ToeThread.
        HttpRecorder rec = HttpRecorder.getHttpRecorder();

        // Shall we get a digest on the content downloaded?
        boolean digestContent  = ((Boolean)getUncheckedAttribute(curi,
                ATTR_DIGEST_CONTENT)).booleanValue();
        String algorithm = null;
        if (digestContent) {
            algorithm = ((String)getUncheckedAttribute(curi,
                ATTR_DIGEST_ALGORITHM));
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // clear any digest left configured on the recorder
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }

        // Below we do two inner classes that add check of midfetch
        // filters just as we're about to receive the response body.
        // The two bodies are intentionally identical; they differ only in
        // the recorder method class they extend.
        String curiString = curi.getUURI().toString();
        HttpMethodBase method = null;
        if (curi.isPost()) {
            method = new HttpRecorderPostMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn)
                throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod, conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        } else {
            method = new HttpRecorderGetMethod(curiString, rec) {
                protected void readResponseBody(HttpState state,
                        HttpConnection conn)
                throws IOException, HttpException {
                    addResponseContent(this, curi);
                    if (checkMidfetchAbort(curi, this.httpRecorderMethod,
                            conn)) {
                        doAbort(curi, this, MIDFETCH_ABORT_LOG);
                    } else {
                        super.readResponseBody(state, conn);
                    }
                }
            };
        }

        HostConfiguration customConfigOrNull = configureMethod(curi, method);

        // Set httpRecorder into curi. Subsequent code both here and later
        // in extractors expects to find the HttpRecorder in the CrawlURI.
        // (doAbort() below also reaches the recorder through the CrawlURI.)
        curi.setHttpRecorder(rec);

        // Populate credentials. Set config so auth. is not automatic.
        boolean addedCredentials = populateCredentials(curi, method);
        method.setDoAuthentication(addedCredentials);

        // set hardMax on bytes (if set by operator)
        long hardMax = getMaxLength(curi);
        // set overall timeout (if set by operator).
        // Multiply with a long literal so the seconds-to-milliseconds
        // conversion is carried out in 64-bit arithmetic; with a plain
        // int literal an int-valued timeout would overflow before being
        // widened to long.
        long timeoutMs = 1000L * getTimeout(curi);
        // Get max fetch rate (bytes/ms). It comes in in KB/sec
        long maxRateKBps = getMaxFetchRate(curi);
        rec.getRecordedInput().setLimits(hardMax, timeoutMs, maxRateKBps);

        try {
            this.http.executeMethod(customConfigOrNull, method);
        } catch (RecorderTooMuchHeaderException ex) {
            // when too much header material, abort like other truncations
            doAbort(curi, method, HEADER_TRUNC);
        } catch (IOException e) {
            failedExecuteCleanup(method, curi, e);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions in native
            // code... see
            // http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            failedExecuteCleanup(method, curi, e);
            return;
        }

        // set softMax on bytes to get (if implied by content-length)
        long softMax = method.getResponseContentLength();

        try {
            if (!method.isAborted()) {
                // Force read-to-end, so that any socket hangs occur here,
                // not in later modules.
                rec.getRecordedInput().readFullyOrUntil(softMax);
            }
        } catch (RecorderTimeoutException ex) {
            doAbort(curi, method, TIMER_TRUNC);
        } catch (RecorderLengthExceededException ex) {
            doAbort(curi, method, LENGTH_TRUNC);
        } catch (IOException e) {
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions from native code
            // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } finally {
            // Runs on every path out of the try, including the early
            // returns in the catch clauses above.
            // ensure recording has stopped
            rec.closeRecorders();
            if (!method.isAborted()) {
                method.releaseConnection();
            }
            // Note completion time
            curi.putLong(A_FETCH_COMPLETED_TIME, System.currentTimeMillis());
            // Set the response charset into the HttpRecord if available.
            setCharacterEncoding(rec, method);
            setSizes(curi, rec);
        }

        if (digestContent) {
            curi.setContentDigest(algorithm,
                rec.getRecordedInput().getDigestValue());
        }
        if (logger.isLoggable(Level.INFO)) {
            logger.info((curi.isPost()? "POST": "GET") + " " +
                curi.getUURI().toString() + " " + method.getStatusCode() +
                " " + rec.getRecordedInput().getSize() + " " +
                curi.getContentType());
        }

        if (curi.isSuccess() && addedCredentials) {
            // Promote the credentials from the CrawlURI to the CrawlServer
            // so they are available for all subsequent CrawlURIs on this
            // server.
            promoteCredentials(curi);
            if (logger.isLoggable(Level.FINE)) {
                // Print out the cookie.  Might help with the debugging.
                Header setCookie = method.getResponseHeader("set-cookie");
                if (setCookie != null) {
                    logger.fine(setCookie.toString().trim());
                }
            }
        } else if (method.getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
            // 401 is not 'success'.
            handle401(method, curi);
        }

        // Second-chance close: the recorded input stream is expected to be
        // closed already at this point; if not, complain loudly and close it.
        if (rec.getRecordedInput().isOpen()) {
            logger.severe(curi.toString() + " RIS still open. Should have" +
                " been closed by method release: " +
                Thread.currentThread().getName());
            try {
                rec.getRecordedInput().close();
            } catch (IOException e) {
                logger.log(Level.SEVERE,"second-chance RIS close failed",e);
            }
        }
    }

    /**
     * Update CrawlURI internal sizes based on current transaction (and
     * in the case of 304s, history).
     *
     * <p>The content size is always set to the recorded input size. When
     * the fetch status is 304 (not modified) and fetch history is present,
     * the 'reference-length' of the most recent history entry (index 0) is
     * carried forward onto the CrawlURI and added to the reported content
     * size. A missing or empty history array is tolerated and simply
     * leaves the plain recorded size in place.
     *
     * @param curi CrawlURI
     * @param rec HttpRecorder
     */
    protected void setSizes(final CrawlURI curi, HttpRecorder rec) {
        // set reporting size
        curi.setContentSize(rec.getRecordedInput().getSize());
        // special handling for 304-not modified
        if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED
                && curi.containsKey(A_FETCH_HISTORY)) {
            AList history[] = curi.getAList().getAListArray(A_FETCH_HISTORY);
            // Guard the array itself before indexing: a null or zero-length
            // history must not abort size bookkeeping with an exception.
            if (history != null
                    && history.length > 0
                    && history[0] != null
                    && history[0]
                            .containsKey(CoreAttributeConstants.A_REFERENCE_LENGTH)) {
                long referenceLength = history[0].getLong(A_REFERENCE_LENGTH);
                // carry-forward previous 'reference-length' for future
                curi.putLong(A_REFERENCE_LENGTH, referenceLength);
                // increase content-size to virtual-size for reporting
                curi.setContentSize(rec.getRecordedInput().getSize()
                        + referenceLength);
            }
        }
    }

    /**
     * Aborts an in-progress fetch: annotates the CrawlURI with the given
     * reason, closes the HttpRecorder held by the CrawlURI, and aborts the
     * HTTP method.
     *
     * <p>The recorder is obtained from {@code curi.getHttpRecorder()}, so
     * it must already have been set on the CrawlURI when this is called.
     *
     * @param curi CrawlURI whose fetch is being aborted
     * @param method HTTP method to abort
     * @param annotation reason for the abort, added to the CrawlURI
     */
    protected void doAbort(CrawlURI curi, HttpMethod method,
            String annotation) {
        curi.addAnnotation(annotation);
        curi.getHttpRecorder().close();
        method.abort();
    }

    /**
     * Decides, just before the response body is read, whether the fetch
     * should be aborted.
     *
     * <p>Prerequisite URIs are never aborted, nor are URIs accepted by the
     * midfetch rule. In every other case the content beginning is marked
     * on the method and an abort is signalled.
     *
     * @param curi CrawlURI being fetched
     * @param method recorder method on which to mark the content beginning
     * @param conn connection the response is arriving on
     * @return {@code true} if the caller should abort the fetch,
     *         {@code false} to continue reading the body
     */
    protected boolean checkMidfetchAbort(CrawlURI curi,
            HttpRecorderMethod method, HttpConnection conn) {
        if (curi.isPrerequisite() || rulesAccept(getMidfetchRule(curi), curi)) {
            return false;
        }
        method.markContentBegin(conn);
        return true;
    }

    protected DecideRule getMidfetchRule(Object o) {
        try {
            return (DecideRule)getAttribute(o, ATTR_MIDFETCH_DECIDE_RULES);
        } catch (AttributeNotFoundException e) {
            throw new RuntimeException(e);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -