⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 experimentalwarcwriterprocessor.java

📁 Heritrix是一个开源、可扩展的Web爬虫项目。Heritrix被设计为严格遵循robots.txt文件的排除指示和META robots标签。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
                                NAMED_FIELD_TRUNCATED_VALUE_LEN:                                curi.isHeaderTruncatedFetch()?                                    NAMED_FIELD_TRUNCATED_VALUE_HEAD:                            // TODO: Add this to spec.                            TRUNCATED_VALUE_UNSPECIFIED;                        headers.addLabelValue(HEADER_KEY_TRUNCATED, value);                    }                    rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE,                    	baseid, curi, headers);                }                                headers = new ANVLRecord(1);                headers.addLabelValue(HEADER_KEY_CONCURRENT_TO,                    '<' + rid.toString() + '>');                if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) {                    writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE,                            baseid, curi, headers);                }                if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) {                    writeMetadata(w, timestamp, baseid, curi, headers);                }             } else if (lowerCaseScheme.equals("dns")) {                ANVLRecord headers = null;                String ip = curi.getString(A_DNS_SERVER_IP_LABEL);                if (ip != null && ip.length() > 0) {                    headers = new ANVLRecord(1);                    headers.addLabelValue(HEADER_KEY_IP, ip);                }                writeResponse(w, timestamp, curi.getContentType(), baseid,                    curi, headers);            } else {                logger.warning("No handler for scheme " + lowerCaseScheme);            }        } catch (IOException e) {            // Invalidate this file (It gets a '.invalid' suffix).            
getPool().invalidateFile(writer);            // Set the writer to null otherwise the pool accounting            // of how many active writers gets skewed if we subsequently            // do a returnWriter call on this object in the finally block.            writer = null;            throw e;        } finally {            if (writer != null) {            	setTotalBytesWritten(getTotalBytesWritten() +            	     (writer.getPosition() - position));                getPool().returnFile(writer);            }        }        checkBytesWritten();    }        protected URI writeRequest(final ExperimentalWARCWriter w,            final String timestamp, final String mimetype,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        final URI uid = qualifyRecordID(baseid, TYPE, REQUEST);        ReplayInputStream ris =            curi.getHttpRecorder().getRecordedOutput().getReplayInputStream();        try {            w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid,                namedFields, ris,                curi.getHttpRecorder().getRecordedOutput().getSize());        } finally {            if (ris != null) {                ris.close();            }        }        return uid;    }        protected URI writeResponse(final ExperimentalWARCWriter w,            final String timestamp, final String mimetype,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        ReplayInputStream ris =            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();        try {            w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid,                namedFields, ris,                curi.getHttpRecorder().getRecordedInput().getSize());        } finally {            if (ris != null) {                ris.close();            }        }        return baseid;    }        protected URI writeRevisitDigest(final 
ExperimentalWARCWriter w,            final String timestamp, final String mimetype,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        long revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin();        revisedLength = revisedLength > 0             ? revisedLength             : curi.getHttpRecorder().getRecordedInput().getSize();        namedFields.addLabelValue(        		HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST);        namedFields.addLabelValue(        		HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LEN);        ReplayInputStream ris =            curi.getHttpRecorder().getRecordedInput().getReplayInputStream();        try {            w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid,                namedFields, ris, revisedLength);        } finally {            if (ris != null) {                ris.close();            }        }        return baseid;    }        protected URI writeRevisitNotModified(final ExperimentalWARCWriter w,            final String timestamp,             final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        namedFields.addLabelValue(        		HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED);        // save just enough context to understand basis of not-modified        if(curi.containsKey(A_HTTP_TRANSACTION)) {            HttpMethodBase method =                 (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION);            saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG);            saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields,            		HEADER_KEY_LAST_MODIFIED);        }        // truncate to zero-length (all necessary info is above)        namedFields.addLabelValue(HEADER_KEY_TRUNCATED,            NAMED_FIELD_TRUNCATED_VALUE_LEN);        ReplayInputStream ris =            
curi.getHttpRecorder().getRecordedInput().getReplayInputStream();        try {            w.writeRevisitRecord(curi.toString(), timestamp, null, baseid,                namedFields, ris, 0);        } finally {            if (ris !=  null) {                ris.close();            }        }        return baseid;    }        /**     * Save a header from the given HTTP operation into the      * provider headers under a new name     *      * @param origName header name to get if present     * @param method http operation containing headers     */    protected void saveHeader(String origName, HttpMethodBase method,     		ANVLRecord headers, String newName) {        Header header = method.getResponseHeader(origName);        if(header!=null) {            headers.addLabelValue(newName, header.getValue());        }    }	protected URI writeMetadata(final ExperimentalWARCWriter w,            final String timestamp,            final URI baseid, final CrawlURI curi,            final ANVLRecord namedFields)     throws IOException {        final URI uid = qualifyRecordID(baseid, TYPE, METADATA);        // Get some metadata from the curi.        // TODO: Get all curi metadata.        // TODO: Use other than ANVL (or rename ANVL as NameValue or use        // RFC822 (commons-httpclient?).        ANVLRecord r = new ANVLRecord();        if (curi.isSeed()) {            r.addLabel("seed");        } else {        	if (curi.forceFetch()) {        		r.addLabel("force-fetch");        	}            r.addLabelValue("via", curi.flattenVia());            r.addLabelValue("pathFromSeed", curi.getPathFromSeed());            if (curi.containsKey(A_SOURCE_TAG)) {                r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG));            }        }                // Add outlinks though they are effectively useless without anchor text.        
Collection<Link> links = curi.getOutLinks();        if (links != null && links.size() > 0) {            for (Link link: links) {                r.addLabelValue("outlink", link.toString());            }        }                // TODO: Other curi fields to write to metadata.        //         // Credentials        //         // fetch-began-time: 1154569278774        // fetch-completed-time: 1154569281816        //        // Annotations.                byte [] b = r.getUTF8Bytes();        w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE,            uid, namedFields, new ByteArrayInputStream(b), b.length);        return uid;    }        protected URI getRecordID() throws IOException {        URI result;        try {            result = GeneratorFactory.getFactory().getRecordID();        } catch (URISyntaxException e) {            throw new IOException(e.toString());        }        return result;    }        protected URI qualifyRecordID(final URI base, final String key,            final String value)    throws IOException {        URI result;        Map<String, String> qualifiers = new HashMap<String, String>(1);        qualifiers.put(key, value);        try {            result = GeneratorFactory.getFactory().                qualifyRecordID(base, qualifiers);        } catch (URISyntaxException e) {            throw new IOException(e.toString());        }        return result;    }          @Override    protected String getFirstrecordStylesheet() {        return "/warcinfobody.xsl";    }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -