📄 experimentalwarcwriterprocessor.java
字号:
NAMED_FIELD_TRUNCATED_VALUE_LEN: curi.isHeaderTruncatedFetch()? NAMED_FIELD_TRUNCATED_VALUE_HEAD: // TODO: Add this to spec. TRUNCATED_VALUE_UNSPECIFIED; headers.addLabelValue(HEADER_KEY_TRUNCATED, value); } rid = writeResponse(w, timestamp, HTTP_RESPONSE_MIMETYPE, baseid, curi, headers); } headers = new ANVLRecord(1); headers.addLabelValue(HEADER_KEY_CONCURRENT_TO, '<' + rid.toString() + '>'); if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_REQUESTS))) { writeRequest(w, timestamp, HTTP_REQUEST_MIMETYPE, baseid, curi, headers); } if(((Boolean)getUncheckedAttribute(curi, ATTR_WRITE_METADATA))) { writeMetadata(w, timestamp, baseid, curi, headers); } } else if (lowerCaseScheme.equals("dns")) { ANVLRecord headers = null; String ip = curi.getString(A_DNS_SERVER_IP_LABEL); if (ip != null && ip.length() > 0) { headers = new ANVLRecord(1); headers.addLabelValue(HEADER_KEY_IP, ip); } writeResponse(w, timestamp, curi.getContentType(), baseid, curi, headers); } else { logger.warning("No handler for scheme " + lowerCaseScheme); } } catch (IOException e) { // Invalidate this file (It gets a '.invalid' suffix). getPool().invalidateFile(writer); // Set the writer to null otherwise the pool accounting // of how many active writers gets skewed if we subsequently // do a returnWriter call on this object in the finally block. writer = null; throw e; } finally { if (writer != null) { setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position)); getPool().returnFile(writer); } } checkBytesWritten(); } protected URI writeRequest(final ExperimentalWARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { final URI uid = qualifyRecordID(baseid, TYPE, REQUEST); ReplayInputStream ris = curi.getHttpRecorder().getRecordedOutput().getReplayInputStream(); try { w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid, namedFields, ris, curi.getHttpRecorder().getRecordedOutput().getSize()); } finally { if (ris != null) { ris.close(); } } return uid; } protected URI writeResponse(final ExperimentalWARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { ReplayInputStream ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream(); try { w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid, namedFields, ris, curi.getHttpRecorder().getRecordedInput().getSize()); } finally { if (ris != null) { ris.close(); } } return baseid; } protected URI writeRevisitDigest(final ExperimentalWARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { long revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin(); revisedLength = revisedLength > 0 ? revisedLength : curi.getHttpRecorder().getRecordedInput().getSize(); namedFields.addLabelValue( HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST); namedFields.addLabelValue( HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LEN); ReplayInputStream ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream(); try { w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid, namedFields, ris, revisedLength); } finally { if (ris != null) { ris.close(); } } return baseid; } protected URI writeRevisitNotModified(final ExperimentalWARCWriter w, final String timestamp, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { namedFields.addLabelValue( HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED); // save just enough context to understand basis of not-modified if(curi.containsKey(A_HTTP_TRANSACTION)) { HttpMethodBase method = (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION); saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG); saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields, HEADER_KEY_LAST_MODIFIED); } // truncate to zero-length (all necessary info is above) namedFields.addLabelValue(HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LEN); ReplayInputStream ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream(); try { w.writeRevisitRecord(curi.toString(), timestamp, null, baseid, namedFields, ris, 0); } finally { if (ris != null) { ris.close(); } } return baseid; } /** * Save a header from the given HTTP operation into the * provider headers under a new name * * @param origName header name to get if present * @param method http operation containing headers */ protected void saveHeader(String origName, HttpMethodBase method, ANVLRecord headers, String newName) { Header header = method.getResponseHeader(origName); if(header!=null) { headers.addLabelValue(newName, header.getValue()); } } protected URI writeMetadata(final ExperimentalWARCWriter w, final String timestamp, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { final URI uid = qualifyRecordID(baseid, TYPE, METADATA); // Get some metadata from the curi. // TODO: Get all curi metadata. // TODO: Use other than ANVL (or rename ANVL as NameValue or use // RFC822 (commons-httpclient?). ANVLRecord r = new ANVLRecord(); if (curi.isSeed()) { r.addLabel("seed"); } else { if (curi.forceFetch()) { r.addLabel("force-fetch"); } r.addLabelValue("via", curi.flattenVia()); r.addLabelValue("pathFromSeed", curi.getPathFromSeed()); if (curi.containsKey(A_SOURCE_TAG)) { r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG)); } } // Add outlinks though they are effectively useless without anchor text. Collection<Link> links = curi.getOutLinks(); if (links != null && links.size() > 0) { for (Link link: links) { r.addLabelValue("outlink", link.toString()); } } // TODO: Other curi fields to write to metadata. // // Credentials // // fetch-began-time: 1154569278774 // fetch-completed-time: 1154569281816 // // Annotations. byte [] b = r.getUTF8Bytes(); w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE, uid, namedFields, new ByteArrayInputStream(b), b.length); return uid; } protected URI getRecordID() throws IOException { URI result; try { result = GeneratorFactory.getFactory().getRecordID(); } catch (URISyntaxException e) { throw new IOException(e.toString()); } return result; } protected URI qualifyRecordID(final URI base, final String key, final String value) throws IOException { URI result; Map<String, String> qualifiers = new HashMap<String, String>(1); qualifiers.put(key, value); try { result = GeneratorFactory.getFactory(). qualifyRecordID(base, qualifiers); } catch (URISyntaxException e) { throw new IOException(e.toString()); } return result; } @Override protected String getFirstrecordStylesheet() { return "/warcinfobody.xsl"; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -