📄 warcwriterprocessor.java
字号:
} finally { if (writer != null) { setTotalBytesWritten(getTotalBytesWritten() + (writer.getPosition() - position)); getPool().returnFile(writer); } } checkBytesWritten(); } protected URI writeRequest(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { final URI uid = qualifyRecordID(baseid, TYPE, REQUEST); ReplayInputStream ris = curi.getHttpRecorder().getRecordedOutput().getReplayInputStream(); try { w.writeRequestRecord(curi.toString(), timestamp, mimetype, uid, namedFields, ris, curi.getHttpRecorder().getRecordedOutput().getSize()); } finally { if (ris != null) { ris.close(); } } return uid; } protected URI writeResponse(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { ReplayInputStream ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream(); try { w.writeResponseRecord(curi.toString(), timestamp, mimetype, baseid, namedFields, ris, curi.getHttpRecorder().getRecordedInput().getSize()); } finally { if (ris != null) { ris.close(); } } return baseid; } protected URI writeResource(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { ReplayInputStream ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream(); try { w.writeResourceRecord(curi.toString(), timestamp, mimetype, baseid, namedFields, ris, curi.getHttpRecorder().getRecordedInput().getSize()); } finally { if (ris != null) { ris.close(); } } return baseid; } protected URI writeRevisitDigest(final WARCWriter w, final String timestamp, final String mimetype, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { long revisedLength = curi.getHttpRecorder().getRecordedInput().getContentBegin(); revisedLength = revisedLength > 0 ? revisedLength : curi.getHttpRecorder().getRecordedInput().getSize(); namedFields.addLabelValue( HEADER_KEY_PROFILE, PROFILE_REVISIT_IDENTICAL_DIGEST); namedFields.addLabelValue( HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH); ReplayInputStream ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream(); try { w.writeRevisitRecord(curi.toString(), timestamp, mimetype, baseid, namedFields, ris, revisedLength); } finally { if (ris != null) { ris.close(); } } curi.addAnnotation("warcRevisit:digest"); return baseid; } protected URI writeRevisitNotModified(final WARCWriter w, final String timestamp, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { namedFields.addLabelValue( HEADER_KEY_PROFILE, PROFILE_REVISIT_NOT_MODIFIED); // save just enough context to understand basis of not-modified if(curi.containsKey(A_HTTP_TRANSACTION)) { HttpMethodBase method = (HttpMethodBase) curi.getObject(A_HTTP_TRANSACTION); saveHeader(A_ETAG_HEADER,method,namedFields,HEADER_KEY_ETAG); saveHeader(A_LAST_MODIFIED_HEADER,method,namedFields, HEADER_KEY_LAST_MODIFIED); } // truncate to zero-length (all necessary info is above) namedFields.addLabelValue(HEADER_KEY_TRUNCATED, NAMED_FIELD_TRUNCATED_VALUE_LENGTH); ReplayInputStream ris = curi.getHttpRecorder().getRecordedInput().getReplayInputStream(); try { w.writeRevisitRecord(curi.toString(), timestamp, null, baseid, namedFields, ris, 0); } finally { if (ris != null) { ris.close(); } } curi.addAnnotation("warcRevisit:notModified"); return baseid; } /** * Save a header from the given HTTP operation into the * provider headers under a new name * * @param origName header name to get if present * @param method http operation containing headers */ protected void saveHeader(String origName, HttpMethodBase method, ANVLRecord headers, String newName) { Header header = method.getResponseHeader(origName); if(header!=null) { headers.addLabelValue(newName, header.getValue()); } } protected URI writeMetadata(final WARCWriter w, final String timestamp, final URI baseid, final CrawlURI curi, final ANVLRecord namedFields) throws IOException { final URI uid = qualifyRecordID(baseid, TYPE, METADATA); // Get some metadata from the curi. // TODO: Get all curi metadata. // TODO: Use other than ANVL (or rename ANVL as NameValue or use // RFC822 (commons-httpclient?). ANVLRecord r = new ANVLRecord(); if (curi.isSeed()) { r.addLabel("seed"); } else { if (curi.forceFetch()) { r.addLabel("force-fetch"); } r.addLabelValue("via", curi.flattenVia()); r.addLabelValue("hopsFromSeed", curi.getPathFromSeed()); if (curi.containsKey(A_SOURCE_TAG)) { r.addLabelValue("sourceTag", curi.getString(A_SOURCE_TAG)); } } long duration = curi.getFetchDuration(); if(duration>-1) { r.addLabelValue("fetchTimeMs", Long.toString(duration)); } // Add outlinks though they are effectively useless without anchor text. Collection<Link> links = curi.getOutLinks(); if (links != null && links.size() > 0) { for (Link link: links) { r.addLabelValue("outlink", link.toString()); } } // TODO: Other curi fields to write to metadata. // // Credentials // // fetch-began-time: 1154569278774 // fetch-completed-time: 1154569281816 // // Annotations. byte [] b = r.getUTF8Bytes(); w.writeMetadataRecord(curi.toString(), timestamp, ANVLRecord.MIMETYPE, uid, namedFields, new ByteArrayInputStream(b), b.length); return uid; } protected URI getRecordID() throws IOException { URI result; try { result = GeneratorFactory.getFactory().getRecordID(); } catch (URISyntaxException e) { throw new IOException(e.toString()); } return result; } protected URI qualifyRecordID(final URI base, final String key, final String value) throws IOException { URI result; Map<String, String> qualifiers = new HashMap<String, String>(1); qualifiers.put(key, value); try { result = GeneratorFactory.getFactory(). qualifyRecordID(base, qualifiers); } catch (URISyntaxException e) { throw new IOException(e.toString()); } return result; } @Override protected String getFirstrecordStylesheet() { return "/warcinfobody.xsl"; } /** * Return relevant values as header-like fields (here ANVLRecord, but * spec-defined "application/warc-fields" type when written). Field * names from from DCMI Terms and the WARC/0.17 specification. * * @see org.archive.crawler.framework.WriterPoolProcessor#getFirstrecordBody(java.io.File) */ @Override protected String getFirstrecordBody(File orderFile) { ANVLRecord record = new ANVLRecord(7); record.addLabelValue("software", "Heritrix/" + Heritrix.getVersion() + " http://crawler.archive.org"); try { InetAddress host = InetAddress.getLocalHost(); record.addLabelValue("ip", host.getHostAddress()); record.addLabelValue("hostname", host.getHostName()); } catch (UnknownHostException e) { logger.log(Level.WARNING,"unable top obtain local crawl engine host",e); } record.addLabelValue("format","WARC File Format 0.17"); record.addLabelValue("conformsTo","http://crawler.archive.org/warc/0.17/WARC0.17ISO.doc"); // Get other values from order.xml try { Document doc = XmlUtils.getDocument(orderFile); addIfNotBlank(record,"operator", XmlUtils.xpathOrNull(doc,"//meta/operator")); addIfNotBlank(record,"publisher", XmlUtils.xpathOrNull(doc,"//meta/organization")); addIfNotBlank(record,"audience", XmlUtils.xpathOrNull(doc,"//meta/audience")); addIfNotBlank(record,"isPartOf", XmlUtils.xpathOrNull(doc,"//meta/name")); String rawDate = XmlUtils.xpathOrNull(doc,"//meta/date"); if(StringUtils.isNotBlank(rawDate)) { Date date; try { date = ArchiveUtils.parse14DigitDate(rawDate); addIfNotBlank(record,"created",ArchiveUtils.getLog14Date(date)); } catch (ParseException e) { logger.log(Level.WARNING,"obtaining warc created date",e); } } addIfNotBlank(record,"description", XmlUtils.xpathOrNull(doc,"//meta/description")); addIfNotBlank(record,"robots", XmlUtils.xpathOrNull(doc, "//newObject[@name='robots-honoring-policy']/string[@name='type']")); addIfNotBlank(record,"http-header-user-agent", XmlUtils.xpathOrNull(doc, "//map[@name='http-headers']/string[@name='user-agent']")); addIfNotBlank(record,"http-header-from", XmlUtils.xpathOrNull(doc, "//map[@name='http-headers']/string[@name='from']")); } catch (IOException e) { logger.log(Level.WARNING,"obtaining warcinfo",e); } // really ugly to return as string, when it may just be merged with // a couple other fields at write time, but changing would require // larger refactoring return record.toString(); } protected void addIfNotBlank(ANVLRecord record, String label, String value) { if(StringUtils.isNotBlank(value)) { record.addLabelValue(label, value); } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -