📄 kw3writerprocessor.java
字号:
} } /* * The actual writing of the Kulturarw3 MIME-file. * * The MIME-file consists of three parts: * 1. ArchiveInfo - Metadata about the file and its content. * 2. Header - The HTTP response header. * 3. Content - The HTTP response content, plus content-type. * * For more on this format, see '?'. */ protected void writeMimeFile(CrawlURI curi) throws IOException { ReplayInputStream ris = null; OutputStream out = null; try { String boundary = BOUNDARY_START + stringToMD5(curi.toString()); ris = curi.getHttpRecorder().getRecordedInput(). getReplayInputStream(); out = initOutputStream(curi); // Part 1: Archive info writeArchiveInfoPart(boundary, curi, ris, out); // Part 2: Header info + HTTP header writeHeaderPart(boundary, ris, out); // Part 3: Content info + HTTP content writeContentPart(boundary, curi, ris, out); // And finally the terminator string String terminator = "\n--" + boundary + "--\n"; out.write(terminator.getBytes()); } finally { if (ris != null) ris.close(); if (out != null) out.close(); } } /* * Get the OutputStream for the file to write to. * * It has a path consisting of: * 1. A dir named with the first two chars of the website's md5. * 2. A dir named after the website. * 3. 'current' - a dir indicating that this is the directory being written * to by the ongoing crawl. * 4. A file on the format <md5 of url>.<fetchtime in seconds> * * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837' */ protected OutputStream initOutputStream(CrawlURI curi) throws IOException { String uri = curi.toString(); int port = curi.getUURI().getPort(); String host = (port == 80 || port <= 0) ? curi.getUURI().getHost() : curi.getUURI().getHost() + ":" + port; long fetchTime = curi.getLong(A_FETCH_BEGAN_TIME) / 1000; String md5 = stringToMD5(host); File dir = new File(this.arcsDir, md5.substring(0, 2) + "/" + host + "/current"); if (!dir.exists()) { dir.mkdirs(); if (this.chmod) chmods(dir, this.arcsDir); } md5 = stringToMD5(uri); File arcFile = new File(dir, md5 + "." + fetchTime); return new FastBufferedOutputStream(new FileOutputStream(arcFile)); } protected void writeArchiveInfoPart(String boundary, CrawlURI curi, ReplayInputStream ris, OutputStream out) throws IOException { // Get things we need to write in this part String uri = curi.toString(); String ip = getHostAddress(curi); long headerLength = ris.getHeaderSize(); long contentLength = ris.getContentSize(); long archiveTime = System.currentTimeMillis() / 1000; // Fetchtime in seconds int statusCode = curi.getFetchStatus(); String headerMd5 = null; Object contentMd5 = null; // Get headerMd5 ByteArrayOutputStream baos = new ByteArrayOutputStream(); ris.readHeaderTo(baos); headerMd5 = stringToMD5(baos.toString()); // Get contentMd5 contentMd5 = curi.getContentDigest(); if (contentMd5 != null) contentMd5 = getHexString((byte[]) contentMd5); StringBuffer buffer = new StringBuffer(); buffer.append("MIME-version: 1.1" + LF); buffer.append("Content-Type: multipart/mixed; boundary=" + boundary + LF); buffer.append("HTTP-Part: ArchiveInfo" + LF); buffer.append(COLLECTION_KEY + COLON + WS + this.collection + LF); buffer.append(HARVESTER_KEY + COLON + WS + this.harvester + LF); buffer.append(URL_KEY + COLON + WS + uri + LF); buffer.append(IP_ADDRESS_KEY + COLON + WS + ip + LF); buffer.append(HEADER_LENGTH_KEY + COLON + WS + headerLength + LF); buffer.append(HEADER_MD5_KEY + COLON + WS + headerMd5 + LF); buffer.append(CONTENT_LENGTH_KEY + COLON + WS + contentLength + LF); buffer.append(CONTENT_MD5_KEY + COLON + WS + contentMd5 + LF); buffer.append(ARCHIVE_TIME_KEY + COLON + WS+ archiveTime + LF); buffer.append(STATUS_CODE_KEY + COLON + WS + statusCode + LF + LF); out.write(buffer.toString().getBytes()); } protected void writeHeaderPart(String boundary, ReplayInputStream ris, OutputStream out) throws IOException { StringBuffer buffer = new StringBuffer(); buffer.append("--" + boundary + LF); buffer.append("Content-Type: text/plain; charset=\"US-ascii\"" + LF); buffer.append("HTTP-Part: Header" + LF + LF ); out.write(buffer.toString().getBytes()); ris.readHeaderTo(out); } protected void writeContentPart(String boundary, CrawlURI curi, ReplayInputStream ris, OutputStream out) throws IOException { // Get things we need to write in this part String uri = curi.toString(); String contentType = curi.getContentType(); long contentLength = ris.getContentSize(); // Only write content if there is some if (contentLength == 0) return; StringBuffer buffer = new StringBuffer(); buffer.append("--" + boundary + LF); buffer.append("Content-Type: " + contentType + LF); buffer.append("HTTP-Part: Content" + LF + LF); out.write(buffer.toString().getBytes()); if (contentLength > this.maxSize) { ris.readContentTo(out, this.maxSize); logger.info(" Truncated url: " + uri + ", Size: " + contentLength + ", Content-type: " + contentType); } else { ris.readContentTo(out); } } // --- Private helper functions --- // /* * Get a MD5 checksum based on a String. */ private String stringToMD5(String str) { try { byte b[] = str.getBytes(); MessageDigest md = MessageDigest.getInstance("MD5"); md.update(b); byte[] digest = md.digest(); return getHexString(digest); } catch (NoSuchAlgorithmException e) { logger.log(Level.WARNING, "md5 error", e); } return null; } /* * Fast convert a byte array to a hex string with possible leading zero. */ private String getHexString(byte[] b) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < b.length; i++) { String tmp = Integer.toHexString(b[i] & 0xff); if (tmp.length() < 2) sb.append("0" + tmp); else sb.append(tmp); } return sb.toString(); } /* * Chmods for all newly created directories. */ private void chmods(File dir, File arcsDir) { String topdir = arcsDir.getAbsolutePath(); chmod(dir, this.chmodValue); File parent = dir.getParentFile(); while (!parent.getAbsolutePath().equalsIgnoreCase((topdir))) { chmod(parent, this.chmodValue); parent = parent.getParentFile(); } } /* * Chmod for a specific file or directory. */ private void chmod(File file, String permissions) { Process proc = null; try { proc = Runtime.getRuntime().exec("chmod " + permissions + " " + file.getAbsolutePath()); proc.waitFor(); proc.getInputStream().close(); proc.getOutputStream().close(); proc.getErrorStream().close(); } catch (IOException e) { logger.log(Level.WARNING, "chmod failed", e); } catch (InterruptedException e) { logger.log(Level.WARNING, "chmod failed", e); } } private String getHostAddress(CrawlURI curi) { CrawlHost h = getController().getServerCache().getHostFor(curi); if (h == null) { throw new NullPointerException("Crawlhost is null for " + curi + " " + curi.getVia()); } InetAddress a = h.getIP(); if (a == null) { throw new NullPointerException("Address is null for " + curi + " " + curi.getVia() + ". Address " + ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ? "was never looked up." : (System.currentTimeMillis() - h.getIpFetched()) + " ms ago.")); } return h.getIP().getHostAddress(); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -