⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 kw3writerprocessor.java

📁 Heritrix是一个开源,可扩展的web爬虫项目。Heritrix设计成严格按照robots.txt文件的排除指示和META robots标签。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
      }
  }

  /*
   * The actual writing of the Kulturarw3 MIME-file.
   *
   * The MIME-file consists of three parts:
   * 1. ArchiveInfo - Metadata about the file and its content.
   * 2. Header - The HTTP response header.
   * 3. Content - The HTTP response content, plus content-type.
   *
   * For more on this format, see '?'.
   *
   * Both the replay stream and the output stream are closed before this
   * method returns, whether or not writing succeeded.
   */
  protected void writeMimeFile(CrawlURI curi) throws IOException {
      ReplayInputStream ris = null;
      OutputStream out = null;

      try {
          String boundary = BOUNDARY_START + stringToMD5(curi.toString());
          ris = curi.getHttpRecorder().getRecordedInput().
              getReplayInputStream();
          out = initOutputStream(curi);

          // Part 1: Archive info
          writeArchiveInfoPart(boundary, curi, ris, out);

          // Part 2: Header info + HTTP header
          writeHeaderPart(boundary, ris, out);

          // Part 3: Content info + HTTP content
          writeContentPart(boundary, curi, ris, out);

          // And finally the terminator string
          String terminator = "\n--" + boundary + "--\n";
          out.write(terminator.getBytes());
      } finally {
          // Nested so that a failure while closing the replay stream
          // cannot leave the output file open.
          try {
              if (ris != null) {
                  ris.close();
              }
          } finally {
              if (out != null) {
                  out.close();
              }
          }
      }
  }

  /*
   * Get the OutputStream for the file to write to.
   *
   * It has a path consisting of:
   * 1. A dir named with the first two chars of the website's md5.
   * 2. A dir named after the website.
   * 3. 'current' - a dir indicating that this is the directory being written
   *                to by the ongoing crawl.
   * 4. A file on the format <md5 of url>.<fetchtime in seconds>
   *
   * Example: '/53/www.kb.se/current/6879ad79c0ccf886ee8ca55d80e5d6a1.1169211837'
   *
   * The port is appended to the host name (as ":port") unless it is 80 or
   * not positive. Missing directories are created and, when the chmod
   * option is on, chmod'ed. Throws IOException if the directory cannot be
   * created or the file cannot be opened.
   */
  protected OutputStream initOutputStream(CrawlURI curi) throws IOException {
      String uri = curi.toString();
      int port = curi.getUURI().getPort();
      String host = (port == 80 || port <= 0) ?
              curi.getUURI().getHost() : curi.getUURI().getHost() + ":" + port;
      long fetchTime = curi.getLong(A_FETCH_BEGAN_TIME) / 1000;

      String md5 = stringToMD5(host);
      File dir = new File(this.arcsDir, md5.substring(0, 2) + "/" + host +
              "/current");
      if (!dir.exists()) {
          // mkdirs() also returns false when someone else created the
          // directory in the meantime, hence the isDirectory() re-check.
          if (!dir.mkdirs() && !dir.isDirectory()) {
              throw new IOException("Could not create directory " + dir);
          }
          if (this.chmod) {
              chmods(dir, this.arcsDir);
          }
      }
      md5 = stringToMD5(uri);
      File arcFile = new File(dir, md5 + "." + fetchTime);
      return new FastBufferedOutputStream(new FileOutputStream(arcFile));
  }

  /*
   * Write the MIME preamble and the ArchiveInfo part: collection,
   * harvester, url, ip address, header length and md5, content length and
   * md5, archive time and status code, one "key: value" line each,
   * followed by an empty line.
   *
   * The header md5 is computed over the text obtained by reading the
   * recorded header from the replay stream into a buffer.
   */
  protected void writeArchiveInfoPart(String boundary, CrawlURI curi,
          ReplayInputStream ris, OutputStream out)
          throws IOException {
      // Get things we need to write in this part
      String uri = curi.toString();
      String ip = getHostAddress(curi);
      long headerLength = ris.getHeaderSize();
      long contentLength = ris.getContentSize();
      // Current time in seconds (the moment of archiving, not of fetching)
      long archiveTime = System.currentTimeMillis() / 1000;
      int statusCode = curi.getFetchStatus();
      String headerMd5 = null;
      Object contentMd5 = null;

      // Get headerMd5
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      ris.readHeaderTo(baos);
      headerMd5 = stringToMD5(baos.toString());

      // Get contentMd5; stays null (and is written as "null") when the
      // CrawlURI carries no content digest.
      contentMd5 = curi.getContentDigest();
      if (contentMd5 != null) {
          contentMd5 = getHexString((byte[]) contentMd5);
      }

      StringBuffer buffer = new StringBuffer();
      buffer.append("MIME-version: 1.1" + LF);
      buffer.append("Content-Type: multipart/mixed; boundary=" + boundary + LF);
      buffer.append("HTTP-Part: ArchiveInfo" + LF);
      buffer.append(COLLECTION_KEY + COLON + WS + this.collection + LF);
      buffer.append(HARVESTER_KEY + COLON + WS + this.harvester + LF);
      buffer.append(URL_KEY + COLON + WS + uri + LF);
      buffer.append(IP_ADDRESS_KEY + COLON + WS + ip + LF);
      buffer.append(HEADER_LENGTH_KEY + COLON + WS + headerLength + LF);
      buffer.append(HEADER_MD5_KEY + COLON + WS + headerMd5 + LF);
      buffer.append(CONTENT_LENGTH_KEY + COLON + WS + contentLength + LF);
      buffer.append(CONTENT_MD5_KEY + COLON + WS + contentMd5 + LF);
      buffer.append(ARCHIVE_TIME_KEY + COLON + WS + archiveTime + LF);
      buffer.append(STATUS_CODE_KEY + COLON + WS + statusCode + LF + LF);

      out.write(buffer.toString().getBytes());
  }

  /*
   * Write the Header part: the boundary line, the part's own MIME header
   * lines, an empty line and then the recorded HTTP response header
   * copied from the replay stream.
   */
  protected void writeHeaderPart(String boundary, ReplayInputStream ris,
          OutputStream out)
          throws IOException {
      StringBuffer buffer = new StringBuffer();
      buffer.append("--" + boundary + LF);
      buffer.append("Content-Type: text/plain; charset=\"US-ascii\"" + LF);
      buffer.append("HTTP-Part: Header" + LF + LF);
      out.write(buffer.toString().getBytes());
      ris.readHeaderTo(out);
  }

  /*
   * Write the Content part: the boundary line, the content-type reported
   * by the CrawlURI, an empty line and then the recorded response content.
   *
   * Nothing at all is written when the recorded content size is 0. When
   * the content size exceeds maxSize, maxSize is passed on as the limit to
   * readContentTo (presumably truncating the copy - confirm against
   * ReplayInputStream) and the url is logged as truncated.
   */
  protected void writeContentPart(String boundary, CrawlURI curi,
          ReplayInputStream ris, OutputStream out)
          throws IOException {
      // Get things we need to write in this part
      String uri = curi.toString();
      String contentType = curi.getContentType();
      long contentLength = ris.getContentSize();

      // Only write content if there is some
      if (contentLength == 0) {
          return;
      }

      StringBuffer buffer = new StringBuffer();
      buffer.append("--" + boundary + LF);
      buffer.append("Content-Type: " + contentType + LF);
      buffer.append("HTTP-Part: Content" + LF + LF);
      out.write(buffer.toString().getBytes());

      if (contentLength > this.maxSize) {
          ris.readContentTo(out, this.maxSize);
          logger.info(" Truncated url: " + uri + ", Size: " + contentLength +
                  ", Content-type: " + contentType);
      } else {
          ris.readContentTo(out);
      }
  }

  // --- Private helper functions --- //

  /*
   * Get a MD5 checksum based on a String.
   *
   * The string is converted to bytes with the platform default charset.
   * Returns the checksum as a lower-case hex string, or null (after
   * logging a warning) if no MD5 implementation is available.
   */
  private String stringToMD5(String str) {
      try {
          byte b[] = str.getBytes();
          MessageDigest md = MessageDigest.getInstance("MD5");
          md.update(b);
          byte[] digest = md.digest();
          return getHexString(digest);
      } catch (NoSuchAlgorithmException e) {
          logger.log(Level.WARNING, "md5 error", e);
      }
      return null;
  }

  /*
   * Fast convert a byte array to a hex string with possible leading zero.
   *
   * Every byte yields exactly two lower-case hex digits.
   */
  private String getHexString(byte[] b) {
      StringBuffer sb = new StringBuffer();
      for (int i = 0; i < b.length; i++) {
          String tmp = Integer.toHexString(b[i] & 0xff);
          if (tmp.length() < 2) {
              sb.append("0" + tmp);
          } else {
              sb.append(tmp);
          }
      }
      return sb.toString();
  }

  /*
   * Chmods for all newly created directories.
   *
   * Applies chmodValue to dir and to each of its ancestors, stopping
   * before arcsDir itself (paths are compared ignoring case). The walk
   * also stops at the filesystem root, so a dir that is not located below
   * arcsDir cannot cause a NullPointerException.
   */
  private void chmods(File dir, File arcsDir) {
      String topdir = arcsDir.getAbsolutePath();
      chmod(dir, this.chmodValue);
      File parent = dir.getParentFile();
      while (parent != null
              && !parent.getAbsolutePath().equalsIgnoreCase(topdir)) {
          chmod(parent, this.chmodValue);
          parent = parent.getParentFile();
      }
  }

  /*
   * Chmod for a specific file or directory.
   *
   * Runs the external 'chmod' command and waits for it to finish. This is
   * best effort: failures are logged as warnings and never propagated.
   * The permissions string and the path are each handed over as a single
   * argument, so a path containing whitespace is not split up.
   */
  private void chmod(File file, String permissions) {
      Process proc = null;
      try {
          proc = Runtime.getRuntime().exec(new String[] {
                  "chmod", permissions, file.getAbsolutePath()});
          int exitCode = proc.waitFor();
          if (exitCode != 0) {
              logger.log(Level.WARNING, "chmod " + permissions + " " +
                      file.getAbsolutePath() + " exited with " + exitCode);
          }
      } catch (IOException e) {
          logger.log(Level.WARNING, "chmod failed", e);
      } catch (InterruptedException e) {
          // Keep the interrupt visible to the code further up the stack.
          Thread.currentThread().interrupt();
          logger.log(Level.WARNING, "chmod failed", e);
      } finally {
          // Release the pipes to the child process on every path.
          if (proc != null) {
              try {
                  proc.getInputStream().close();
                  proc.getOutputStream().close();
                  proc.getErrorStream().close();
              } catch (IOException e) {
                  logger.log(Level.WARNING, "chmod failed", e);
              }
          }
      }
  }

  /*
   * Get the textual IP address of the host the given CrawlURI belongs to,
   * as recorded in the controller's server cache.
   *
   * Throws NullPointerException, with a message naming the uri and its
   * via, if the cache has no host for the uri or the host has no address.
   */
  private String getHostAddress(CrawlURI curi) {
      CrawlHost h = getController().getServerCache().getHostFor(curi);
      if (h == null) {
          throw new NullPointerException("Crawlhost is null for " + curi + " " +
                  curi.getVia());
      }
      InetAddress a = h.getIP();
      if (a == null) {
          throw new NullPointerException("Address is null for " + curi + " " +
             curi.getVia() + ". Address " +
                 ((h.getIpFetched() == CrawlHost.IP_NEVER_LOOKED_UP) ?
                     "was never looked up." :
                     (System.currentTimeMillis() - h.getIpFetched()) + " ms ago."));
      }
      // Use the reference that was just null-checked instead of asking
      // the host a second time.
      return a.getHostAddress();
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -