⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 segmentreader.java

📁 爬虫数据的改进,并修正了一些bug
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
    if (parseTextReader != null) {
      if (pt != null) {
        if (parseTextReader.get(n, pt) == null) valid = false;
      } else parseTextReader.seek(n);
    }
    if (parseDataReader != null) {
      if (pd != null) {
        if (parseDataReader.get(n, pd) == null) valid = false;
      } else parseDataReader.seek(n);
    }
    key = n;
    return valid;
  }
  
  // Scratch holders: next() passes these to the readers when the caller
  // supplies null for the corresponding argument.
  private ParseText _pt = new ParseText();
  private ParseData _pd = new ParseData();
  
  /**
   * Advance every open reader by one record.
   * <p>
   * A null storage argument does not skip the read: if the matching reader is
   * open, its next() is still invoked (with an internal scratch object) so
   * that all streams move forward together.
   *
   * @param pt object handed to the parse-text reader's next(); may be null
   * @param pd object handed to the parse-data reader's next(); may be null
   * @return true if no open reader returned null, false otherwise
   * @throws IOException if an underlying reader fails
   */
  public synchronized boolean next(ParseText pt, ParseData pd) throws IOException {
    // Substitute the scratch instances for missing caller-supplied storage.
    ParseText textTarget = pt;
    if (textTarget == null) {
      textTarget = _pt;
    }
    ParseData dataTarget = pd;
    if (dataTarget == null) {
      dataTarget = _pd;
    }

    // Evaluated as two separate statements so both readers are always advanced.
    boolean textOk = (parseTextReader == null) || (parseTextReader.next(textTarget) != null);
    boolean dataOk = (parseDataReader == null) || (parseDataReader.next(dataTarget) != null);

    key++;
    return textOk && dataOk;
  }
  
  /**
   * Move every open reader to the given position and record it as the
   * current key.
   *
   * @param n position passed to each open reader's seek()
   * @throws IOException if an underlying reader fails to seek
   */
  public synchronized void seek(long n) throws IOException {
    if (parseTextReader != null) {
      parseTextReader.seek(n);
    }
    if (parseDataReader != null) {
      parseDataReader.seek(n);
    }
    this.key = n;
  }

  /**
   * Report the key position most recently recorded by this reader.
   *
   * @return the current value of the key field
   */
  public long key() {
    return this.key;
  }

  /**
   * Invoke reset() on every open reader. The key field is not modified here.
   *
   * @throws IOException if an underlying reader fails to reset
   */
  public synchronized void reset() throws IOException {
    if (parseTextReader != null) {
      parseTextReader.reset();
    }
    if (parseDataReader != null) {
      parseDataReader.reset();
    }
  }

  /**
   * Close every open reader on a best-effort basis: a failure while closing
   * one reader is ignored so that the other reader is still closed and this
   * method never throws.
   */
  public synchronized void close() {
    if (parseTextReader != null) {
      try {
        parseTextReader.close();
      } catch (Exception ignored) {
        // deliberately ignored: closing is best-effort
      }
    }
    if (parseDataReader != null) {
      try {
        parseDataReader.close();
      } catch (Exception ignored) {
        // deliberately ignored: closing is best-effort
      }
    }
  }
  
  /**
   * Dump the segment's content in human-readable format.
   * @param sorted if true, sort segment entries by URL (ascending). If false,
   * output entries in the order they occur in the segment.
   * @param output where to dump to
   * @throws Exception
   */
  /****
  public synchronized void dump(boolean sorted, PrintStream output) throws Exception {
    reset();
    FetcherOutput fo = new FetcherOutput();
    Content co = new Content();
    ParseData pd = new ParseData();
    ParseText pt = new ParseText();
    long recNo = 0L;
    if (!sorted) {
      while(next(pt, pd)) {
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
    } else {
      File unsortedFile = new File(segmentDir, ".unsorted");
      File sortedFile = new File(segmentDir, ".sorted");
      nfs.delete(unsortedFile);
      nfs.delete(sortedFile);
      SequenceFile.Writer seqWriter = new SequenceFile.Writer(nfs,
              unsortedFile.toString(), UTF8.class, LongWritable.class);
      FetchListEntry fle;
      LongWritable rec = new LongWritable();
      UTF8 url = new UTF8();
      String urlString;
      while (fetcherReader.next(fo) != null) {
        fle = fo.getFetchListEntry();
        urlString = fle.getPage().getURL().toString();
        rec.set(recNo);
        url.set(urlString);
        seqWriter.append(url, rec);
        recNo++;
      }
      seqWriter.close();
      // sort the SequenceFile
      long start = System.currentTimeMillis();

      SequenceFile.Sorter sorter = new SequenceFile.Sorter(nfs,
              new UTF8.Comparator(), LongWritable.class);

      sorter.sort(unsortedFile.toString(), sortedFile.toString());

      float localSecs = (System.currentTimeMillis() - start) / 1000.0f;
      LOG.info(" - sorted: " + recNo + " entries in " + localSecs + "s, "
        + (recNo/localSecs) + " entries/s");

      nfs.delete(unsortedFile);
      SequenceFile.Reader seqReader = new SequenceFile.Reader(nfs, sortedFile.toString());
      while (seqReader.next(url, rec)) {
        recNo = rec.get();
        get(recNo, fo, co, pt, pd);
        output.println("Recno:: " + recNo++);
        output.println("FetcherOutput::\n" + fo.toString());
        if (contentReader != null)
          output.println("Content::\n" + co.toString());
        if (parseDataReader != null)
          output.println("ParseData::\n" + pd.toString());
        if (parseTextReader != null)
          output.println("ParseText::\n" + pt.toString());
        output.println("");
      }
      seqReader.close();
      nfs.delete(sortedFile);
    }
  }
***/
  /**
   * Command-line wrapper. Run without arguments to see usage help.
   * <p>
   * Collects segment directories from the positional arguments and/or from
   * the sub-directories of the <code>-dir</code> argument, opens a
   * SegmentReader on each one, and (with <code>-list</code>) logs the
   * per-segment entry count followed by a grand total. Every reader is closed
   * once it has been inspected.
   *
   * @param args command-line arguments; see usage() for the accepted options
   * @throws Exception if the file system arguments cannot be parsed
   */
  public static void main(String[] args) throws Exception {
    if (args.length == 0) {
      usage();
      return;
    }
    NutchFileSystem nfs = NutchFileSystem.parseArgs(args, 0);
    String segDir = null;
    Vector dirs = new Vector();
    boolean fix = false;
    boolean list = false;
    // -dump, -dumpsort and -nocontent are still recognized so that they are
    // not mistaken for segment directories, but their values are unused while
    // the dump() method is disabled.
    boolean dump = false;
    boolean sorted = false;
    boolean withParseText = true;
    boolean withParseData = true;
    boolean withContent = true;
    for (int i = 0; i < args.length; i++) {
      // Null entries are skipped.
      if (args[i] == null) {
        continue;
      }
      if (args[i].equals("-noparsetext")) {
        withParseText = false;
      } else if (args[i].equals("-noparsedata")) {
        withParseData = false;
      } else if (args[i].equals("-nocontent")) {
        withContent = false;
      } else if (args[i].equals("-fix")) {
        fix = true;
      } else if (args[i].equals("-dump")) {
        dump = true;
      } else if (args[i].equals("-dumpsort")) {
        dump = true;
        sorted = true;
      } else if (args[i].equals("-list")) {
        list = true;
      } else if (args[i].equals("-dir")) {
        // Guard against "-dir" being the last argument, which would otherwise
        // fail with ArrayIndexOutOfBoundsException.
        if (i + 1 >= args.length) {
          LOG.warn("Option -dir requires a directory argument.");
          usage();
          return;
        }
        segDir = args[++i];
      } else {
        dirs.add(new File(args[i]));
      }
    }
    if (segDir != null) {
      File sDir = new File(segDir);
      if (!sDir.exists() || !sDir.isDirectory()) {
        LOG.warn("Invalid path: " + sDir);
      } else {
        // Every sub-directory of the -dir argument is treated as a segment.
        File[] files = sDir.listFiles(new FileFilter() {
          public boolean accept(File f) {
            return f.isDirectory();
          }
        });
        if (files != null && files.length > 0) {
          for (int i = 0; i < files.length; i++) dirs.add(files[i]);
        }
      }
    }
    if (dirs.size() == 0) {
      LOG.warn("No input segment dirs.");
      usage();
      return;
    }
    long total = 0L;
    int cnt = 0;
    DecimalFormat df = new DecimalFormat("########");
    df.setParseIntegerOnly(true);
    // NOTE(review): this header names more columns than the per-segment lines
    // below actually print (count and directory only) - confirm intended layout.
    if (list)
      LOG.info("PARSED?\tSTARTED\t\t\tFINISHED\t\tCOUNT\tDIR NAME");
    for (int i = 0; i < dirs.size(); i++) {
      File dir = (File) dirs.get(i);
      // Scoped per iteration so the finally block only closes the reader
      // opened for this directory.
      SegmentReader reader = null;
      try {
        reader = new SegmentReader(nfs, dir,
              withParseText, withParseData, fix);
        if (list) {
          LOG.info(df.format(reader.size) +
                  "\t" + dir);
        }
        total += reader.size;
        cnt++;
        // Dumping is disabled: the dump() method is commented out.
      } catch (Throwable t) {
        // Include the directory and the throwable itself; getMessage() alone
        // may be null and does not say which segment failed.
        LOG.error("Failed to read segment " + dir + ": " + t);
      } finally {
        if (reader != null) {
          reader.close();
        }
      }
    }
    if (list)
      LOG.info("TOTAL: " + total + " entries in " + cnt + " segments.");
  }
  
  /**
   * Print command-line usage help to standard error, one line per option.
   */
  private static void usage() {
    String[] helpLines = {
      "SegmentReader [-fix] [-dump] [-dumpsort] [-list] [-nocontent] [-noparsedata] [-noparsetext] (-dir segments | seg1 seg2 ...)",
      "\tNOTE: at least one segment dir name is required, or '-dir' option.",
      "\t-fix\t\tautomatically fix corrupted segments",
      "\t-dump\t\tdump segment data in human-readable format",
      "\t-dumpsort\tdump segment data in human-readable format, sorted by URL",
      "\t-list\t\tprint useful information about segments",
      "\t-nocontent\tignore content data",
      "\t-noparsedata\tignore parse_data data",
      // This line was previously mislabelled as "-nocontent".
      "\t-noparsetext\tignore parse_text data",
      "\t-dir segments\tdirectory containing multiple segments",
      "\tseg1 seg2 ...\tsegment directories\n"
    };
    for (int i = 0; i < helpLines.length; i++) {
      System.err.println(helpLines[i]);
    }
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -