⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fetcher.java

📁 爬虫数据的改进,并修正了一些bug
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.fetcher;

import java.io.IOException;
import java.io.File;
import java.util.Properties;

import net.nutch.pagedb.FetchListEntry;
import net.nutch.io.*;
import net.nutch.db.*;
import net.nutch.fs.*;
import net.nutch.util.*;
import net.nutch.protocol.*;
import net.nutch.parse.*;
import net.nutch.plugin.*;

import java.util.logging.*;

/**
 * The fetcher. Most of the work is done by plugins.
 *
 * <p>
 * Note by John Xing: As of 20041022, option -noParsing is introduced.
 * Without this option, fetcher behaves the old way, i.e., it not only
 * crawls but also parses content. With option -noParsing, fetcher
 * does crawl only. Use ParseSegment.java to parse fetched contents.
 * Check FetcherOutput.java and ParseSegment.java for further description.
 */
public class Fetcher {

  // Shared logger for the fetcher and all of its worker threads.
  public static final Logger LOG =
    LogFormatter.getLogger("net.nutch.fetcher.Fetcher");

  static {
    // "fetcher.verbose" raises the log level to FINE for per-page detail.
    // NOTE(review): setLogLevel is not visible in this chunk -- presumably
    // a static helper declared later in the file; confirm it exists.
    if (NutchConf.getBoolean("fetcher.verbose", false)) {
      setLogLevel(Level.FINE);
    }
  }

  private ArrayFile.Reader fetchList;             // the input: fetch list entries
  private ArrayFile.Writer fetcherWriter;         // the output: FetcherOutput records
  private ArrayFile.Writer contentWriter;         // raw page content
  private ArrayFile.Writer parseTextWriter;       // extracted text (parsing mode only)
  private ArrayFile.Writer parseDataWriter;       // parse metadata (parsing mode only)

  private String name;                            // name of the segment
  private long start;                             // start time of fetcher run
  private long bytes;                             // total bytes fetched
  private int pages;                              // total pages fetched
  private int errors;                             // total pages errored

  // Whether to parse content in-line; false = crawl only (-noParsing),
  // leaving parsing to a later ParseSegment pass.
  private boolean parsing = true;                 // whether do parsing

  private int threadCount =                       // max number of threads
    NutchConf.getInt("fetcher.threads.fetch", 10);

  // All threads (FetcherThread or thread started by it) belong to
  // group "fetcher". Each FetcherThread is named as "fetcherXX",
  // where XX is the order it's started.
  private static final String THREAD_GROUP_NAME = "fetcher";

  private ThreadGroup group = new ThreadGroup(THREAD_GROUP_NAME); // our group

  // count of FetcherThreads that are through the loop and just about to return;
  // guarded by synchronized (Fetcher.this)
  private int atCompletion = 0;

  /********************************************
   * Fetcher thread
   ********************************************/
  private class FetcherThread extends Thread {

    /**
     * Creates a worker thread in the shared "fetcher" thread group.
     * @param name thread name, conventionally "fetcherXX" where XX is
     *             the start order (see THREAD_GROUP_NAME comment above)
     */
    public FetcherThread(String name) { super(group, name); }

    /**
     * Main fetch loop.  Repeatedly grabs the next entry off the shared
     * fetch list (the reader is presumed safe for concurrent next() calls
     * -- TODO confirm ArrayFile.Reader's thread-safety), checks whether
     * the URL should be downloaded, and if so fetches it and hands the
     * result to handleFetch()/handleNoFetch().  The loop exits when the
     * list is exhausted or any thread has logged a SEVERE error.  The
     * last thread to finish shuts down the plugin repository.
     */
    public void run() {

      FetchListEntry fle = new FetchListEntry();  // reused for every entry

      while (true) {
        if (LogFormatter.hasLoggedSevere())       // something bad happened
          break;                                  // exit
        
        String url = null;
        try {

          if (fetchList.next(fle) == null)        // list exhausted
            break;

          url = fle.getPage().getURL().toString();

          if (!fle.getFetch()) {                  // should we fetch this page?
            if (LOG.isLoggable(Level.FINE))
              LOG.fine("not fetching " + url);
            handleNoFetch(fle, FetcherOutput.SUCCESS);
            continue;
          }

          LOG.info("fetching " + url);            // fetch the page

          Protocol protocol = ProtocolFactory.getProtocol(url);
          Content content = protocol.getContent(url);

          handleFetch(url, fle, content);

          synchronized (Fetcher.this) {           // update shared status counters
            pages++;
            bytes += content.getContent().length;
            if ((pages % 100) == 0) {             // show status every 100pp
              status();
            }
          }
        } catch (ResourceGone e) {                // permanently gone: don't retry
          logError(url, fle, e);
          handleNoFetch(fle, FetcherOutput.NOT_FOUND);

        // dealt with in handleFetch() below
        //} catch (ParseException e) {              // don't retry
        //  logError(url, fle, e);
        //  handleNoFetch(fle, FetcherOutput.CANT_PARSE);

        } catch (RetryLater e) {                  // explicit retry
          logError(url, fle, e);
          handleNoFetch(fle, FetcherOutput.RETRY);

        } catch (ProtocolException e) {           // implicit retry
          logError(url, fle, e);
          handleNoFetch(fle, FetcherOutput.RETRY);

        } catch (Throwable t) {                   // an unchecked exception
          if (fle != null) {
            logError(url, fle, t);                // record it and retry later
            handleNoFetch(fle, FetcherOutput.RETRY);
          }
        }
      }

      // Explicitly invoke shutDown() for all possible plugins.
      // Done by the last FetcherThread to finish.
      synchronized (Fetcher.this) {
        atCompletion++;
        if (atCompletion == threadCount) {
          try {
            // NOTE(review): explicit finalize() is unusual; presumably it
            // triggers plugin shutdown hooks -- confirm PluginRepository's
            // contract before changing this.
            PluginRepository.getInstance().finalize();
          } catch (java.lang.Throwable t) {
            // best-effort shutdown: any failure here is deliberately ignored
          }
        }
      }
      return;
    }

    /**
     * Records one failed fetch: bumps the shared error counter and logs
     * the failure (full stack trace only at FINE level).
     */
    private void logError(String url, FetchListEntry fle, Throwable t) {
      synchronized (Fetcher.this) {               // counter shared across threads
        errors++;
      }
      final String message = "fetch of " + url + " failed with: " + t;
      LOG.info(message);
      LOG.log(Level.FINE, "stack", t);            // stack trace for verbose runs
    }

    /**
     * Writes out the result of a successful download.  In crawl-only mode
     * (-noParsing) the raw content is stored with no parse output.  In
     * parsing mode the content is parsed in-line; if parsing fails, the
     * content is still saved (with status CANT_PARSE and empty parse
     * records) so a later pass can retry with a better parser.
     */
    private void handleFetch(String url, FetchListEntry fle, Content content) {
      if (!Fetcher.this.parsing) {                // crawl-only mode
        FetcherOutput fo = new FetcherOutput(
            fle, MD5Hash.digest(content.getContent()), FetcherOutput.SUCCESS);
        outputPage(fo, content, null, null);
        return;
      }

      try {
        Parser parser = ParserFactory.getParser(content.getContentType(), url);
        Parse parse = parser.getParse(content);
        FetcherOutput fo = new FetcherOutput(
            fle, MD5Hash.digest(content.getContent()), FetcherOutput.SUCCESS);
        outputPage(fo, content, new ParseText(parse.getText()), parse.getData());
      } catch (ParseException e) {
        // 20041026, xing
        // If fetching succeeds, but parsing fails, content should be saved
        // so that we can try to parse again in separate pass, possibly
        // using better/alternative parser.
        LOG.info("fetch okay, but can't parse " + url + ", reason: "
          + e.getMessage());
        FetcherOutput fo = new FetcherOutput(
            fle, MD5Hash.digest(content.getContent()), FetcherOutput.CANT_PARSE);
        outputPage(fo, content, new ParseText(""),
                   new ParseData("", new Outlink[0], new Properties()));
      }
    }

    /**
     * Writes a placeholder record for a page that was not downloaded
     * (skipped, gone, or queued for retry).  The content is empty and the
     * hash is taken over the URL itself; in parsing mode, empty parse
     * records are emitted alongside, otherwise none.
     */
    private void handleNoFetch(FetchListEntry fle, int status) {
      String url = fle.getPage().getURL().toString();
      MD5Hash hash = MD5Hash.digest(url);

      FetcherOutput fo = new FetcherOutput(fle, hash, status);
      Content empty = new Content(url, url, new byte[0], "", new Properties());

      ParseText text = null;
      ParseData data = null;
      if (Fetcher.this.parsing) {                 // parse writers expect a record
        text = new ParseText("");
        data = new ParseData("", new Outlink[0], new Properties());
      }

      outputPage(fo, empty, text, data);
    }
      
    private void outputPage(FetcherOutput fo, Content content,
                            ParseText text, ParseData parseData) {
      try {
        synchronized (fetcherWriter) {
          fetcherWriter.append(fo);
          contentWriter.append(content);
          if (Fetcher.this.parsing) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -