📄 webdbinjector.java

📁 爬虫数据的改进,并修正了一些bug
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
         * From time to time the Parser will set the "current location"
         * by calling this function.  It's useful for emitting locations
         * for error messages.
         */
        public void setDocumentLocator(Locator locator) {
            // Remember the locator handed over by the parser; fatalError()
            // reads it to report the last known line and column.
            location = locator;
        }


        //
        // Interface ErrorHandler
        //

        /**
         * Reports a parse error: logs the exception at SEVERE level and
         * prints its stack trace to standard output.
         */
        public void error(SAXParseException spe) {
            final String description = spe.toString();
            final String detail = spe.getMessage();
            LOG.severe("Error: " + description + ": " + detail);
            spe.printStackTrace(System.out);
        }

        /**
         * Emit the exception message, with line numbers.
         *
         * Logs the fatal parse error at SEVERE level, then the last position
         * known from the stored document locator (if one was ever set), and
         * finally prints the stack trace to standard output.
         *
         * @param spe the fatal parse exception being reported
         */
        public void fatalError(SAXParseException spe) {
            LOG.severe("Fatal error: " + spe.toString() + ": " + spe.getMessage());
            // The locator is only assigned in setDocumentLocator(). If a fatal
            // error arrives before that happened, reporting it must not fail
            // with a NullPointerException of its own.
            if (location != null) {
                LOG.severe("Last known line is " + location.getLineNumber() + ", column " + location.getColumnNumber());
            } else {
                LOG.severe("Last known line is unavailable (no document locator was set)");
            }
            spe.printStackTrace(System.out);
        }
        
        /**
         * Reports a parser warning: logs the exception at WARNING level and
         * prints its stack trace to standard output.
         */
        public void warning(SAXParseException spe) {
            final String description = spe.toString();
            final String detail = spe.getMessage();
            LOG.warning("Warning: " + description + ": " + detail);
            spe.printStackTrace(System.out);
        }
    }

    // Writer that receives every injected page (see addPage()) and is
    // closed by close().
    private IWebDBWriter dbWriter;

    /**
     * WebDBInjector takes a reference to a WebDBWriter that it should add to.
     *
     * @param dbWriter the writer that injected pages are added to; this
     *                 injector closes it when {@link #close()} is called
     */
    public WebDBInjector(IWebDBWriter dbWriter) {
        this.dbWriter = dbWriter;
    }

    /**
     * Closes the underlying database writer. According to the original
     * note on this method, closing the writer is what saves the changes.
     *
     * @throws IOException if the writer fails to close
     */
    public void close() throws IOException {
        this.dbWriter.close();
    }

    /**
     * Prints a minimal progress indicator based on the current page count.
     * A dot goes to standard output whenever the count is a multiple of
     * <code>small</code>; the full statistics line is emitted through
     * printStatus() whenever the count is a multiple of <code>big</code>.
     */
    public void printStatusBar(int small, int big){
        final boolean atSmallStep = (pages % small) == 0;
        if (atSmallStep) {
            System.out.print(".");
        }
        final boolean atBigStep = (pages % big) == 0;
        if (atBigStep) {
            printStatus();
        }
    }

    // Wall-clock time in milliseconds captured when this injector is
    // created; printStatus() measures elapsed time from it.
    long startTime = System.currentTimeMillis();
    // Running page count, incremented as pages are added (e.g. by
    // injectURLFile()); drives the status bar and the rate computation.
    long pages = 0;
    // Value passed to the Page constructor for every injected URL. The
    // inject methods overwrite it with the input file's last-modified time.
    long nextFetch = System.currentTimeMillis();

    /**
     * Utility to present performance stats.
     *
     * Logs the number of pages counted so far together with the average
     * rate in pages per second since this injector was created. Nothing is
     * logged while the page count is still zero.
     */
    public void printStatus(){
        if (this.pages == 0) {
            return;
        }
        long elapsed = (System.currentTimeMillis() - this.startTime);
        // Clamp to at least one millisecond: an elapsed time of zero (call
        // within the same millisecond as construction) would divide by zero,
        // and a negative one (system clock moved backwards) would report a
        // negative rate.
        if (elapsed < 1) {
            elapsed = 1;
        }
        LOG.info("\t" + this.pages + "\t" + 
                 (int)((1000 *  pages)/elapsed) + " pages/second\t" );
    }

    /**
     * Iterate through all the items in this flat text file and
     * add them to the db.
     *
     * Each line is trimmed and handed to addPage(); blank lines are
     * skipped. The page counter and the status bar advance only when a
     * page was actually added. Any exception raised while processing is
     * logged and its stack trace printed; it is not rethrown.
     *
     * @param urlList text file with one URL per line; its last-modified
     *                time becomes the nextFetch value used for the pages
     * @throws IOException if the file cannot be opened or closed
     */
    public void injectURLFile(File urlList) throws IOException {
        nextFetch = urlList.lastModified();
        // NOTE(review): FileReader decodes with the platform default charset,
        // whereas the topic file is read explicitly as UTF-8 -- confirm
        // whether URL lists should be read as UTF-8 as well.
        BufferedReader reader = new BufferedReader(new FileReader(urlList));
        try {
            String curStr = null; 
            LOG.info("Starting URL processing");
            while ((curStr = reader.readLine()) != null) {
                String url = curStr.trim();
                // A blank line carries no URL; do not push it through the
                // filter and the Page constructor.
                if (url.length() == 0) {
                    continue;
                }
                if (addPage(url)) {
                    this.pages++;
                    // Report progress only when the count has changed.
                    // Doing it for every input line printed repeated dots
                    // while rejected URLs left the count on a multiple
                    // (including zero).
                    printStatusBar(2000,50000);
                }
            }
            LOG.info("Added " + pages + " pages");
        } catch (Exception e) {
          LOG.severe("error while injecting:" + e);
          e.printStackTrace();
        } finally {
          reader.close();
        }
    }

    /**
     * Iterate through all the items in this structured DMOZ file.
     * Add each URL to the web db.
     *
     * The file is read as UTF-8 through an XMLCharFilter and parsed with a
     * SAX parser; an RDFProcessor acts as both content handler and error
     * handler. The selection arguments are passed to the RDFProcessor
     * unchanged. If parsing fails, the error is logged and the JVM is
     * terminated with a non-zero exit status.
     *
     * @param dmozFile        the DMOZ file to parse; its last-modified time
     *                        becomes the nextFetch value used for the pages
     * @param subsetDenom     forwarded to RDFProcessor (the command line
     *                        calls it the subset denominator)
     * @param includeAdult    forwarded to RDFProcessor
     * @param includeDmozDesc forwarded to RDFProcessor
     * @param skew            forwarded to RDFProcessor
     * @param topicPattern    forwarded to RDFProcessor; may be null when no
     *                        topics were selected
     */
    public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew, Pattern topicPattern) throws IOException, SAXException, ParserConfigurationException {
        nextFetch = dmozFile.lastModified();

        SAXParserFactory parserFactory = SAXParserFactory.newInstance();
        SAXParser parser = parserFactory.newSAXParser();
        XMLReader reader = parser.getXMLReader();

        // Create our own processor to receive SAX events
        RDFProcessor rp =
          new RDFProcessor(reader, subsetDenom, includeAdult, includeDmozDesc, skew, topicPattern);
        reader.setContentHandler(rp);
        reader.setErrorHandler(rp);
        LOG.info("skew = " + rp.hashSkew);

        //
        // Open filtered text stream.  The XMLCharFilter makes sure that
        // only appropriate XML-approved characters are received.
        // Any non-conforming characters are silently skipped.
        //
        XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
        try {
            InputSource is = new InputSource(in);
            reader.parse(is);
        } catch (Exception e) {
            LOG.severe(e.toString());
            e.printStackTrace(System.out);
            // Exit with a non-zero status so that calling scripts can tell
            // a failed injection from a successful one.
            System.exit(1);
        } finally {
            in.close();
        }
    }

    /**
     * Runs the URL through the configured URL filter and, if the filter
     * lets it through, adds a new page for it to the db unless one is
     * already present.
     *
     * @return true if the page was handed to the writer; false if the
     *         filter rejected the URL or the URL was malformed
     */
    private boolean addPage(String url) throws IOException {
      final String filtered = URLFilterFactory.getFilter().filter(url);
      if (filtered == null) {
        return false;
      }
      try {
        dbWriter.addPageIfNotPresent(new Page(filtered, NEW_INJECTED_PAGE_SCORE, nextFetch));
      } catch (MalformedURLException e) {
        LOG.warning("bad url: "+filtered);
        return false;
      }
      return true;
    }

    /**
     * Reads the given file as UTF-8 and appends every line to
     * <code>topics</code>. If reading fails, the error is logged and the
     * JVM is terminated with a non-zero exit status.
     *
     * @param topicFile path of the file holding one topic per line
     * @param topics    vector that receives the lines, in file order
     */
    private static void addTopicsFromFile(String topicFile, Vector topics) throws IOException {
      BufferedReader in = null;
      try {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8"));
        String line = null;
        while ((line = in.readLine()) != null) {
          // readLine() already returns a fresh String; no copy needed.
          topics.addElement(line);
        }
      } 
      catch (Exception e) {
        LOG.severe(e.toString());
        e.printStackTrace(System.out);
        // Exit with a non-zero status so that a failure is visible to
        // the caller of the program.
        System.exit(1);
      } finally {
        // The reader is still null if the file could not be opened.
        if (in != null) {
          in.close();
        }
      }
    }
    

    /**
     * Command-line access.  User may add URLs via a flat text file
     * or the structured DMOZ file.  By default, we ignore Adult
     * material (as categorized by DMOZ).
     *
     * Prints the usage line and returns when fewer than three arguments
     * are given. Otherwise obtains the file system from
     * NutchFileSystem.parseArgs, reads the db directory, parses the
     * remaining options, and runs the selected injection. The injector and
     * the file system are closed in all cases.
     *
     * @throws IllegalArgumentException if an option that needs a value is
     *         the last argument
     */
    public static void main(String argv[]) throws Exception {
      if (argv.length < 3) {
        System.out.println("Usage: WebDBInjector (-local | -ndfs <namenode:port>) <db_dir> (-urlfile <url_file> | -dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-noDmozDesc] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
        return;
      }

      //
      // Parse the command line, figure out what kind of
      // URL file we need to load
      //
      int subsetDenom = 1;
      int skew = 0;
      String command = null, loadfile = null;
      boolean includeAdult = false, includeDmozDesc = true;
      Pattern topicPattern = null; 
      Vector topics = new Vector(); 

      // i is passed by value, so it is still 0 after parseArgs returns and
      // argv[0] is read as the db directory below.
      // NOTE(review): this presumably relies on parseArgs removing the
      // file-system selector from argv -- confirm against NutchFileSystem.
      int i = 0;
      NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
      try {
          File root = new File(argv[i++]);

          for (; i < argv.length; i++) {
              if ("-urlfile".equals(argv[i]) || 
                  "-dmozfile".equals(argv[i])) {
                  command = argv[i];
                  loadfile = optionValue(argv, i);
                  i++;
              } else if ("-includeAdultMaterial".equals(argv[i])) {
                  includeAdult = true;
              } else if ("-noDmozDesc".equals(argv[i])) {
                  includeDmozDesc = false;
              } else if ("-subset".equals(argv[i])) {
                  subsetDenom = Integer.parseInt(optionValue(argv, i));
                  i++;
              } else if ("-topic".equals(argv[i])) {
                  topics.addElement(optionValue(argv, i)); 
                  i++;
              } else if ("-topicFile".equals(argv[i])) {
                  addTopicsFromFile(optionValue(argv, i), topics);
                  i++;
              } else if ("-skew".equals(argv[i])) {
                  skew = Integer.parseInt(optionValue(argv, i));
                  i++;
              }
          }

          //
          // Create the webdbWriter, the injector, and then inject the
          // right kind of URL file.
          //
          IWebDBWriter writer = new WebDBWriter(nfs, root);
          WebDBInjector injector = new WebDBInjector(writer);
          try {
              if ("-urlfile".equals(command)) {
                  if (!topics.isEmpty()) {
                      System.out.println("You can't select URLs based on a topic when using a URL-file");
                  }
                  injector.injectURLFile(new File(loadfile));
              } else if ("-dmozfile".equals(command)) {
                  if (!topics.isEmpty()) {
                      String regExp = buildTopicRegExp(topics);
                      LOG.info("Topic selection pattern = " + regExp);
                      topicPattern = Pattern.compile(regExp); 
                  }
                  injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, includeDmozDesc, skew, topicPattern);
              } else {
                  System.out.println("No command indicated.");
                  return;
              }
          } finally {
              injector.close();
          }
      } finally {
          nfs.close();
      }
    }

    /**
     * Returns the value following the option at position <code>i</code>,
     * failing with a clear message instead of an index or null-pointer
     * error when the option is the last argument.
     */
    private static String optionValue(String argv[], int i) {
      if (i + 1 >= argv.length || argv[i + 1] == null) {
        throw new IllegalArgumentException("Missing value for option " + argv[i]);
      }
      return argv[i + 1];
    }

    /**
     * Builds the topic selection expression "^(t1|t2|...).*" from a
     * non-empty vector of topic strings.
     */
    private static String buildTopicRegExp(Vector topics) {
      StringBuffer regExp = new StringBuffer("^(");
      for (int j = 0; j < topics.size(); j++) {
        if (j > 0) {
          regExp.append("|");
        }
        regExp.append((String) topics.get(j));
      }
      regExp.append(").*");
      return regExp.toString();
    }
}
上一页 12
💿 文件大小 146 K
👤 上传用户 beixinning
📂 所属分类其他
🏷️ 相关标签

#bug #数据 #正
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -