// PruneIndexTool.java
if (!unlock) {
LOG.warning(dr + "Index " + indexDirs[i] + " is locked. Skipping...");
continue;
}
if (!dryrun) {
IndexReader.unlock(dir);
LOG.fine(" - had to unlock index in " + dir);
}
}
IndexReader r = IndexReader.open(dir);
indexes.add(r);
numIdx++;
} catch (Exception e) {
LOG.warning(dr + "Invalid index in " + indexDirs[i] + " - skipping...");
}
}
if (indexes.size() == 0) throw new Exception("No input indexes.");
IndexReader[] readers = (IndexReader[])indexes.toArray(new IndexReader[0]);
reader = new MultiReader(readers);
}
LOG.info(dr + "Opened " + numIdx + " index(es) with total " + reader.numDocs() + " documents.");
searcher = new IndexSearcher(reader);
}
/**
 * Gathers the ID of every matching document into a caller-supplied {@link BitSet}.
 * <p>NOTE: the reason to use this API is that the most common way of
 * performing Lucene queries (Searcher.search(Query)::Hits) does NOT
 * return all matching documents, because it skips very low scoring hits.</p>
 *
 * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
 */
private static class AllHitsCollector extends HitCollector {
  /** Target bit set; one bit per matched document ID. */
  private BitSet hits;

  public AllHitsCollector(BitSet bits) {
    hits = bits;
  }

  /** Record the match; the score is irrelevant for pruning purposes. */
  public void collect(int doc, float score) {
    hits.set(doc);
  }
}
/**
 * For each query, find all matching documents and delete them from all input
 * indexes. Optionally, an additional check can be performed by using {@link PruneChecker}
 * implementations: a document is deleted only if ALL checkers approve it.
 * <p>When {@code dryrun} is set, deletions are counted and logged but not applied.</p>
 * <p>Closes all checkers and the index reader before returning.</p>
 */
public void run() {
  BitSet bits = new BitSet(reader.maxDoc());
  AllHitsCollector ahc = new AllHitsCollector(bits);
  boolean doDelete = false;
  for (int i = 0; i < queries.length; i++) {
    LOG.info(dr + "Processing query: " + queries[i].toString());
    bits.clear();
    try {
      searcher.search(queries[i], ahc);
    } catch (IOException e) {
      LOG.warning(dr + " - failed: " + e.getMessage());
      continue;
    }
    if (bits.cardinality() == 0) {
      LOG.info(dr + " - no matching documents.");
      continue;
    }
    LOG.info(dr + " - found " + bits.cardinality() + " document(s).");
    // Now delete all matching documents.
    int cnt = 0;
    // BUGFIX: the old while-loop advanced 'start' at the bottom of the body, so the
    // 'continue' taken for already-deleted documents skipped the advance and the loop
    // spun forever on the same bit. Advancing in the for-header makes that impossible.
    for (int docNum = bits.nextSetBit(0); docNum >= 0; docNum = bits.nextSetBit(docNum + 1)) {
      // don't delete the same document multiple times
      if (reader.isDeleted(docNum)) continue;
      try {
        if (checkers != null && checkers.length > 0) {
          boolean check = true;
          for (int k = 0; k < checkers.length; k++) {
            // fail if any checker returns false
            check &= checkers[k].isPrunable(queries[i], reader, docNum);
          }
          doDelete = check;
        } else doDelete = true;
        if (doDelete) {
          if (!dryrun) reader.delete(docNum);
          cnt++;
        }
      } catch (Exception e) {
        LOG.warning(dr + " - failed to delete doc #" + docNum);
      }
    }
    LOG.info(dr + " - deleted " + cnt + " document(s).");
  }
  // close checkers
  if (checkers != null) {
    for (int i = 0; i < checkers.length; i++) {
      checkers[i].close();
    }
  }
  try {
    reader.close();
  } catch (IOException e) {
    LOG.warning(dr + "Exception when closing reader(s): " + e.getMessage());
  }
}
/**
 * Command-line entry point: validates arguments, discovers one index (or the
 * indexes inside a segments directory), builds the checker chain, loads pruning
 * queries, and runs the tool.
 *
 * @param args first argument is the index/segments directory; see {@link #usage()}
 *             for the supported options
 * @throws Exception on unrecoverable I/O errors while reading queries
 */
public static void main(String[] args) throws Exception {
  if (args.length == 0) {
    usage();
    LOG.severe("Missing arguments");
    return;
  }
  File idx = new File(args[0]);
  if (!idx.isDirectory()) {
    usage();
    LOG.severe("Not a directory: " + idx);
    return;
  }
  Vector paths = new Vector();
  if (IndexReader.indexExists(idx)) {
    paths.add(idx);
  } else {
    // try and see if there are segments inside, with index dirs
    File[] dirs = idx.listFiles(new FileFilter() {
      public boolean accept(File f) {
        return f.isDirectory();
      }
    });
    if (dirs == null || dirs.length == 0) {
      usage();
      LOG.severe("No indexes in " + idx);
      return;
    }
    for (int i = 0; i < dirs.length; i++) {
      File sidx = new File(dirs[i], IndexSegment.IDX_REG_NAME);
      if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
        paths.add(sidx);
      }
    }
    if (paths.size() == 0) {
      usage();
      LOG.severe("No indexes in " + idx + " or its subdirs.");
      return;
    }
  }
  File[] indexes = (File[]) paths.toArray(new File[0]);
  boolean force = false;
  boolean dryrun = false;
  String qPath = null;
  String outPath = null;
  String fList = null;
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-force")) {
      force = true;
    } else if (args[i].equals("-queries")) {
      // BUGFIX: guard against a trailing option with no value, which previously
      // threw ArrayIndexOutOfBoundsException instead of printing usage.
      if (++i >= args.length) {
        usage();
        LOG.severe("Missing filename after -queries");
        return;
      }
      qPath = args[i];
    } else if (args[i].equals("-output")) {
      if (++i >= args.length) {
        usage();
        LOG.severe("Missing filename after -output");
        return;
      }
      outPath = args[i];
    } else if (args[i].equals("-showfields")) {
      if (++i >= args.length) {
        usage();
        LOG.severe("Missing field list after -showfields");
        return;
      }
      fList = args[i];
    } else if (args[i].equals("-dryrun")) {
      dryrun = true;
    } else {
      usage();
      LOG.severe("Unrecognized option: " + args[i]);
      return;
    }
  }
  Vector cv = new Vector();
  if (fList != null) {
    StringTokenizer st = new StringTokenizer(fList, ",");
    Vector tokens = new Vector();
    while (st.hasMoreTokens()) tokens.add(st.nextToken());
    String[] fields = (String[]) tokens.toArray(new String[0]);
    PruneChecker pc = new PrintFieldsChecker(System.out, fields);
    cv.add(pc);
  }
  if (outPath != null) {
    StoreUrlsChecker luc = new StoreUrlsChecker(new File(outPath), false);
    cv.add(luc);
  }
  PruneChecker[] checkers = null;
  if (cv.size() > 0) {
    checkers = (PruneChecker[]) cv.toArray(new PruneChecker[0]);
  }
  Query[] queries = null;
  InputStream is = null;
  if (qPath != null) {
    is = new FileInputStream(qPath);
  } else {
    qPath = NutchConf.get("prune.index.tool.queries");
    is = NutchConf.getConfResourceAsInputStream(qPath);
  }
  if (is == null) {
    LOG.severe("Can't load queries from " + qPath);
    return;
  }
  try {
    queries = parseQueries(is);
  } catch (Exception e) {
    LOG.severe("Error parsing queries: " + e.getMessage());
    return;
  } finally {
    // BUGFIX: the query stream was previously leaked.
    try {
      is.close();
    } catch (IOException ignored) {
      // best-effort close; nothing useful to do here
    }
  }
  try {
    // BUGFIX: pass the parsed -force flag instead of the hard-coded 'true',
    // which made index unlocking unconditional regardless of the option.
    PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, force, dryrun);
    pit.run();
  } catch (Exception e) {
    LOG.severe("Error running PruneIndexTool: " + e.getMessage());
    return;
  }
}
/**
 * Read a list of Lucene queries from the stream (UTF-8 encoding is assumed).
 * There should be a single Lucene query per line. Blank lines and comments
 * starting with '#' are allowed.
 * <p>NOTE: you may wish to use {@link net.nutch.searcher.Query#main(String[])}
 * method to translate queries from Nutch format to Lucene format.</p>
 * @param is InputStream to read from
 * @return array of Lucene queries
 * @throws Exception
 */
public static Query[] parseQueries(InputStream is) throws Exception {
  BufferedReader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
  QueryParser parser = new QueryParser("url", new WhitespaceAnalyzer());
  Vector result = new Vector();
  // one query per line; '#' lines and blanks are ignored
  for (String raw = in.readLine(); raw != null; raw = in.readLine()) {
    String trimmed = raw.trim();
    if (trimmed.length() == 0 || trimmed.startsWith("#")) {
      continue;
    }
    result.add(parser.parse(trimmed));
  }
  return (Query[]) result.toArray(new Query[0]);
}
/** Print the command-line synopsis and option descriptions to stderr. */
private static void usage() {
  // Table-driven: one entry per output line, emitted verbatim.
  final String[] lines = {
    "PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]",
    "\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n",
    "\t-dryrun\t\t\tdon't do anything, just show what would be done.",
    "\t-force\t\t\tforce index unlock, if locked. Use with caution!",
    "\t-queries filename\tread pruning queries from this file, instead of the",
    "\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n",
    "\t-output filename\tstore pruned URLs in a text file",
    "\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields.",
    "\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude.",
    "\t\t\t\tNOTE 2: only values of stored fields will be shown."
  };
  for (int i = 0; i < lines.length; i++) {
    System.err.println(lines[i]);
  }
}
}