⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 PruneIndexTool.java

📁 nutch0.8源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
            if (!dryrun) {
              // Forcibly remove a stale Lucene write lock so the index can be opened.
              IndexReader.unlock(dir);
              if (LOG.isDebugEnabled()) {
                LOG.debug(" - had to unlock index in " + dir);
              }
            }
          }
          IndexReader r = IndexReader.open(dir);
          indexes.add(r);
          numIdx++;
        } catch (Exception e) {
          // Unreadable/corrupt index directory: warn and keep going with the rest.
          if (LOG.isWarnEnabled()) {
            LOG.warn(dr + "Invalid index in " + indexDirs[i] + " - skipping...");
          }
        }
      }
      if (indexes.size() == 0) throw new Exception("No input indexes.");
      // Present all successfully opened indexes as one logical index.
      IndexReader[] readers = (IndexReader[])indexes.toArray(new IndexReader[0]);
      reader = new MultiReader(readers);
    }
    if (LOG.isInfoEnabled()) {
      LOG.info(dr + "Opened " + numIdx + " index(es) with total " +
               reader.numDocs() + " documents.");
    }
    searcher = new IndexSearcher(reader);
  }

  /**
   * This class collects all matching document IDs in a BitSet.
   * <p>NOTE: the reason to use this API is that the most common way of
   * performing Lucene queries (Searcher.search(Query)::Hits) does NOT
   * return all matching documents, because it skips very low scoring hits.</p>
   *
   * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
   */
  private static class AllHitsCollector extends HitCollector {
    // One bit per Lucene document id; a set bit means "matched a query".
    private BitSet bits;

    public AllHitsCollector(BitSet bits) {
      this.bits = bits;
    }
    // Invoked by the searcher for every hit, regardless of score.
    public void collect(int doc, float score) {
      bits.set(doc);
    }
  }

  /**
   * For each query, find all matching documents and delete them from all input
   * indexes. Optionally, an additional check can be performed by using {@link PruneChecker}
   * implementations.
*/  public void run() {    BitSet bits = new BitSet(reader.maxDoc());    AllHitsCollector ahc = new AllHitsCollector(bits);    boolean doDelete = false;    for (int i = 0; i < queries.length; i++) {      if (LOG.isInfoEnabled()) {        LOG.info(dr + "Processing query: " + queries[i].toString());      }      bits.clear();      try {        searcher.search(queries[i], ahc);      } catch (IOException e) {        if (LOG.isWarnEnabled()) {          LOG.warn(dr + " - failed: " + e.getMessage());        }        continue;      }      if (bits.cardinality() == 0) {        if (LOG.isInfoEnabled()) {          LOG.info(dr + " - no matching documents.");        }        continue;      }      if (LOG.isInfoEnabled()) {        LOG.info(dr + " - found " + bits.cardinality() + " document(s).");      }      // Now delete all matching documents      int docNum = -1, start = 0, cnt = 0;      // probably faster than looping sequentially through all index values?      while ((docNum = bits.nextSetBit(start)) != -1) {        // don't delete the same document multiple times        if (reader.isDeleted(docNum)) continue;        try {          if (checkers != null && checkers.length > 0) {            boolean check = true;            for (int k = 0; k < checkers.length; k++) {              // fail if any checker returns false              check &= checkers[k].isPrunable(queries[i], reader, docNum);            }            doDelete = check;          } else doDelete = true;          if (doDelete) {            if (!dryrun) reader.deleteDocument(docNum);            cnt++;          }        } catch (Exception e) {          if (LOG.isWarnEnabled()) {            LOG.warn(dr + " - failed to delete doc #" + docNum);          }        }        start = docNum + 1;      }      if (LOG.isInfoEnabled()) {        LOG.info(dr + " - deleted " + cnt + " document(s).");      }    }    // close checkers    if (checkers != null) {      for (int i = 0; i < checkers.length; i++) {        checkers[i].close();   
   }    }    try {      reader.close();    } catch (IOException e) {      if (LOG.isWarnEnabled()) {        LOG.warn(dr + "Exception when closing reader(s): " + e.getMessage());      }    }  }    public static void main(String[] args) throws Exception {    if (args.length == 0) {      usage();      if (LOG.isFatalEnabled()) { LOG.fatal("Missing arguments"); }      return;    }    File idx = new File(args[0]);    if (!idx.isDirectory()) {      usage();      if (LOG.isFatalEnabled()) { LOG.fatal("Not a directory: " + idx); }      return;    }    Vector paths = new Vector();    if (IndexReader.indexExists(idx)) {      paths.add(idx);    } else {      // try and see if there are segments inside, with index dirs      File[] dirs = idx.listFiles(new FileFilter() {        public boolean accept(File f) {          return f.isDirectory();        }      });      if (dirs == null || dirs.length == 0) {        usage();        if (LOG.isFatalEnabled()) { LOG.fatal("No indexes in " + idx); }        return;      }      for (int i = 0; i < dirs.length; i++) {        File sidx = new File(dirs[i], "index");        if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {          paths.add(sidx);        }      }      if (paths.size() == 0) {        usage();        if (LOG.isFatalEnabled()) {          LOG.fatal("No indexes in " + idx + " or its subdirs.");        }        return;      }    }    File[] indexes = (File[])paths.toArray(new File[0]);    boolean force = false;    boolean dryrun = false;    String qPath = null;    String outPath = null;    String fList = null;    for (int i = 1; i < args.length; i++) {      if (args[i].equals("-force")) {        force = true;      } else if (args[i].equals("-queries")) {        qPath = args[++i];      } else if (args[i].equals("-output")) {        outPath = args[++i];      } else if (args[i].equals("-showfields")) {        fList = args[++i];      } else if (args[i].equals("-dryrun")) {        dryrun = true;      } else {   
     usage();        if (LOG.isFatalEnabled()) {          LOG.fatal("Unrecognized option: " + args[i]);        }        return;      }    }    Vector cv = new Vector();    if (fList != null) {      StringTokenizer st = new StringTokenizer(fList, ",");      Vector tokens = new Vector();      while (st.hasMoreTokens()) tokens.add(st.nextToken());      String[] fields = (String[])tokens.toArray(new String[0]);      PruneChecker pc = new PrintFieldsChecker(System.out, fields);      cv.add(pc);    }        if (outPath != null) {      StoreUrlsChecker luc = new StoreUrlsChecker(new File(outPath), false);      cv.add(luc);    }    PruneChecker[] checkers = null;    if (cv.size() > 0) {      checkers = (PruneChecker[])cv.toArray(new PruneChecker[0]);    }    Query[] queries = null;    InputStream is = null;    if (qPath != null) {      is = new FileInputStream(qPath);    } else {        Configuration conf = NutchConfiguration.create();        qPath = conf.get("prune.index.tool.queries");        is = conf.getConfResourceAsInputStream(qPath);    }    if (is == null) {      if (LOG.isFatalEnabled()) {        LOG.fatal("Can't load queries from " + qPath);      }      return;    }    try {      queries = parseQueries(is);    } catch (Exception e) {      if (LOG.isFatalEnabled()) {        LOG.fatal("Error parsing queries: " + e.getMessage());      }      return;    }    try {      PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, force, dryrun);      pit.run();    } catch (Exception e) {      if (LOG.isFatalEnabled()) {        LOG.fatal("Error running PruneIndexTool: " + e.getMessage());      }      return;    }  }    /**   * Read a list of Lucene queries from the stream (UTF-8 encoding is assumed).   * There should be a single Lucene query per line. Blank lines and comments   * starting with '#' are allowed.   
* <p>NOTE: you may wish to use {@link org.apache.nutch.searcher.Query#main(String[])}   * method to translate queries from Nutch format to Lucene format.</p>   * @param is InputStream to read from   * @return array of Lucene queries   * @throws Exception   */  public static Query[] parseQueries(InputStream is) throws Exception {    BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8"));    String line = null;    QueryParser qp = new QueryParser("url", new WhitespaceAnalyzer());    Vector queries = new Vector();    while ((line = br.readLine()) != null) {      line = line.trim();      //skip blanks and comments      if (line.length() == 0 || line.charAt(0) == '#') continue;      Query q = qp.parse(line);      queries.add(q);    }    return (Query[])queries.toArray(new Query[0]);  }    private static void usage() {    System.err.println("PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]");    System.err.println("\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n");    System.err.println("\t-dryrun\t\t\tdon't do anything, just show what would be done.");    System.err.println("\t-force\t\t\tforce index unlock, if locked. Use with caution!");    System.err.println("\t-queries filename\tread pruning queries from this file, instead of the");    System.err.println("\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n");    System.err.println("\t-output filename\tstore pruned URLs in a text file");    System.err.println("\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields.");    System.err.println("\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude.");    System.err.println("\t\t\t\tNOTE 2: only values of stored fields will be shown.");  }}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -