📄 PruneIndexTool.java
字号:
if (!dryrun) { IndexReader.unlock(dir); if (LOG.isDebugEnabled()) { LOG.debug(" - had to unlock index in " + dir); } } } IndexReader r = IndexReader.open(dir); indexes.add(r); numIdx++; } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn(dr + "Invalid index in " + indexDirs[i] + " - skipping..."); } } } if (indexes.size() == 0) throw new Exception("No input indexes."); IndexReader[] readers = (IndexReader[])indexes.toArray(new IndexReader[0]); reader = new MultiReader(readers); } if (LOG.isInfoEnabled()) { LOG.info(dr + "Opened " + numIdx + " index(es) with total " + reader.numDocs() + " documents."); } searcher = new IndexSearcher(reader); } /** * This class collects all matching document IDs in a BitSet. * <p>NOTE: the reason to use this API is that the most common way of * performing Lucene queries (Searcher.search(Query)::Hits) does NOT * return all matching documents, because it skips very low scoring hits.</p> * * @author Andrzej Bialecki <ab@getopt.org> */ private static class AllHitsCollector extends HitCollector { private BitSet bits; public AllHitsCollector(BitSet bits) { this.bits = bits; } public void collect(int doc, float score) { bits.set(doc); } } /** * For each query, find all matching documents and delete them from all input * indexes. Optionally, an additional check can be performed by using {@link PruneChecker} * implementations. 
*/ public void run() { BitSet bits = new BitSet(reader.maxDoc()); AllHitsCollector ahc = new AllHitsCollector(bits); boolean doDelete = false; for (int i = 0; i < queries.length; i++) { if (LOG.isInfoEnabled()) { LOG.info(dr + "Processing query: " + queries[i].toString()); } bits.clear(); try { searcher.search(queries[i], ahc); } catch (IOException e) { if (LOG.isWarnEnabled()) { LOG.warn(dr + " - failed: " + e.getMessage()); } continue; } if (bits.cardinality() == 0) { if (LOG.isInfoEnabled()) { LOG.info(dr + " - no matching documents."); } continue; } if (LOG.isInfoEnabled()) { LOG.info(dr + " - found " + bits.cardinality() + " document(s)."); } // Now delete all matching documents int docNum = -1, start = 0, cnt = 0; // probably faster than looping sequentially through all index values? while ((docNum = bits.nextSetBit(start)) != -1) { // don't delete the same document multiple times if (reader.isDeleted(docNum)) continue; try { if (checkers != null && checkers.length > 0) { boolean check = true; for (int k = 0; k < checkers.length; k++) { // fail if any checker returns false check &= checkers[k].isPrunable(queries[i], reader, docNum); } doDelete = check; } else doDelete = true; if (doDelete) { if (!dryrun) reader.deleteDocument(docNum); cnt++; } } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn(dr + " - failed to delete doc #" + docNum); } } start = docNum + 1; } if (LOG.isInfoEnabled()) { LOG.info(dr + " - deleted " + cnt + " document(s)."); } } // close checkers if (checkers != null) { for (int i = 0; i < checkers.length; i++) { checkers[i].close(); } } try { reader.close(); } catch (IOException e) { if (LOG.isWarnEnabled()) { LOG.warn(dr + "Exception when closing reader(s): " + e.getMessage()); } } } public static void main(String[] args) throws Exception { if (args.length == 0) { usage(); if (LOG.isFatalEnabled()) { LOG.fatal("Missing arguments"); } return; } File idx = new File(args[0]); if (!idx.isDirectory()) { usage(); if 
(LOG.isFatalEnabled()) { LOG.fatal("Not a directory: " + idx); } return; } Vector paths = new Vector(); if (IndexReader.indexExists(idx)) { paths.add(idx); } else { // try and see if there are segments inside, with index dirs File[] dirs = idx.listFiles(new FileFilter() { public boolean accept(File f) { return f.isDirectory(); } }); if (dirs == null || dirs.length == 0) { usage(); if (LOG.isFatalEnabled()) { LOG.fatal("No indexes in " + idx); } return; } for (int i = 0; i < dirs.length; i++) { File sidx = new File(dirs[i], "index"); if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) { paths.add(sidx); } } if (paths.size() == 0) { usage(); if (LOG.isFatalEnabled()) { LOG.fatal("No indexes in " + idx + " or its subdirs."); } return; } } File[] indexes = (File[])paths.toArray(new File[0]); boolean force = false; boolean dryrun = false; String qPath = null; String outPath = null; String fList = null; for (int i = 1; i < args.length; i++) { if (args[i].equals("-force")) { force = true; } else if (args[i].equals("-queries")) { qPath = args[++i]; } else if (args[i].equals("-output")) { outPath = args[++i]; } else if (args[i].equals("-showfields")) { fList = args[++i]; } else if (args[i].equals("-dryrun")) { dryrun = true; } else { usage(); if (LOG.isFatalEnabled()) { LOG.fatal("Unrecognized option: " + args[i]); } return; } } Vector cv = new Vector(); if (fList != null) { StringTokenizer st = new StringTokenizer(fList, ","); Vector tokens = new Vector(); while (st.hasMoreTokens()) tokens.add(st.nextToken()); String[] fields = (String[])tokens.toArray(new String[0]); PruneChecker pc = new PrintFieldsChecker(System.out, fields); cv.add(pc); } if (outPath != null) { StoreUrlsChecker luc = new StoreUrlsChecker(new File(outPath), false); cv.add(luc); } PruneChecker[] checkers = null; if (cv.size() > 0) { checkers = (PruneChecker[])cv.toArray(new PruneChecker[0]); } Query[] queries = null; InputStream is = null; if (qPath != null) { is = new 
FileInputStream(qPath); } else { Configuration conf = NutchConfiguration.create(); qPath = conf.get("prune.index.tool.queries"); is = conf.getConfResourceAsInputStream(qPath); } if (is == null) { if (LOG.isFatalEnabled()) { LOG.fatal("Can't load queries from " + qPath); } return; } try { queries = parseQueries(is); } catch (Exception e) { if (LOG.isFatalEnabled()) { LOG.fatal("Error parsing queries: " + e.getMessage()); } return; } try { PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, force, dryrun); pit.run(); } catch (Exception e) { if (LOG.isFatalEnabled()) { LOG.fatal("Error running PruneIndexTool: " + e.getMessage()); } return; } } /** * Read a list of Lucene queries from the stream (UTF-8 encoding is assumed). * There should be a single Lucene query per line. Blank lines and comments * starting with '#' are allowed. * <p>NOTE: you may wish to use {@link org.apache.nutch.searcher.Query#main(String[])} * method to translate queries from Nutch format to Lucene format.</p> * @param is InputStream to read from * @return array of Lucene queries * @throws Exception */ public static Query[] parseQueries(InputStream is) throws Exception { BufferedReader br = new BufferedReader(new InputStreamReader(is, "UTF-8")); String line = null; QueryParser qp = new QueryParser("url", new WhitespaceAnalyzer()); Vector queries = new Vector(); while ((line = br.readLine()) != null) { line = line.trim(); //skip blanks and comments if (line.length() == 0 || line.charAt(0) == '#') continue; Query q = qp.parse(line); queries.add(q); } return (Query[])queries.toArray(new Query[0]); } private static void usage() { System.err.println("PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]"); System.err.println("\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n"); System.err.println("\t-dryrun\t\t\tdon't do anything, just show what would be done."); 
System.err.println("\t-force\t\t\tforce index unlock, if locked. Use with caution!"); System.err.println("\t-queries filename\tread pruning queries from this file, instead of the"); System.err.println("\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n"); System.err.println("\t-output filename\tstore pruned URLs in a text file"); System.err.println("\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields."); System.err.println("\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude."); System.err.println("\t\t\t\tNOTE 2: only values of stored fields will be shown."); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -