// PruneIndexTool.java
if (!unlock) {
LOG.warning(dr + "Index " + indexDirs[i] + " is locked. Skipping...");
continue;
}
if (!dryrun) {
IndexReader.unlock(dir);
LOG.fine(" - had to unlock index in " + dir);
}
}
IndexReader r = IndexReader.open(dir);
indexes.add(r);
numIdx++;
} catch (Exception e) {
LOG.warning(dr + "Invalid index in " + indexDirs[i] + " - skipping...");
}
}
if (indexes.size() == 0) throw new Exception("No input indexes.");
IndexReader[] readers = (IndexReader[])indexes.toArray(new IndexReader[0]);
reader = new MultiReader(readers);
}
LOG.info(dr + "Opened " + numIdx + " index(es) with total " + reader.numDocs() + " documents.");
searcher = new IndexSearcher(reader);
}
/**
 * Gathers the ID of every matching document into a caller-supplied {@link BitSet}.
 * <p>NOTE: the reason to use this API is that the most common way of
 * performing Lucene queries (Searcher.search(Query)::Hits) does NOT
 * return all matching documents, because it skips very low scoring hits.</p>
 *
 * @author Andrzej Bialecki &lt;ab@getopt.org&gt;
 */
private static class AllHitsCollector extends HitCollector {
  /** Target bit set; one bit per matched document ID. */
  private BitSet hits;

  public AllHitsCollector(BitSet bits) {
    hits = bits;
  }

  /** Record the match; the score is irrelevant for pruning purposes. */
  public void collect(int doc, float score) {
    hits.set(doc);
  }
}
/**
 * For each query, find all matching documents and delete them from all input
 * indexes. Optionally, an additional check can be performed by using {@link PruneChecker}
 * implementations: a document is deleted only if ALL checkers approve it.
 * <p>When {@code dryrun} is set, deletions are counted and logged but not applied.</p>
 * <p>Closes all checkers and the index reader before returning.</p>
 */
public void run() {
  BitSet bits = new BitSet(reader.maxDoc());
  AllHitsCollector ahc = new AllHitsCollector(bits);
  boolean doDelete = false;
  for (int i = 0; i < queries.length; i++) {
    LOG.info(dr + "Processing query: " + queries[i].toString());
    bits.clear();
    try {
      searcher.search(queries[i], ahc);
    } catch (IOException e) {
      LOG.warning(dr + " - failed: " + e.getMessage());
      continue;
    }
    if (bits.cardinality() == 0) {
      LOG.info(dr + " - no matching documents.");
      continue;
    }
    LOG.info(dr + " - found " + bits.cardinality() + " document(s).");
    // Now delete all matching documents.
    int cnt = 0;
    // BUGFIX: the old while-loop advanced 'start' at the bottom of the body, so the
    // 'continue' taken for already-deleted documents skipped the advance and the loop
    // spun forever on the same bit. Advancing in the for-header makes that impossible.
    for (int docNum = bits.nextSetBit(0); docNum >= 0; docNum = bits.nextSetBit(docNum + 1)) {
      // don't delete the same document multiple times
      if (reader.isDeleted(docNum)) continue;
      try {
        if (checkers != null && checkers.length > 0) {
          boolean check = true;
          for (int k = 0; k < checkers.length; k++) {
            // fail if any checker returns false
            check &= checkers[k].isPrunable(queries[i], reader, docNum);
          }
          doDelete = check;
        } else doDelete = true;
        if (doDelete) {
          if (!dryrun) reader.delete(docNum);
          cnt++;
        }
      } catch (Exception e) {
        LOG.warning(dr + " - failed to delete doc #" + docNum);
      }
    }
    LOG.info(dr + " - deleted " + cnt + " document(s).");
  }
  // close checkers
  if (checkers != null) {
    for (int i = 0; i < checkers.length; i++) {
      checkers[i].close();
    }
  }
  try {
    reader.close();
  } catch (IOException e) {
    LOG.warning(dr + "Exception when closing reader(s): " + e.getMessage());
  }
}
/**
 * Command-line entry point: validates arguments, discovers one index (or the
 * indexes inside a segments directory), builds the checker chain, loads pruning
 * queries, and runs the tool.
 *
 * @param args first argument is the index/segments directory; see {@link #usage()}
 *             for the supported options
 * @throws Exception on unrecoverable I/O errors while reading queries
 */
public static void main(String[] args) throws Exception {
  if (args.length == 0) {
    usage();
    LOG.severe("Missing arguments");
    return;
  }
  File idx = new File(args[0]);
  if (!idx.isDirectory()) {
    usage();
    LOG.severe("Not a directory: " + idx);
    return;
  }
  Vector paths = new Vector();
  if (IndexReader.indexExists(idx)) {
    paths.add(idx);
  } else {
    // try and see if there are segments inside, with index dirs
    File[] dirs = idx.listFiles(new FileFilter() {
      public boolean accept(File f) {
        return f.isDirectory();
      }
    });
    if (dirs == null || dirs.length == 0) {
      usage();
      LOG.severe("No indexes in " + idx);
      return;
    }
    for (int i = 0; i < dirs.length; i++) {
      File sidx = new File(dirs[i], IndexSegment.IDX_REG_NAME);
      if (sidx.exists() && sidx.isDirectory() && IndexReader.indexExists(sidx)) {
        paths.add(sidx);
      }
    }
    if (paths.size() == 0) {
      usage();
      LOG.severe("No indexes in " + idx + " or its subdirs.");
      return;
    }
  }
  File[] indexes = (File[]) paths.toArray(new File[0]);
  boolean force = false;
  boolean dryrun = false;
  String qPath = null;
  String outPath = null;
  String fList = null;
  for (int i = 1; i < args.length; i++) {
    if (args[i].equals("-force")) {
      force = true;
    } else if (args[i].equals("-queries")) {
      // BUGFIX: guard against a trailing option with no value, which previously
      // threw ArrayIndexOutOfBoundsException instead of printing usage.
      if (++i >= args.length) {
        usage();
        LOG.severe("Missing filename after -queries");
        return;
      }
      qPath = args[i];
    } else if (args[i].equals("-output")) {
      if (++i >= args.length) {
        usage();
        LOG.severe("Missing filename after -output");
        return;
      }
      outPath = args[i];
    } else if (args[i].equals("-showfields")) {
      if (++i >= args.length) {
        usage();
        LOG.severe("Missing field list after -showfields");
        return;
      }
      fList = args[i];
    } else if (args[i].equals("-dryrun")) {
      dryrun = true;
    } else {
      usage();
      LOG.severe("Unrecognized option: " + args[i]);
      return;
    }
  }
  Vector cv = new Vector();
  if (fList != null) {
    StringTokenizer st = new StringTokenizer(fList, ",");
    Vector tokens = new Vector();
    while (st.hasMoreTokens()) tokens.add(st.nextToken());
    String[] fields = (String[]) tokens.toArray(new String[0]);
    PruneChecker pc = new PrintFieldsChecker(System.out, fields);
    cv.add(pc);
  }
  if (outPath != null) {
    StoreUrlsChecker luc = new StoreUrlsChecker(new File(outPath), false);
    cv.add(luc);
  }
  PruneChecker[] checkers = null;
  if (cv.size() > 0) {
    checkers = (PruneChecker[]) cv.toArray(new PruneChecker[0]);
  }
  Query[] queries = null;
  InputStream is = null;
  if (qPath != null) {
    is = new FileInputStream(qPath);
  } else {
    qPath = NutchConf.get("prune.index.tool.queries");
    is = NutchConf.getConfResourceAsInputStream(qPath);
  }
  if (is == null) {
    LOG.severe("Can't load queries from " + qPath);
    return;
  }
  try {
    queries = parseQueries(is);
  } catch (Exception e) {
    LOG.severe("Error parsing queries: " + e.getMessage());
    return;
  } finally {
    // BUGFIX: the query stream was previously leaked.
    try {
      is.close();
    } catch (IOException ignored) {
      // best-effort close; nothing useful to do here
    }
  }
  try {
    // BUGFIX: pass the parsed -force flag instead of the hard-coded 'true',
    // which made index unlocking unconditional regardless of the option.
    PruneIndexTool pit = new PruneIndexTool(indexes, queries, checkers, force, dryrun);
    pit.run();
  } catch (Exception e) {
    LOG.severe("Error running PruneIndexTool: " + e.getMessage());
    return;
  }
}
/**
 * Read a list of Lucene queries from the stream (UTF-8 encoding is assumed).
 * There should be a single Lucene query per line. Blank lines and comments
 * starting with '#' are allowed.
 * <p>NOTE: you may wish to use {@link net.nutch.searcher.Query#main(String[])}
 * method to translate queries from Nutch format to Lucene format.</p>
 * @param is InputStream to read from
 * @return array of Lucene queries
 * @throws Exception
 */
public static Query[] parseQueries(InputStream is) throws Exception {
  BufferedReader in = new BufferedReader(new InputStreamReader(is, "UTF-8"));
  QueryParser parser = new QueryParser("url", new WhitespaceAnalyzer());
  Vector result = new Vector();
  // one query per line; '#' lines and blanks are ignored
  for (String raw = in.readLine(); raw != null; raw = in.readLine()) {
    String trimmed = raw.trim();
    if (trimmed.length() == 0 || trimmed.startsWith("#")) {
      continue;
    }
    result.add(parser.parse(trimmed));
  }
  return (Query[]) result.toArray(new Query[0]);
}
/** Print the command-line synopsis and option descriptions to stderr. */
private static void usage() {
  // Table-driven: one entry per output line, emitted verbatim.
  final String[] lines = {
    "PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]",
    "\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n",
    "\t-dryrun\t\t\tdon't do anything, just show what would be done.",
    "\t-force\t\t\tforce index unlock, if locked. Use with caution!",
    "\t-queries filename\tread pruning queries from this file, instead of the",
    "\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n",
    "\t-output filename\tstore pruned URLs in a text file",
    "\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields.",
    "\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude.",
    "\t\t\t\tNOTE 2: only values of stored fields will be shown."
  };
  for (int i = 0; i < lines.length; i++) {
    System.err.println(lines[i]);
  }
}
}