/*
 * Created on Nov 2, 2004
 * Author: Andrzej Bialecki <ab@getopt.org>
 *
 */
package net.nutch.tools;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.BitSet;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.logging.Logger;
import net.nutch.indexer.IndexSegment;
import net.nutch.io.UTF8;
import net.nutch.util.LogFormatter;
import net.nutch.util.NutchConf;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * This tool prunes existing Nutch indexes of unwanted content. The main method
 * accepts a list of segment directories (containing indexes). These indexes are
 * pruned of any content that matches one or more queries from a list of Lucene
 * queries read from a file (defined in the standard config file, or explicitly
 * overridden on the command line). Segments should already be indexed; segments
 * that are missing indexes are skipped.
 *
 * <p>NOTE 1: Queries are expressed in Lucene's QueryParser syntax, so knowledge
 * of the available Lucene document fields is required. This can be obtained by
 * reading the sources of the <code>index-basic</code> and <code>index-more</code>
 * plugins, or by using tools like <a href="http://www.getopt.org/luke">Luke</a>.
 * During query parsing a WhitespaceAnalyzer is used - this choice minimizes the
 * side effects of the Analyzer on the final set of query terms. You can use the
 * {@link net.nutch.searcher.Query#main(String[])} method to translate queries in
 * Nutch syntax to queries in Lucene syntax.<br>
 * If an additional level of control is required, an instance of {@link PruneChecker}
 * can be provided to check each document before it is deleted. The results of all
 * checkers are logically AND-ed, which means that any checker in the chain
 * can veto the deletion of the current document. Two example checker implementations
 * are provided: PrintFieldsChecker prints the values of selected index fields, and
 * StoreUrlsChecker stores the URLs of deleted documents to a file. Either of them
 * can be activated by providing the respective command-line options.
 * </p>
 * <p>The typical command-line usage is as follows:<br>
 * <blockquote>
 * <code>PruneIndexTool index_dir -dryrun -queries queries.txt -showfields url,title</code><br>
 * This command just prints out the selected fields of matching documents.<br>
 * <code>PruneIndexTool index_dir -queries queries.txt</code><br>
 * This command actually removes all matching entries, according to the
 * queries read from the <code>queries.txt</code> file.
 * </blockquote></p>
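 * <p>As an illustrative sketch only (the exact file format is determined by the
 * query-loading code, and the field names below are examples based on the standard
 * <code>index-basic</code> plugin, which may not match your index schema), such a
 * query file could list one Lucene query per line, e.g.:<br>
 * <blockquote><code>
 * host:www.example.com<br>
 * url:"http://www.example.com/private/"
 * </code></blockquote></p>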
 * <p>NOTE 2: This tool removes matching documents ONLY from segment indexes (or
 * from a merged index). In particular it does NOT remove the pages and links
 * from WebDB. This means that unwanted URLs may pop up again when new segments
 * are created. To prevent this, use your own {@link net.nutch.net.URLFilter},
 * or PruneDBTool (under construction...).</p>
 * <p>NOTE 3: This tool uses a low-level Lucene interface to collect all matching
 * documents. For large indexes and broad queries this may result in high memory
 * consumption. If you encounter OutOfMemory exceptions, try to narrow down your
 * queries, or increase the heap size.</p>
 *
 * @author Andrzej Bialecki <ab@getopt.org>
 */
public class PruneIndexTool implements Runnable {
  public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.PruneIndexTool");
  /** Log the progress every LOG_STEP number of processed documents. */
  public static int LOG_STEP = 50000;
  /**
   * This interface can be used to implement additional checks on matching
   * documents.
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static interface PruneChecker {
    /**
     * Check whether this document should be pruned. NOTE: this method
     * MUST NOT modify the IndexReader.
     * @param q the pruning query that matched this document
     * @param reader the index reader to read documents from
     * @param docNum the document ID
     * @return true if the document should be deleted, false otherwise.
     */
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception;
    /**
     * Close the checker - this could involve flushing output files or other cleanup.
     */
    public void close();
  }
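  /**
   * Illustrative sketch only (not part of the original tool): a minimal custom
   * checker that vetoes the deletion of documents whose "url" field contains a
   * given substring. The field name "url" and the veto policy are assumptions
   * chosen for the example; because checker results are AND-ed, returning false
   * here prevents the document from being deleted.
   */
  public static class KeepUrlSubstringChecker implements PruneChecker {
    private String keepPattern = null;
    public KeepUrlSubstringChecker(String keepPattern) {
      this.keepPattern = keepPattern;
    }
    public void close() {
      // nothing to clean up
    }
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
      Document doc = reader.document(docNum);
      String url = doc.get("url");
      // veto the deletion if the URL contains the protected substring
      if (url != null && url.indexOf(keepPattern) != -1) return false;
      return true;
    }
  }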
  /**
   * This checker prints out selected field values from each
   * matching document, just before it is deleted.
   *
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static class PrintFieldsChecker implements PruneChecker {
    private PrintStream ps = null;
    private String[] fields = null;
    /**
     * @param ps an instance of PrintStream to print the information to
     * @param fields a list of Lucene index field names. Values from these
     * fields will be printed for every matching document.
     */
    public PrintFieldsChecker(PrintStream ps, String[] fields) {
      this.ps = ps;
      this.fields = fields;
    }
    public void close() {
      ps.flush();
    }
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
      Document doc = reader.document(docNum);
      StringBuffer sb = new StringBuffer("#" + docNum + ":");
      for (int i = 0; i < fields.length; i++) {
        String[] values = doc.getValues(fields[i]);
        sb.append(" " + fields[i] + "=");
        if (values != null) {
          for (int k = 0; k < values.length; k++) {
            sb.append("[" + values[k] + "]");
          }
        } else sb.append("[null]");
      }
      ps.println(sb.toString());
      return true;
    }
  }
  /**
   * This checker stores the URL of each document to be deleted
   * in a text file.
   *
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static class StoreUrlsChecker implements PruneChecker {
    private BufferedWriter output = null;
    private boolean storeHomeUrl = false;
    /**
     * Store the list of URLs in a file.
     * @param out the output file
     * @param storeHomeUrl if true, also store the "home" URL (everything up to
     * and including the first slash after the host part) of each deleted document
     * @throws Exception if the output file cannot be opened
     */
    public StoreUrlsChecker(File out, boolean storeHomeUrl) throws Exception {
      this.output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8"));
      this.storeHomeUrl = storeHomeUrl;
    }
    public void close() {
      try {
        output.flush();
        output.close();
      } catch (Exception e) {
        LOG.warning("Error closing: " + e.getMessage());
      }
    }
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
      Document doc = reader.document(docNum);
      String url = doc.get("url");
      output.write(url); output.write('\n');
      if (storeHomeUrl) {
        // store also the main url
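        // e.g. for "http://www.example.com/some/page.html" this also
        // writes "http://www.example.com/" on its own line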
        int idx = url.indexOf("://");
        if (idx != -1) {
          idx = url.indexOf('/', idx + 3);
          if (idx != -1) {
            output.write(url.substring(0, idx + 1) + "\n");
          }
        }
      }
      return true;
    }
  }
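  /**
   * Illustrative sketch only (not part of the original tool): checkers are
   * typically combined into an array and passed to the PruneIndexTool
   * constructor. The output file name below is a hypothetical example, and
   * StoreUrlsChecker's constructor declares <code>throws Exception</code>,
   * so callers must handle or declare it.
   */
  private static PruneChecker[] buildExampleCheckers() throws Exception {
    return new PruneChecker[] {
      new PrintFieldsChecker(System.out, new String[] {"url", "title"}),
      new StoreUrlsChecker(new File("pruned-urls.txt"), false)
    };
  }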
  private Query[] queries = null;
  private IndexReader reader = null;
  private IndexSearcher searcher = null;
  private PruneChecker[] checkers = null;
  private boolean dryrun = false;
  private String dr = "";
  /**
   * Create an instance of the tool, and open all input indexes.
   * @param indexDirs directories with input indexes. At least one valid index must
   * exist; otherwise an Exception is thrown.
   * @param queries pruning queries. Each query is processed in turn, and the
   * length of the array must be at least one; otherwise an Exception is thrown.
   * @param checkers if not null, they will be used to perform additional
   * checks on matching documents - each checker's {@link PruneChecker#isPrunable(Query, IndexReader, int)}
   * method is called in turn for each matching document, and a return value of true
   * means that the document should be deleted. A logical AND is performed on the
   * results returned by all checkers (which means that if one of them returns
   * false, the document will not be deleted).
   * @param unlock if true, and if any of the input indexes is locked, forcibly
   * unlock it. Use with care, and only when you are sure that no other process
   * modifies the index at the same time.
   * @param dryrun if true, don't change the index, just show what would be done.
   * If false, perform all actions, changing indexes as needed. Note: dryrun doesn't
   * prevent PruneCheckers from performing changes or causing any other side effects.
   * @throws Exception
   */
  public PruneIndexTool(File[] indexDirs, Query[] queries, PruneChecker[] checkers,
          boolean unlock, boolean dryrun) throws Exception {
    if (indexDirs == null || queries == null)
      throw new Exception("Invalid arguments.");
    if (indexDirs.length == 0 || queries.length == 0)
      throw new Exception("Nothing to do.");
    this.queries = queries;
    this.checkers = checkers;
    this.dryrun = dryrun;
    if (dryrun) dr = "[DRY RUN] ";
    int numIdx = 0;
    if (indexDirs.length == 1) {
      Directory dir = FSDirectory.getDirectory(indexDirs[0], false);
      if (IndexReader.isLocked(dir)) {
        if (!unlock) {
          throw new Exception("Index " + indexDirs[0] + " is locked.");
        }
        if (!dryrun) {
          IndexReader.unlock(dir);
          LOG.fine(" - had to unlock index in " + dir);
        }
      }
      reader = IndexReader.open(dir);
      numIdx = 1;
    } else {
      Directory dir;
      Vector indexes = new Vector(indexDirs.length);
      for (int i = 0; i < indexDirs.length; i++) {
        try {
          dir = FSDirectory.getDirectory(indexDirs[i], false);
          if (IndexReader.isLocked(dir)) {