📄 pruneindextool.java
字号:
/* * Created on Nov 2, 2004 * Author: Andrzej Bialecki <ab@getopt.org> * *//** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.nutch.tools;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.File;import java.io.FileFilter;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.OutputStreamWriter;import java.io.PrintStream;import java.util.BitSet;import java.util.StringTokenizer;import java.util.Vector;// Commons Logging importsimport org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.util.NutchConfiguration;import org.apache.lucene.analysis.WhitespaceAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.MultiReader;import org.apache.lucene.queryParser.QueryParser;import org.apache.lucene.search.HitCollector;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;/** * This tool prunes existing Nutch indexes of unwanted content. The main method * accepts a list of segment directories (containing indexes). These indexes will * be pruned of any content that matches one or more query from a list of Lucene * queries read from a file (defined in standard config file, or explicitly * overridden from command-line). Segments should already be indexed, if some * of them are missing indexes then these segments will be skipped. * * <p>NOTE 1: Queries are expressed in Lucene's QueryParser syntax, so a knowledge * of available Lucene document fields is required. This can be obtained by reading sources * of <code>index-basic</code> and <code>index-more</code> plugins, or using tools * like <a href="http://www.getopt.org/luke">Luke</a>. During query parsing a * WhitespaceAnalyzer is used - this choice has been made to minimize side effects of * Analyzer on the final set of query terms. You can use {@link org.apache.nutch.searcher.Query#main(String[])} * method to translate queries in Nutch syntax to queries in Lucene syntax.<br> * If additional level of control is required, an instance of {@link PruneChecker} can * be provided to check each document before it's deleted. The results of all * checkers are logically AND-ed, which means that any checker in the chain * can veto the deletion of the current document. Two example checker implementations * are provided - PrintFieldsChecker prints the values of selected index fields, * StoreUrlsChecker stores the URLs of deleted documents to a file. Any of them can * be activated by providing respective command-line options. * </p> * <p>The typical command-line usage is as follows:<br> * <blockquote> * <code>PruneIndexTool index_dir -dryrun -queries queries.txt -showfields url,title</code><br> * This command will just print out fields of matching documents.<br> * <code>PruneIndexTool index_dir -queries queries.txt</code><br> * This command will actually remove all matching entries, according to the * queries read from <code>queries.txt</code> file. * </blockquote></p> * <p>NOTE 2: This tool removes matching documents ONLY from segment indexes (or * from a merged index). In particular it does NOT remove the pages and links * from WebDB. This means that unwanted URLs may pop up again when new segments * are created. To prevent this, use your own {@link org.apache.nutch.net.URLFilter}, * or PruneDBTool (under construction...).</p> * <p>NOTE 3: This tool uses a low-level Lucene interface to collect all matching * documents. For large indexes and broad queries this may result in high memory * consumption. If you encounter OutOfMemory exceptions, try to narrow down your * queries, or increase the heap size.</p> * * @author Andrzej Bialecki <ab@getopt.org> */public class PruneIndexTool implements Runnable { public static final Log LOG = LogFactory.getLog(PruneIndexTool.class); /** Log the progress every LOG_STEP number of processed documents. */ public static int LOG_STEP = 50000; /** * This interface can be used to implement additional checking on matching * documents. * @author Andrzej Bialecki <ab@getopt.org> */ public static interface PruneChecker { /** * Check whether this document should be pruned. NOTE: this method * MUST NOT modify the IndexReader. * @param reader index reader to read documents from * @param docNum document ID * @return true if the document should be deleted, false otherwise. */ public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception; /** * Close the checker - this could involve flushing output files or somesuch. */ public void close(); } /** * This checker's main function is just to print out * selected field values from each document, just before * they are deleted. * * @author Andrzej Bialecki <ab@getopt.org> */ public static class PrintFieldsChecker implements PruneChecker { private PrintStream ps = null; private String[] fields = null; /** * * @param ps an instance of PrintStream to print the information to * @param fields a list of Lucene index field names. Values from these * fields will be printed for every matching document. */ public PrintFieldsChecker(PrintStream ps, String[] fields) { this.ps = ps; this.fields = fields; } public void close() { ps.flush(); } public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception { Document doc = reader.document(docNum); StringBuffer sb = new StringBuffer("#" + docNum + ":"); for (int i = 0; i < fields.length; i++) { String[] values = doc.getValues(fields[i]); sb.append(" " + fields[i] + "="); if (values != null) { for (int k = 0; k < values.length; k++) { sb.append("[" + values[k] + "]"); } } else sb.append("[null]"); } ps.println(sb.toString()); return true; } } /** * This checker's main function is just to store * the URLs of each document to be deleted in a text file. * * @author Andrzej Bialecki <ab@getopt.org> */ public static class StoreUrlsChecker implements PruneChecker { private BufferedWriter output = null; private boolean storeHomeUrl = false; /** * Store the list in a file * @param out name of the output file */ public StoreUrlsChecker(File out, boolean storeHomeUrl) throws Exception { this.output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8")); this.storeHomeUrl = storeHomeUrl; } public void close() { try { output.flush(); output.close(); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Error closing: " + e.getMessage()); } } } public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception { Document doc = reader.document(docNum); String url = doc.get("url"); output.write(url); output.write('\n'); if (storeHomeUrl) { // store also the main url int idx = url.indexOf("://"); if (idx != -1) { idx = url.indexOf('/', idx + 3); if (idx != -1) { output.write(url.substring(0, idx + 1) + "\n"); } } } return true; } } private Query[] queries = null; private IndexReader reader = null; private IndexSearcher searcher = null; private PruneChecker[] checkers = null; private boolean dryrun = false; private String dr = ""; /** * Create an instance of the tool, and open all input indexes. * @param indexDirs directories with input indexes. At least one valid index must * exist, otherwise an Exception is thrown. * @param queries pruning queries. Each query will be processed in turn, and the * length of the array must be at least one, otherwise an Exception is thrown. * @param checkers if not null, they will be used to perform additional * checks on matching documents - each checker's method {@link PruneChecker#isPrunable(Query, IndexReader, int)} * will be called in turn, for each matching document, and if it returns true this means that * the document should be deleted. A logical AND is performed on the results returned * by all checkers (which means that if one of them returns false, the document will * not be deleted). * @param unlock if true, and if any of the input indexes is locked, forcibly * unlock it. Use with care, only when you are sure that other processes don't * modify the index at the same time. * @param dryrun if set to true, don't change the index, just show what would be done. * If false, perform all actions, changing indexes as needed. Note: dryrun doesn't prevent * PruneCheckers from performing changes or causing any other side-effects. * @throws Exception */ public PruneIndexTool(File[] indexDirs, Query[] queries, PruneChecker[] checkers, boolean unlock, boolean dryrun) throws Exception { if (indexDirs == null || queries == null) throw new Exception("Invalid arguments."); if (indexDirs.length == 0 || queries.length == 0) throw new Exception("Nothing to do."); this.queries = queries; this.checkers = checkers; this.dryrun = dryrun; if (dryrun) dr = "[DRY RUN] "; int numIdx = 0; if (indexDirs.length == 1) { Directory dir = FSDirectory.getDirectory(indexDirs[0], false); if (IndexReader.isLocked(dir)) { if (!unlock) { throw new Exception("Index " + indexDirs[0] + " is locked."); } if (!dryrun) { IndexReader.unlock(dir); if (LOG.isDebugEnabled()) { LOG.debug(" - had to unlock index in " + dir); } } } reader = IndexReader.open(dir); numIdx = 1; } else { Directory dir; Vector indexes = new Vector(indexDirs.length); for (int i = 0; i < indexDirs.length; i++) { try { dir = FSDirectory.getDirectory(indexDirs[i], false); if (IndexReader.isLocked(dir)) { if (!unlock) { if (LOG.isWarnEnabled()) { LOG.warn(dr + "Index " + indexDirs[i] + " is locked. Skipping..."); } continue; }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -