/*
 * Created on Nov 2, 2004
 * Author: Andrzej Bialecki <ab@getopt.org>
 *
 */
package net.nutch.tools;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.BitSet;
import java.util.StringTokenizer;
import java.util.Vector;
import java.util.logging.Logger;
import net.nutch.indexer.IndexSegment;
import net.nutch.io.UTF8;
import net.nutch.util.LogFormatter;
import net.nutch.util.NutchConf;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * This tool prunes existing Nutch indexes of unwanted content. The main method
 * accepts a list of segment directories (containing indexes). These indexes are
 * pruned of any content that matches one or more queries from a list of Lucene
 * queries read from a file (defined in the standard config file, or explicitly
 * overridden on the command line). Segments should already be indexed; segments
 * that are missing indexes are skipped.
 *
 * <p>NOTE 1: Queries are expressed in Lucene's QueryParser syntax, so knowledge
 * of the available Lucene document fields is required. This can be obtained by
 * reading the sources of the <code>index-basic</code> and <code>index-more</code>
 * plugins, or by using tools like <a href="http://www.getopt.org/luke">Luke</a>.
 * During query parsing a WhitespaceAnalyzer is used - this choice minimizes the
 * side effects of the Analyzer on the final set of query terms. You can use the
 * {@link net.nutch.searcher.Query#main(String[])} method to translate queries in
 * Nutch syntax to queries in Lucene syntax.<br>
 * If an additional level of control is required, an instance of {@link PruneChecker}
 * can be provided to check each document before it is deleted. The results of all
 * checkers are logically AND-ed, which means that any checker in the chain
 * can veto the deletion of the current document. Two example checker implementations
 * are provided: PrintFieldsChecker prints the values of selected index fields, and
 * StoreUrlsChecker stores the URLs of deleted documents to a file. Either of them
 * can be activated by providing the respective command-line options.
 * </p>
 * <p>The typical command-line usage is as follows:<br>
 * <blockquote>
 * <code>PruneIndexTool index_dir -dryrun -queries queries.txt -showfields url,title</code><br>
 * This command just prints out the selected fields of matching documents.<br>
 * <code>PruneIndexTool index_dir -queries queries.txt</code><br>
 * This command actually removes all matching entries, according to the
 * queries read from the <code>queries.txt</code> file.
 * </blockquote></p>
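 * <p>As an illustrative sketch only (the exact file format is determined by the
 * query-loading code, and the field names below are examples based on the standard
 * <code>index-basic</code> plugin, which may not match your index schema), such a
 * query file could list one Lucene query per line, e.g.:<br>
 * <blockquote><code>
 * host:www.example.com<br>
 * url:"http://www.example.com/private/"
 * </code></blockquote></p>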
 * <p>NOTE 2: This tool removes matching documents ONLY from segment indexes (or
 * from a merged index). In particular it does NOT remove the pages and links
 * from WebDB. This means that unwanted URLs may pop up again when new segments
 * are created. To prevent this, use your own {@link net.nutch.net.URLFilter},
 * or PruneDBTool (under construction...).</p>
 * <p>NOTE 3: This tool uses a low-level Lucene interface to collect all matching
 * documents. For large indexes and broad queries this may result in high memory
 * consumption. If you encounter OutOfMemory exceptions, try to narrow down your
 * queries, or increase the heap size.</p>
 *
 * @author Andrzej Bialecki <ab@getopt.org>
 */
public class PruneIndexTool implements Runnable {
  public static final Logger LOG = LogFormatter.getLogger("net.nutch.tools.PruneIndexTool");
  /** Log the progress every LOG_STEP number of processed documents. */
  public static int LOG_STEP = 50000;
  /**
   * This interface can be used to implement additional checks on matching
   * documents.
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static interface PruneChecker {
    /**
     * Check whether this document should be pruned. NOTE: this method
     * MUST NOT modify the IndexReader.
     * @param q the pruning query that matched this document
     * @param reader the index reader to read documents from
     * @param docNum the document ID
     * @return true if the document should be deleted, false otherwise.
     */
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception;
    /**
     * Close the checker - this could involve flushing output files or other cleanup.
     */
    public void close();
  }
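  /**
   * Illustrative sketch only (not part of the original tool): a minimal custom
   * checker that vetoes the deletion of documents whose "url" field contains a
   * given substring. The field name "url" and the veto policy are assumptions
   * chosen for the example; because checker results are AND-ed, returning false
   * here prevents the document from being deleted.
   */
  public static class KeepUrlSubstringChecker implements PruneChecker {
    private String keepPattern = null;
    public KeepUrlSubstringChecker(String keepPattern) {
      this.keepPattern = keepPattern;
    }
    public void close() {
      // nothing to clean up
    }
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
      Document doc = reader.document(docNum);
      String url = doc.get("url");
      // veto the deletion if the URL contains the protected substring
      if (url != null && url.indexOf(keepPattern) != -1) return false;
      return true;
    }
  }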
  /**
   * This checker prints out selected field values from each
   * matching document, just before it is deleted.
   *
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static class PrintFieldsChecker implements PruneChecker {
    private PrintStream ps = null;
    private String[] fields = null;
    /**
     * @param ps an instance of PrintStream to print the information to
     * @param fields a list of Lucene index field names. Values from these
     * fields will be printed for every matching document.
     */
    public PrintFieldsChecker(PrintStream ps, String[] fields) {
      this.ps = ps;
      this.fields = fields;
    }
    public void close() {
      ps.flush();
    }
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
      Document doc = reader.document(docNum);
      StringBuffer sb = new StringBuffer("#" + docNum + ":");
      for (int i = 0; i < fields.length; i++) {
        String[] values = doc.getValues(fields[i]);
        sb.append(" " + fields[i] + "=");
        if (values != null) {
          for (int k = 0; k < values.length; k++) {
            sb.append("[" + values[k] + "]");
          }
        } else sb.append("[null]");
      }
      ps.println(sb.toString());
      return true;
    }
  }
  /**
   * This checker stores the URL of each document to be deleted
   * in a text file.
   *
   * @author Andrzej Bialecki <ab@getopt.org>
   */
  public static class StoreUrlsChecker implements PruneChecker {
    private BufferedWriter output = null;
    private boolean storeHomeUrl = false;
    /**
     * Store the list of URLs in a file.
     * @param out the output file
     * @param storeHomeUrl if true, also store the "home" URL (everything up to
     * and including the first slash after the host part) of each deleted document
     * @throws Exception if the output file cannot be opened
     */
    public StoreUrlsChecker(File out, boolean storeHomeUrl) throws Exception {
      this.output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(out), "UTF-8"));
      this.storeHomeUrl = storeHomeUrl;
    }
    public void close() {
      try {
        output.flush();
        output.close();
      } catch (Exception e) {
        LOG.warning("Error closing: " + e.getMessage());
      }
    }
    public boolean isPrunable(Query q, IndexReader reader, int docNum) throws Exception {
      Document doc = reader.document(docNum);
      String url = doc.get("url");
      output.write(url); output.write('\n');
      if (storeHomeUrl) {
        // store also the main url
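        // e.g. for "http://www.example.com/some/page.html" this also
        // writes "http://www.example.com/" on its own line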
        int idx = url.indexOf("://");
        if (idx != -1) {
          idx = url.indexOf('/', idx + 3);
          if (idx != -1) {
            output.write(url.substring(0, idx + 1) + "\n");
          }
        }
      }
      return true;
    }
  }
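  /**
   * Illustrative sketch only (not part of the original tool): checkers are
   * typically combined into an array and passed to the PruneIndexTool
   * constructor. The output file name below is a hypothetical example, and
   * StoreUrlsChecker's constructor declares <code>throws Exception</code>,
   * so callers must handle or declare it.
   */
  private static PruneChecker[] buildExampleCheckers() throws Exception {
    return new PruneChecker[] {
      new PrintFieldsChecker(System.out, new String[] {"url", "title"}),
      new StoreUrlsChecker(new File("pruned-urls.txt"), false)
    };
  }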
  private Query[] queries = null;
  private IndexReader reader = null;
  private IndexSearcher searcher = null;
  private PruneChecker[] checkers = null;
  private boolean dryrun = false;
  private String dr = "";
  /**
   * Create an instance of the tool, and open all input indexes.
   * @param indexDirs directories with input indexes. At least one valid index must
   * exist; otherwise an Exception is thrown.
   * @param queries pruning queries. Each query is processed in turn, and the
   * length of the array must be at least one; otherwise an Exception is thrown.
   * @param checkers if not null, they will be used to perform additional
   * checks on matching documents - each checker's {@link PruneChecker#isPrunable(Query, IndexReader, int)}
   * method is called in turn for each matching document, and a return value of true
   * means that the document should be deleted. A logical AND is performed on the
   * results returned by all checkers (which means that if one of them returns
   * false, the document will not be deleted).
   * @param unlock if true, and if any of the input indexes is locked, forcibly
   * unlock it. Use with care, and only when you are sure that no other process
   * modifies the index at the same time.
   * @param dryrun if true, don't change the index, just show what would be done.
   * If false, perform all actions, changing indexes as needed. Note: dryrun doesn't
   * prevent PruneCheckers from performing changes or causing any other side effects.
   * @throws Exception
   */
  public PruneIndexTool(File[] indexDirs, Query[] queries, PruneChecker[] checkers,
          boolean unlock, boolean dryrun) throws Exception {
    if (indexDirs == null || queries == null)
      throw new Exception("Invalid arguments.");
    if (indexDirs.length == 0 || queries.length == 0)
      throw new Exception("Nothing to do.");
    this.queries = queries;
    this.checkers = checkers;
    this.dryrun = dryrun;
    if (dryrun) dr = "[DRY RUN] ";
    int numIdx = 0;
    if (indexDirs.length == 1) {
      Directory dir = FSDirectory.getDirectory(indexDirs[0], false);
      if (IndexReader.isLocked(dir)) {
        if (!unlock) {
          throw new Exception("Index " + indexDirs[0] + " is locked.");
        }
        if (!dryrun) {
          IndexReader.unlock(dir);
          LOG.fine(" - had to unlock index in " + dir);
        }
      }
      reader = IndexReader.open(dir);
      numIdx = 1;
    } else {
      Directory dir;
      Vector indexes = new Vector(indexDirs.length);
      for (int i = 0; i < indexDirs.length; i++) {
        try {
          dir = FSDirectory.getDirectory(indexDirs[i], false);
          if (IndexReader.isLocked(dir)) {