📄 luceneindexer.java

📁 外国人写的Perl搜索引擎程序
💻 JAVA
字号:
import org.apache.lucene.index.IndexWriter;import org.apache.lucene.analysis.WhitespaceAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import java.io.File;import java.io.BufferedReader;import java.io.FileReader;import java.text.DecimalFormat;import java.util.Date;import java.util.Vector;import java.util.Collections;import java.util.Arrays;/**  * LuceneIndexer - benchmarking app * usage: java LuceneIndexer [-docs MAX_TO_INDEX] [-reps NUM_REPETITIONS]  *  * Recommended options: -server -Xmx500M -XX:CompileThreshold=100 */public class LuceneIndexer {  static File corpusDir = new File("extracted_corpus");  static File indexDir  = new File("lucene_index");  static String[] fileList;  public static void main (String[] args) throws Exception {    // verify that we're running from the right directory    String curDir = new File(".").getCanonicalPath();    if (!curDir.endsWith("benchmarks"))      throw new Exception("Must be run from benchmarks/ ");    // assemble the sorted list of article files    fileList = buildFileList();    // parse command line args    int maxToIndex = fileList.length; // default: index all docs    int numReps    = 1;               // default: run once    int increment  = 0;    boolean store  = false;    String arg;    int i = 0;    while (i < (args.length - 1) && args[i].startsWith("-")) {      arg = args[i++];      if (arg.equals("-docs"))         maxToIndex = Integer.parseInt(args[i++]);      else if (arg.equals("-reps"))        numReps = Integer.parseInt(args[i++]);      else if (arg.equals("-increment"))        increment = Integer.parseInt(args[i++]);      else if (arg.equals("-store")) {        if (Integer.parseInt(args[i++]) != 0)          store = true;      }      else        throw new Exception("Unknown argument: " + arg);    }    increment = increment == 0 ? maxToIndex + 1 : increment;    // start the output    System.out.println("---------------------------------------------------");    // build the index numReps times, then print a final report    float[] times = new float[numReps];    for (int rep = 1; rep <= numReps; rep++) {      // start the clock and build the index      long start = new Date().getTime();       int numIndexed = buildIndex(fileList, maxToIndex, increment, store);        // stop the clock and print a report      long end = new Date().getTime();      float secs = (float)(end - start) / 1000;      times[rep - 1] = secs;      printInterimReport(rep, secs, numIndexed);    }    printFinalReport(times);  }  // Return a lexically sorted list of all article files from all subdirs.  static String[] buildFileList () throws Exception {    File[] articleDirs = corpusDir.listFiles();    Vector filePaths = new Vector();    for (int i = 0; i < articleDirs.length; i++) {      File[] articles = articleDirs[i].listFiles();      for (int j = 0; j < articles.length; j++) {        String path = articles[j].getPath();        if (path.indexOf("article") == -1)          continue;        filePaths.add(path);      }    }    Collections.sort(filePaths);    return (String[])filePaths.toArray(new String[filePaths.size()]);  }  // Initialize an IndexWriter  static IndexWriter initWriter (int count) throws Exception {    boolean create = count > 0 ? false : true;    IndexWriter writer = new IndexWriter(indexDir,       new WhitespaceAnalyzer(), create);      writer.setMaxBufferedDocs(1000);      writer.setUseCompoundFile(false);          return writer;  }  // Build an index, stopping at maxToIndex docs if maxToIndex > 0.  static int buildIndex (String[] fileList, int maxToIndex, int increment,                         boolean store) throws Exception {    IndexWriter writer = initWriter(0);    int docsSoFar = 0;    while (docsSoFar < maxToIndex) {      for (int i = 0; i < fileList.length; i++) {        // add content to index        File f = new File(fileList[i]);        Document doc = new Document();        BufferedReader br = new BufferedReader(new FileReader(f));            try {          // the title is the first line          String title;          if ( (title = br.readLine()) == null)            throw new Exception("Failed to read title");          Field titleField = new Field("title", title, Field.Store.YES,               Field.Index.TOKENIZED, Field.TermVector.NO);          doc.add(titleField);                // the body is the rest          if (store) {            StringBuffer buf = new StringBuffer();            String str;            while ( (str = br.readLine()) != null )              buf.append( str );            String body = buf.toString();            Field bodyField = new Field("body", body, Field.Store.YES,                 Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS);            doc.add(bodyField);          } else {            Field bodyField = new Field("body", br);            doc.add(bodyField);          }          writer.addDocument(doc);        } finally {          br.close();        }        docsSoFar++;        if (docsSoFar >= maxToIndex)          break;        if (docsSoFar % increment == 0) {          writer.close();          writer = initWriter(docsSoFar);        }      }    }    // finish index    int numIndexed = writer.docCount();    writer.optimize();    writer.close();        return numIndexed;  }  // Print out stats for one run.  private static void printInterimReport(int rep, float secs,                                          int numIndexed) {    DecimalFormat secsFormat = new DecimalFormat("#,##0.00");    String secString = secsFormat.format(secs);    System.out.println(rep + "   Secs: " + secString +                        "  Docs: " + numIndexed);  }  // Print out aggregate stats  private static void printFinalReport(float[] times) {    // produce mean and truncated mean    Arrays.sort(times);    float meanTime = 0.0f;    float truncatedMeanTime = 0.0f;    int numToChop = times.length >> 2;    int numKept = 0;    for (int i = 0; i < times.length; i++) {        meanTime += times[i];        // discard fastest 25% and slowest 25% of reps        if (i < numToChop || i >= (times.length - numToChop))            continue;        truncatedMeanTime += times[i];        numKept++;    }    meanTime /= times.length;    truncatedMeanTime /= numKept;    int numDiscarded = times.length - numKept;    DecimalFormat format = new DecimalFormat("#,##0.00");    String meanString = format.format(meanTime);    String truncatedMeanString = format.format(truncatedMeanTime);    // get the Lucene version    Package lucenePackage = org.apache.lucene.LucenePackage.get();    String luceneVersion = lucenePackage.getSpecificationVersion();    System.out.println("---------------------------------------------------");    System.out.println("Lucene " +  luceneVersion);    System.out.println("JVM " + System.getProperty("java.version") +                       " (" + System.getProperty("java.vendor") + ")");    System.out.println(System.getProperty("os.name") + " " +                        System.getProperty("os.version") + " " +                       System.getProperty("os.arch"));    System.out.println("Mean: " + meanString + " secs");    System.out.println("Truncated mean (" +                        numKept + " kept, " +                        numDiscarded + " discarded): " +                         truncatedMeanString + " secs");    System.out.println("---------------------------------------------------");  }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -