📄 luceneindexer.java
字号:
import org.apache.lucene.index.IndexWriter;import org.apache.lucene.analysis.WhitespaceAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import java.io.File;import java.io.BufferedReader;import java.io.FileReader;import java.text.DecimalFormat;import java.util.Date;import java.util.Vector;import java.util.Collections;import java.util.Arrays;/** * LuceneIndexer - benchmarking app * usage: java LuceneIndexer [-docs MAX_TO_INDEX] [-reps NUM_REPETITIONS] * * Recommended options: -server -Xmx500M -XX:CompileThreshold=100 */public class LuceneIndexer { static File corpusDir = new File("extracted_corpus"); static File indexDir = new File("lucene_index"); static String[] fileList; public static void main (String[] args) throws Exception { // verify that we're running from the right directory String curDir = new File(".").getCanonicalPath(); if (!curDir.endsWith("benchmarks")) throw new Exception("Must be run from benchmarks/ "); // assemble the sorted list of article files fileList = buildFileList(); // parse command line args int maxToIndex = fileList.length; // default: index all docs int numReps = 1; // default: run once int increment = 0; boolean store = false; String arg; int i = 0; while (i < (args.length - 1) && args[i].startsWith("-")) { arg = args[i++]; if (arg.equals("-docs")) maxToIndex = Integer.parseInt(args[i++]); else if (arg.equals("-reps")) numReps = Integer.parseInt(args[i++]); else if (arg.equals("-increment")) increment = Integer.parseInt(args[i++]); else if (arg.equals("-store")) { if (Integer.parseInt(args[i++]) != 0) store = true; } else throw new Exception("Unknown argument: " + arg); } increment = increment == 0 ? maxToIndex + 1 : increment; // start the output System.out.println("---------------------------------------------------"); // build the index numReps times, then print a final report float[] times = new float[numReps]; for (int rep = 1; rep <= numReps; rep++) { // start the clock and build the index long start = new Date().getTime(); int numIndexed = buildIndex(fileList, maxToIndex, increment, store); // stop the clock and print a report long end = new Date().getTime(); float secs = (float)(end - start) / 1000; times[rep - 1] = secs; printInterimReport(rep, secs, numIndexed); } printFinalReport(times); } // Return a lexically sorted list of all article files from all subdirs. static String[] buildFileList () throws Exception { File[] articleDirs = corpusDir.listFiles(); Vector filePaths = new Vector(); for (int i = 0; i < articleDirs.length; i++) { File[] articles = articleDirs[i].listFiles(); for (int j = 0; j < articles.length; j++) { String path = articles[j].getPath(); if (path.indexOf("article") == -1) continue; filePaths.add(path); } } Collections.sort(filePaths); return (String[])filePaths.toArray(new String[filePaths.size()]); } // Initialize an IndexWriter static IndexWriter initWriter (int count) throws Exception { boolean create = count > 0 ? false : true; IndexWriter writer = new IndexWriter(indexDir, new WhitespaceAnalyzer(), create); writer.setMaxBufferedDocs(1000); writer.setUseCompoundFile(false); return writer; } // Build an index, stopping at maxToIndex docs if maxToIndex > 0. static int buildIndex (String[] fileList, int maxToIndex, int increment, boolean store) throws Exception { IndexWriter writer = initWriter(0); int docsSoFar = 0; while (docsSoFar < maxToIndex) { for (int i = 0; i < fileList.length; i++) { // add content to index File f = new File(fileList[i]); Document doc = new Document(); BufferedReader br = new BufferedReader(new FileReader(f)); try { // the title is the first line String title; if ( (title = br.readLine()) == null) throw new Exception("Failed to read title"); Field titleField = new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO); doc.add(titleField); // the body is the rest if (store) { StringBuffer buf = new StringBuffer(); String str; while ( (str = br.readLine()) != null ) buf.append( str ); String body = buf.toString(); Field bodyField = new Field("body", body, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); doc.add(bodyField); } else { Field bodyField = new Field("body", br); doc.add(bodyField); } writer.addDocument(doc); } finally { br.close(); } docsSoFar++; if (docsSoFar >= maxToIndex) break; if (docsSoFar % increment == 0) { writer.close(); writer = initWriter(docsSoFar); } } } // finish index int numIndexed = writer.docCount(); writer.optimize(); writer.close(); return numIndexed; } // Print out stats for one run. private static void printInterimReport(int rep, float secs, int numIndexed) { DecimalFormat secsFormat = new DecimalFormat("#,##0.00"); String secString = secsFormat.format(secs); System.out.println(rep + " Secs: " + secString + " Docs: " + numIndexed); } // Print out aggregate stats private static void printFinalReport(float[] times) { // produce mean and truncated mean Arrays.sort(times); float meanTime = 0.0f; float truncatedMeanTime = 0.0f; int numToChop = times.length >> 2; int numKept = 0; for (int i = 0; i < times.length; i++) { meanTime += times[i]; // discard fastest 25% and slowest 25% of reps if (i < numToChop || i >= (times.length - numToChop)) continue; truncatedMeanTime += times[i]; numKept++; } meanTime /= times.length; truncatedMeanTime /= numKept; int numDiscarded = times.length - numKept; DecimalFormat format = new DecimalFormat("#,##0.00"); String meanString = format.format(meanTime); String truncatedMeanString = format.format(truncatedMeanTime); // get the Lucene version Package lucenePackage = org.apache.lucene.LucenePackage.get(); String luceneVersion = lucenePackage.getSpecificationVersion(); System.out.println("---------------------------------------------------"); System.out.println("Lucene " + luceneVersion); System.out.println("JVM " + System.getProperty("java.version") + " (" + System.getProperty("java.vendor") + ")"); System.out.println(System.getProperty("os.name") + " " + System.getProperty("os.version") + " " + System.getProperty("os.arch")); System.out.println("Mean: " + meanString + " secs"); System.out.println("Truncated mean (" + numKept + " kept, " + numDiscarded + " discarded): " + truncatedMeanString + " secs"); System.out.println("---------------------------------------------------"); }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -