experiment.java

来自「一个使用的搜索引擎」· Java 代码 · 共 256 行
JAVA
256 行
package ir.eval;import java.io.*;import java.util.*;import java.lang.*;import ir.utilities.*;import ir.vsr.*;/** * Contains methods for running evaluation experiments for information * retrieval, specifically the generation of recall-precision curves * for a given test corpus of query/relevant-documents pairs. * * @author Ray Mooney */public class Experiment {    /** The standard recall levels for which we want to plot precision values */    public static final double[] RECALL_LEVELS = {0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0};    /** The directory from which the indexed documents come. */    public File corpusDir = null;    /** The file with the list of queries and results to be tested.      * Assumes this file consists of 3 lines for each query:     * 1) A line of text for the query.     * 2) A line of filenames from corpusDir that are relevant to this     *    query, filenames must be separated by a space.     * 3) A blank line as a separator from the next query. */    public File queryFile = null;    /** The output file where final recall/precision result data is printed. */    public File outFile = null;    /** The inverted index created for the corpus */    InvertedIndex index = null;    /* List of recall-precision data for each query, where data for each     * query is an ArrayList of RecallPrecisionPair's for each possible     * recall point that can be determined from the ranked retrievals */    ArrayList rpResults = new ArrayList();    /* List of interpolatedPrecision values for each query, where each element     * is a double[] of interpolated precision values for each of the     * standard recall levels in RECALL_LEVELS. See textbook for details. */    ArrayList interpolatedPrecisions = new ArrayList();    /* Array of final average precision values for each of the     * standard recall levels in RECALL_LEVELS */    double[]  averagePrecisions = null;    /** Create an Experiment object for generating Recall/Precision curves     * @param corpusDir The directory of files to index.     * @param queryFile The file of query/relevant-docs pairs to evaluate.     * @param outFile File for output precision/recall data.     * @param docType The type of documents to index (See docType in DocumentIterator).     * @param stem Whether tokens should be stemmed with Porter stemmer.     */    public Experiment(File corpusDir, File queryFile, File outFile, short docType, boolean stem) 	throws IOException { 	this.corpusDir = corpusDir;	this.index = new InvertedIndex(corpusDir, docType, stem, false);	this.queryFile = queryFile;	this.outFile = outFile;    }    /** Process and evaluate all queries and generate recall-precision curve */    public void makeRpCurve() throws IOException {	processQueries();	// Use rpResults generated to interpolate a precision values for	// each standard recall level for each query and store results in 	// interpolatedPrecisions	for(int i = 0; i < rpResults.size(); i++) {	    interpolatedPrecisions.add(interpolatePrecision((ArrayList)rpResults.get(i)));	}	// Compute the average precision values	averagePrecisions = MoreMath.averageVectors(interpolatedPrecisions);	System.out.println("\nAverage Interpolated Precisions:");	MoreMath.printVector(averagePrecisions);	System.out.println("");	// Write results to output file and Gnuplot file for graphing	writeRpCurve();	graphRpCurve();    }	    /* Process each query in the queryFile and store evaluated results     * in rpResults      */    void processQueries() throws IOException {	BufferedReader in = new BufferedReader(new FileReader(queryFile));	while(processQuery(in));	in.close();	// System.out.println("\n" + rpResults);    }    /* Process the next query read from the query file reader and evaluate     * results compared to known relevant docs also read from the query file.     * @return true if query succesfully read, else false if no more queries     * in query file      */    boolean processQuery(BufferedReader in) throws IOException {	String query = in.readLine();   // get the query	if (query == null) return false;  // return false if end of file	System.out.println("\nQuery " + (rpResults.size() + 1) + ": " + query);	// Process the query and get the ranked retrievals	Retrieval[] retrievals = index.retrieve(query);	System.out.println("Returned " + retrievals.length + " documents.");	// Read the known relevant docs from query file and parse them	// into an ArrayList of String's of relevant file names.	String line = in.readLine();	ArrayList correctRetrievals = MoreString.segment(line,' ');	System.out.println(correctRetrievals.size() + " truly relevant documents.");	// Generate Recall/Precision points and save in rpResults	rpResults.add(evalRetrievals(retrievals, correctRetrievals));	// Read the blank line delimiter between queries in the query file	line = in.readLine();	if (! (line == null || line.trim().equals(""))) {	    System.out.println("\nCould not find blank line after query, bad queryFile format");	    System.exit(1);	}	return true;    }    /* Compare retrieved docs to relevant docs and compute recall/precision     * points.  Goes down ranked retrievals in order, stopping at each     * relevant document and computing a RecallPrecisionPair for thresholding     * at that point.     * @return An ArrayList of RecallPrecisionPair's     */    ArrayList evalRetrievals(Retrieval[] retrievals, ArrayList correctRetrievals) {	ArrayList rpList = new ArrayList();	// Number of correctly retrieved docs at any given point	double goodRetrievals = 0; 	// Examine each ranked retrieval in order to compute rp pairs	for(int i = 0; i < retrievals.length; i++) {	    // Current number of retrievals considered	    int numRetrieved = i + 1;	    // Check if this retrieval is in the list of relevant docs	    if (correctRetrievals.contains(retrievals[i].docRef.file.getName())) {		goodRetrievals++;  // This is a relevant retrieval		// Compute recall and precision for first numRetrieved docs		double recall = goodRetrievals / correctRetrievals.size();		double precision = goodRetrievals / numRetrieved;		System.out.println(MoreString.padToLeft(numRetrieved, 4) + 				   " is relevant; Recall = " +				   MoreString.padToLeft(MoreMath.roundTo(100*recall,3) + "%", 7) +				   "; Precision = " +				   MoreString.padToLeft(MoreMath.roundTo(100*precision,3) + "%",7));		// Create a RecallPrecisionPair for this point and add to rpList		rpList.add(new RecallPrecisionPair(recall, precision));	    }	}	return rpList;    }    /* Interpolate precision values for each standard recall level     * in RECALL_LEVELS from the list of rpPairs for a given query.     * See textbook for details.     */    double[] interpolatePrecision(ArrayList rpPairs) {	// Array of interpolated precisions	double[] precisions = new double[RECALL_LEVELS.length];	// Compute precision value for each recall level, starting	// from the highest and working backwards	for(int i = RECALL_LEVELS.length - 1; i >= 0; i--) {	    // Stores maximum precision for this recall level.	    // Interpolated precision at level i is the max	    // precision seen (or interpolated) at any recall	    // value between level i and level i+1, inclusive.	    double maxPrecision = 0.0;	    // Check each point in rpPairs to see if it is between	    // recall levels i and i+1, compute the max of these precision values.	    for(int j = 0; j < rpPairs.size(); j++) {		RecallPrecisionPair rpPair = (RecallPrecisionPair)rpPairs.get(j);		if (RECALL_LEVELS[i] <= rpPair.recall &&		    (i == RECALL_LEVELS.length - 1 ||  // no higher level i+1		     rpPair.recall <= RECALL_LEVELS[i+1])) {		    // If recall in correct interval, update max precision		    if (rpPair.precision > maxPrecision)			maxPrecision = rpPair.precision;		}	    }	    // Also consider the previously computed precision level for	    // the next highest recall level i+1, to include in max computation	    if (i != RECALL_LEVELS.length - 1 && precisions[i+1] > maxPrecision)		maxPrecision = precisions[i+1];	    // Set precision at level i to be the proper max interpolated value	    precisions[i] = maxPrecision;	}  //  	System.out.print("\nInterpolated Precisions: "); //	MoreMath.printVector(precisions);	// Return vector of final interpolated precisions	return precisions;    }    /* Write out the final interpolated recall/precision graph data.     * One line for each recall/precision point in the form: 'R-value P-value'.     * This is the format needed for GNUPLOT.     */    void writeRpCurve() throws IOException{	PrintWriter out = new PrintWriter(new FileWriter(outFile));	for(int i = 0; i < RECALL_LEVELS.length; i++)	    out.println(RECALL_LEVELS[i] + " " + averagePrecisions[i]);	out.close();    }    /* Write out an appropriate input file for GNUPLOT for the final recall     * precision graph to the output file with a ".gplot" extension.     * See GNUPLOT documentation. */    void graphRpCurve( ) throws IOException{	File graphFile = new File(outFile.getPath() + ".gplot");	PrintWriter out = new PrintWriter(new FileWriter(graphFile));	out.print("set xlabel \"Recall\"\nset ylabel \"Precision\"\n\nset terminal postscript color\nset size 0.75,0.75\n\nset data style linespoints\nset key 0.75,0.9\n\nset xrange [0:1]\nset yrange [0:1]\n\nplot \'" + outFile.getName() + "\' title \"VSR\"");	out.close();    }    /** Evaluate retrieval preformance on a given query test corpus and     * generate a recall/precision graph.     * Command format: "Experiment [OPTION]* [DIR] [QUERIES] [OUTFILE]" where:     * DIR is the name of the directory whose files should be indexed.     * QUERIES is a file of queries paired with relevant docs (see queryFile).     * OUTFILE is the name of the file to put the output. The plot     * data for the recall precision curve is stored in this file and a     * gnuplot file for the graph is the same name with a ".gplot" extension.     * OPTIONs can be     * "-html" to specify HTML files whose HTML tags should be removed, and     * "-stem" to specify tokens should be stemmed with Porter stemmer.     */    public static void main(String[] args) throws IOException {	// Parse the arguments into a directory name and optional flag	String corpusDir = args[args.length - 3];	String queryFile = args[args.length - 2];	String outFile = args[args.length - 1];	short docType = DocumentIterator.TYPE_TEXT;	boolean stem = false;	for(int i = 0; i < args.length - 3; i++) {		   String flag = args[i];	   if (flag.equals("-html"))		// Create HTMLFileDocuments to filter HTML tags		docType = DocumentIterator.TYPE_HTML;	   else if (flag.equals("-stem"))	       // Stem tokens with Porter stemmer	       stem = true;	   else {	       System.out.println("\nUnknown flag: " + flag);	       System.exit(1);	   }	}	Experiment exper = new Experiment(new File(corpusDir), new File(queryFile), 					  new File(outFile), docType, stem);        exper.makeRpCurve();    }}
experiment.java - 源码说明

本页面展示了「一个使用的搜索引擎」中的 experiment.java 源码文件，采用 Java 编程语言编写，共 256 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与搜索引擎相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?