📄 invertedindex.java

📁 这是一个用于测试的搜索引擎案例
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
	    double count = entry.getValue().getValue();	    // Determine the score added to the similarity of each document	    // indexed under this token and update the length of the	    // query vector with the square of the weight for this token.	    queryLength = queryLength + incorporateToken(token, count, retrievalHash);	}	// Finalize the length of the query vector by taking the square-root of the	// final sum of squares of its token wieghts.	queryLength = Math.sqrt(queryLength);	// Make an array to store the final ranked Retrievals.	Retrieval[] retrievals = new Retrieval[retrievalHash.size()];	// Iterate through each of the retreived docuements stored in	// the final retrievalHash.	int retrievalCount = 0;	for (Map.Entry<DocumentReference,DoubleValue> entry : retrievalHash.entrySet()) {	    DocumentReference docRef = entry.getKey();	    double score = entry.getValue().value;	    // Normalize score for the lengths of the two document vectors	    score = score / (queryLength * docRef.length);  	    // Add a Retrieval for this document to the result array	    retrievals[retrievalCount++] = new Retrieval(docRef, score);	}	// Sort the retrievals to produce a final ranked list using the	// Comparator for retrievals that produces a best to worst ordering.	Arrays.sort(retrievals);	return retrievals;    }    /** Retrieve the documents indexed by this token in the inverted index,     * add it to the retrievalHash if needed, and update its running total score.     * @param token The token in the query to incorporate.     * @param count The count of this token in the query.     * @param retrievalHash The hashtable of retrieved DocumentReferences and current     *   scores.     * @return The square of the weight of this token in the query vector for use     * in calculating the length of the query vector.     
*/    public double incorporateToken(String token, double count, 				   HashMap<DocumentReference,DoubleValue> retrievalHash) {	TokenInfo tokenInfo = tokenHash.get(token);	// If token is not in the index, it adds nothing and its squared weight is 0	if (tokenInfo == null) return 0.0;	// The weight of a token in the query is is IDF factor times the number	// of times it occurs in the query.	double weight = tokenInfo.idf * count;	// For each document occurrence indexed for this token...	for (TokenOccurrence occ : tokenInfo.occList) {	    // Get the current score for this document in the retrievalHash.	    DoubleValue val = retrievalHash.get(occ.docRef);	    if (val == null) {		// If this is a new retrieved document, create an initial score 		// for it and store in the retrievalHash		val = new DoubleValue(0.0);		retrievalHash.put(occ.docRef, val);	    }	    // Update the score for this document by adding the product	    // of the weight of this token in the query and its weight	    // in the retrieved document (IDF * occurrence count)	    val.value = val.value + weight * tokenInfo.idf * occ.count;	}	// Return the square of the weight of this token in the query	return weight*weight;    }    /** Enter an interactive user-query loop, accepting queries and showing the retrieved      * documents in ranked order.      */    public void processQueries() {	System.out.println("Now able to process queries. 
When done, enter an empty query to exit.");	// Loop indefinitely answering queries	do {	    // Get a query from the console	    String query = UserInput.prompt("\nEnter query:  ");	    // If query is empty then exit the interactive loop	    if (query.equals(""))		break;	    // Get the ranked retrievals for this query string and present them	    HashMapVector queryVector = (new TextStringDocument(query,stem)).hashMapVector();	    Retrieval[] retrievals = retrieve(queryVector);	    presentRetrievals(queryVector,retrievals);	}	while(true);    }    /** Print out a ranked set of retrievals. Show the file name and score for     * the top retrieved documents in order. Then allow user to see more or display     * individual documents.     */    public void presentRetrievals(HashMapVector queryVector, Retrieval[] retrievals) {	if (showRetrievals(retrievals)) {	    // Data structure for saving info about any user feedback for relevance feedback	    Feedback fdback = null;	    if (feedback)		fdback = new Feedback(queryVector, retrievals, this);	    // The number of the last document presented	    int currentPosition = MAX_RETRIEVALS;	    // The number of a document to be displayed.  This is one one greater than the array index	    // in retrievals, since counting for the user starts at 1 instead of 0.	    
int showNumber = 0;	    // Loop accepting user commands for processing retrievals	    do {		String command = UserInput.prompt("\n Enter command:  ");		// If command is empty then exit the interactive loop		if (command.equals(""))		    break;		if (command.equals("m")) {		    // The "more" command, print a list of the next MAX_RETRIEVALS batch of retrievals		    printRetrievals(retrievals, currentPosition);		    currentPosition = currentPosition + MAX_RETRIEVALS;		    continue;		}		if (command.equals("r") && feedback) {		    // The "redo" command re-excutes a revised query using Ide_regular		    if (fdback.isEmpty()) {			System.out.println("Need to first view some documents and provide feedback.");			continue;		    }		    System.out.println("Positive docs: " + fdback.goodDocRefs + 				       "\nNegative docs: " + fdback.badDocRefs);		    System.out.println("Executing New Expanded and Reweighted Query: ");		    queryVector = fdback.newQuery();		    retrievals = retrieve(queryVector);		    // Update the list of retrievals stored in the feedback		    fdback.retrievals = retrievals;		    if (showRetrievals(retrievals))			continue;		    else 			break;		}		// See if command is a number		try {		    showNumber = Integer.parseInt(command);		}		catch (NumberFormatException e) {		    // If not a number, it is an unknown command		    System.out.println("Unknown command.");		    System.out.println("Enter `m' to see more, a number to show the nth document, nothing to exit.");		    if (feedback && !fdback.isEmpty())			System.out.println("Enter `r' to use any feedback given to `redo' with a revised query.");		    continue;		}		// Display the selected document number in Netscape 		if (showNumber > 0 && showNumber <= retrievals.length) {		    System.out.println("Showing document " + showNumber + " in the " + Browser.BROWSER_NAME + " window.");		    Browser.display(retrievals[showNumber-1].docRef.file);		    // If accepting feedback and have not rated this item, then get relevance 
feedback		    if (feedback && !fdback.haveFeedback(showNumber))			fdback.getFeedback(showNumber);		    continue;		}		else {		    System.out.println("No such document number: " + showNumber);		    continue;		}	    }	    while(true);	}    }    /** Show the top retrievals to the user if there are any.     * @return true if retrievals are non-empty.     */    public boolean showRetrievals(Retrieval[] retrievals) {	if (retrievals.length == 0) {	    System.out.println("\nNo matching documents found.");	    return false;	}	else {	    System.out.println("\nTop " + MAX_RETRIEVALS + " matching Documents from most to least relevant:");	    printRetrievals(retrievals, 0);	    System.out.println("\nEnter `m' to see more, a number to show the nth document, nothing to exit.");	    if (feedback)		System.out.println("Enter `r' to use any relevance feedback given to `redo' with a revised query.");	    return true;	}    }    /** Print out at most MAX_RETRIEVALS ranked retrievals starting at given starting rank number.	Include the rank number and the score.     */    public void printRetrievals(Retrieval[] retrievals, int start) {	System.out.println("");	if (start >= retrievals.length)	    System.out.println("No more retrievals.");	for(int i = start; i < Math.min(retrievals.length, start + MAX_RETRIEVALS); i++) {		    System.out.println(MoreString.padTo((i + 1) + ". ", 4) +			       MoreString.padTo(retrievals[i].docRef.file.getName(),20) + 			       " Score: " + 			       MoreMath.roundTo(retrievals[i].score, 5));	}    }    /** Index a directory of files and then interactively accept retrieval queries.     * Command format: "InvertedIndex [OPTION]* [DIR]" where DIR is the name of     * the directory whose files should be indexed, and OPTIONs can be     * "-html" to specify HTML files whose HTML tags should be removed.     * "-stem" to specify tokens should be stemmed with Porter stemmer.     * "-feedback" to allow relevance feedback from the user.     
*/
    public static void main(String[] args) {
        // The directory name is the last argument; with no arguments at all there is
        // nothing to index, so report the expected command format instead of failing
        // with an ArrayIndexOutOfBoundsException on args[-1].
        if (args.length == 0) {
            System.out.println("\nUsage: InvertedIndex [OPTION]* [DIR]");
            System.exit(1);
        }
        // Parse the arguments into a directory name and optional flags
        String dirName = args[args.length - 1];
        short docType = DocumentIterator.TYPE_TEXT;
        boolean stem = false, feedback = false;
        // Every argument before the last one must be a recognized option flag.
        for (int i = 0; i < args.length - 1; i++) {
            String flag = args[i];
            if (flag.equals("-html"))
                // Create HTMLFileDocuments to filter HTML tags
                docType = DocumentIterator.TYPE_HTML;
            else if (flag.equals("-stem"))
                // Stem tokens with Porter stemmer
                stem = true;
            else if (flag.equals("-feedback"))
                // Use relevance feedback
                feedback = true;
            else {
                System.out.println("\nUnknown flag: " + flag);
                System.exit(1);
            }
        }

        // Create an inverted index for the files in the given directory.
        InvertedIndex index = new InvertedIndex(new File(dirName), docType, stem, feedback);
        // Interactively process queries to this index.
        index.processQueries();
    }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -