📄 invertedindex.java
字号:
double count = entry.getValue().getValue(); // Determine the score added to the similarity of each document // indexed under this token and update the length of the // query vector with the square of the weight for this token. queryLength = queryLength + incorporateToken(token, count, retrievalHash); } // Finalize the length of the query vector by taking the square-root of the // final sum of squares of its token wieghts. queryLength = Math.sqrt(queryLength); // Make an array to store the final ranked Retrievals. Retrieval[] retrievals = new Retrieval[retrievalHash.size()]; // Iterate through each of the retreived docuements stored in // the final retrievalHash. int retrievalCount = 0; for (Map.Entry<DocumentReference,DoubleValue> entry : retrievalHash.entrySet()) { DocumentReference docRef = entry.getKey(); double score = entry.getValue().value; // Normalize score for the lengths of the two document vectors score = score / (queryLength * docRef.length); // Add a Retrieval for this document to the result array retrievals[retrievalCount++] = new Retrieval(docRef, score); } // Sort the retrievals to produce a final ranked list using the // Comparator for retrievals that produces a best to worst ordering. Arrays.sort(retrievals); return retrievals; } /** Retrieve the documents indexed by this token in the inverted index, * add it to the retrievalHash if needed, and update its running total score. * @param token The token in the query to incorporate. * @param count The count of this token in the query. * @param retrievalHash The hashtable of retrieved DocumentReferences and current * scores. * @return The square of the weight of this token in the query vector for use * in calculating the length of the query vector. 
*/ public double incorporateToken(String token, double count, HashMap<DocumentReference,DoubleValue> retrievalHash) { TokenInfo tokenInfo = tokenHash.get(token); // If token is not in the index, it adds nothing and its squared weight is 0 if (tokenInfo == null) return 0.0; // The weight of a token in the query is is IDF factor times the number // of times it occurs in the query. double weight = tokenInfo.idf * count; // For each document occurrence indexed for this token... for (TokenOccurrence occ : tokenInfo.occList) { // Get the current score for this document in the retrievalHash. DoubleValue val = retrievalHash.get(occ.docRef); if (val == null) { // If this is a new retrieved document, create an initial score // for it and store in the retrievalHash val = new DoubleValue(0.0); retrievalHash.put(occ.docRef, val); } // Update the score for this document by adding the product // of the weight of this token in the query and its weight // in the retrieved document (IDF * occurrence count) val.value = val.value + weight * tokenInfo.idf * occ.count; } // Return the square of the weight of this token in the query return weight*weight; } /** Enter an interactive user-query loop, accepting queries and showing the retrieved * documents in ranked order. */ public void processQueries() { System.out.println("Now able to process queries. When done, enter an empty query to exit."); // Loop indefinitely answering queries do { // Get a query from the console String query = UserInput.prompt("\nEnter query: "); // If query is empty then exit the interactive loop if (query.equals("")) break; // Get the ranked retrievals for this query string and present them HashMapVector queryVector = (new TextStringDocument(query,stem)).hashMapVector(); Retrieval[] retrievals = retrieve(queryVector); presentRetrievals(queryVector,retrievals); } while(true); } /** Print out a ranked set of retrievals. Show the file name and score for * the top retrieved documents in order. 
Then allow user to see more or display * individual documents. */ public void presentRetrievals(HashMapVector queryVector, Retrieval[] retrievals) { if (showRetrievals(retrievals)) { // Data structure for saving info about any user feedback for relevance feedback Feedback fdback = null; if (feedback) fdback = new Feedback(queryVector, retrievals, this); // The number of the last document presented int currentPosition = MAX_RETRIEVALS; // The number of a document to be displayed. This is one one greater than the array index // in retrievals, since counting for the user starts at 1 instead of 0. int showNumber = 0; // Loop accepting user commands for processing retrievals do { String command = UserInput.prompt("\n Enter command: "); // If command is empty then exit the interactive loop if (command.equals("")) break; if (command.equals("m")) { // The "more" command, print a list of the next MAX_RETRIEVALS batch of retrievals printRetrievals(retrievals, currentPosition); currentPosition = currentPosition + MAX_RETRIEVALS; continue; } if (command.equals("r") && feedback) { // The "redo" command re-excutes a revised query using Ide_regular if (fdback.isEmpty()) { System.out.println("Need to first view some documents and provide feedback."); continue; } System.out.println("Positive docs: " + fdback.goodDocRefs + "\nNegative docs: " + fdback.badDocRefs); System.out.println("Executing New Expanded and Reweighted Query: "); queryVector = fdback.newQuery(); retrievals = retrieve(queryVector); // Update the list of retrievals stored in the feedback fdback.retrievals = retrievals; if (showRetrievals(retrievals)) continue; else break; } // See if command is a number try { showNumber = Integer.parseInt(command); } catch (NumberFormatException e) { // If not a number, it is an unknown command System.out.println("Unknown command."); System.out.println("Enter `m' to see more, a number to show the nth document, nothing to exit."); if (feedback && !fdback.isEmpty()) 
System.out.println("Enter `r' to use any feedback given to `redo' with a revised query."); continue; } // Display the selected document number in Netscape if (showNumber > 0 && showNumber <= retrievals.length) { System.out.println("Showing document " + showNumber + " in the " + Browser.BROWSER_NAME + " window."); Browser.display(retrievals[showNumber-1].docRef.file); // If accepting feedback and have not rated this item, then get relevance feedback if (feedback && !fdback.haveFeedback(showNumber)) fdback.getFeedback(showNumber); continue; } else { System.out.println("No such document number: " + showNumber); continue; } } while(true); } } /** Show the top retrievals to the user if there are any. * @return true if retrievals are non-empty. */ public boolean showRetrievals(Retrieval[] retrievals) { if (retrievals.length == 0) { System.out.println("\nNo matching documents found."); return false; } else { System.out.println("\nTop " + MAX_RETRIEVALS + " matching Documents from most to least relevant:"); printRetrievals(retrievals, 0); System.out.println("\nEnter `m' to see more, a number to show the nth document, nothing to exit."); if (feedback) System.out.println("Enter `r' to use any relevance feedback given to `redo' with a revised query."); return true; } } /** Print out at most MAX_RETRIEVALS ranked retrievals starting at given starting rank number. Include the rank number and the score. */ public void printRetrievals(Retrieval[] retrievals, int start) { System.out.println(""); if (start >= retrievals.length) System.out.println("No more retrievals."); for(int i = start; i < Math.min(retrievals.length, start + MAX_RETRIEVALS); i++) { System.out.println(MoreString.padTo((i + 1) + ". ", 4) + MoreString.padTo(retrievals[i].docRef.file.getName(),20) + " Score: " + MoreMath.roundTo(retrievals[i].score, 5)); } } /** Index a directory of files and then interactively accept retrieval queries. 
* Command format: "InvertedIndex [OPTION]* [DIR]" where DIR is the name of
   * the directory whose files should be indexed, and OPTIONs can be
   * "-html" to specify HTML files whose HTML tags should be removed.
   * "-stem" to specify tokens should be stemmed with Porter stemmer.
   * "-feedback" to allow relevance feedback from the user.
   * When no arguments are given, or an unknown flag is met, a message is
   * printed and the program exits with status 1.
   * @param args The command-line arguments; the last one is the directory name.
   */
  public static void main(String[] args) {
    // Without any argument there is no directory to index; the last-argument
    // lookup below would otherwise fail with ArrayIndexOutOfBoundsException.
    if (args.length == 0) {
      System.out.println("\nUsage: InvertedIndex [-html] [-stem] [-feedback] DIR");
      System.exit(1);
      return;
    }
    // The last argument is the directory; everything before it is a flag.
    String dirName = args[args.length - 1];
    short docType = DocumentIterator.TYPE_TEXT;
    boolean stem = false, feedback = false;
    for (int i = 0; i < args.length - 1; i++) {
      String flag = args[i];
      if (flag.equals("-html")) {
        // Create HTMLFileDocuments to filter HTML tags
        docType = DocumentIterator.TYPE_HTML;
      } else if (flag.equals("-stem")) {
        // Stem tokens with Porter stemmer
        stem = true;
      } else if (flag.equals("-feedback")) {
        // Use relevance feedback
        feedback = true;
      } else {
        System.out.println("\nUnknown flag: " + flag);
        System.exit(1);
      }
    }
    // Create an inverted index for the files in the given directory.
    InvertedIndex index = new InvertedIndex(new File(dirName), docType, stem, feedback);
    // index.print();
    // Interactively process queries to this index.
    index.processQueries();
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -