InvertedIndex.java
package ir.vsr;

import java.io.*;
import java.util.*;
import java.lang.*;

import ir.utilities.*;
import ir.classifiers.*;

/**
 * An inverted index for vector-space information retrieval. Contains
 * methods for creating an inverted index from a set of documents
 * and retrieving ranked matches to queries using standard TF/IDF
 * weighting and cosine similarity.
 *
 * @author Ray Mooney
 */
public class InvertedIndex {

  /** The maximum number of retrieved documents for a query to present to the user
   *  at a time. */
  public static final int MAX_RETRIEVALS = 10;

  /** A HashMap where tokens are indexed. Each indexed token maps to a TokenInfo. */
  public HashMap tokenHash = null;

  /** A list of all indexed documents. Elements are DocumentReference's. */
  public ArrayList docRefs = null;

  /** The directory from which the indexed documents come. */
  public File dirFile = null;

  /** The type of Documents (text, HTML). See docType in DocumentIterator. */
  public short docType = DocumentIterator.TYPE_TEXT;

  /** Whether tokens should be stemmed with the Porter stemmer. */
  public boolean stem = false;

  /** Whether relevance feedback using the Ide_regular algorithm is used. */
  public boolean feedback = false;

  /**
   * Create an inverted index of the documents in a directory.
   * @param dirFile  The directory of files to index.
   * @param docType  The type of documents to index (see docType in DocumentIterator).
   * @param stem     Whether tokens should be stemmed with the Porter stemmer.
   * @param feedback Whether relevance feedback should be used.
   */
  public InvertedIndex(File dirFile, short docType, boolean stem, boolean feedback) {
    this.dirFile = dirFile;
    this.docType = docType;
    this.stem = stem;
    this.feedback = feedback;
    tokenHash = new HashMap();
    docRefs = new ArrayList();
    indexDocuments();
  }

  /**
   * Create an inverted index of the documents in a List of Examples.
   * @param examples A List containing the Example objects for text categorization to index.
   */
  public InvertedIndex(List examples) {
    tokenHash = new HashMap();
    docRefs = new ArrayList();
    indexDocuments(examples);
  }

  /** Index the documents in dirFile. */
  protected void indexDocuments() {
    if (!tokenHash.isEmpty() || !docRefs.isEmpty()) {
      // Currently can only index one set of documents when an index is created
      System.out.println("\nCannot indexDocuments more than once in the same InvertedIndex");
      System.exit(1);
    }
    // Get an iterator for the documents
    DocumentIterator docIter = new DocumentIterator(dirFile, docType, stem);
    System.out.println("Indexing documents in " + dirFile);
    // Loop, processing each of the documents
    while (docIter.hasMoreDocuments()) {
      FileDocument doc = docIter.nextDocument();
      // Create a document vector for this document
      HashMapVector vector = doc.hashMapVector();
      indexDocument(doc, vector);
    }
    // Now that all documents have been processed, we can calculate the IDF weights for
    // all tokens and the resulting lengths of all weighted document vectors.
    computeIDFandDocumentLengths();
    System.out.println("Indexed " + docRefs.size() + " documents with " + size() + " unique terms.");
  }
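  /*
   * Usage sketch (not part of the original source): assuming a directory of
   * plain-text files named "corpus", an index over it could be built with
   *
   *   InvertedIndex index =
   *       new InvertedIndex(new File("corpus"), DocumentIterator.TYPE_TEXT,
   *                         false, false);  // no stemming, no feedback
   *
   * The constructor tokenizes every file, indexes each document vector via
   * indexDocument(), and then computes IDF weights and document vector
   * lengths once all files have been processed.
   */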
  /** Index the documents in the List of Examples for text categorization. */
  public void indexDocuments(List examples) {
    if (!tokenHash.isEmpty() || !docRefs.isEmpty()) {
      // Currently can only index one set of documents when an index is created
      System.out.println("\nCannot indexDocuments more than once in the same InvertedIndex");
      System.exit(1);
    }
    // Loop, processing each of the examples
    Iterator exampleIterator = examples.iterator();
    while (exampleIterator.hasNext()) {
      Example example = (Example)exampleIterator.next();
      FileDocument doc = example.getDocument();
      // Get the document vector for this example
      HashMapVector vector = example.getHashMapVector();
      indexDocument(doc, vector);
    }
    // Now that all documents have been processed, we can calculate the IDF weights for
    // all tokens and the resulting lengths of all weighted document vectors.
    computeIDFandDocumentLengths();
    System.out.println("Indexed " + docRefs.size() + " documents with " + size() + " unique terms.");
  }

  /** Index the given document using its corresponding vector. */
  protected void indexDocument(FileDocument doc, HashMapVector vector) {
    // Create a reference to this document
    DocumentReference docRef = new DocumentReference(doc);
    // Add this document to the list of documents indexed
    docRefs.add(docRef);
    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry)mapEntries.next();
      // An entry in the HashMap maps a token to a Weight
      String token = (String)entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int)((Weight)entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index pointing to this document
      indexToken(token, count, docRef);
    }
  }

  /**
   * Add a token occurrence to the index.
   * @param token  The token to index.
   * @param count  The number of times it occurs in the document.
   * @param docRef A reference to the Document it occurs in.
   */
  protected void indexToken(String token, int count, DocumentReference docRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo)tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenOccurrence(docRef, count));
  }
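  /*
   * Worked example (not in the original source): with N = 3 indexed
   * documents and a token occurring in 2 of them, Math.log is the natural
   * log, so idf = ln(3/2) ~= 0.405. A document containing that token 4
   * times accumulates (0.405 * 4)^2 ~= 2.63 into its squared vector length,
   * which computeIDFandDocumentLengths() below finally square-roots. A
   * token occurring in all N documents gets idf = ln(1) = 0 and is removed
   * from the index entirely.
   */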
  /**
   * Compute the IDF factor for every token in the index and the length
   * of the document vector for every document referenced in the index.
   */
  protected void computeIDFandDocumentLengths() {
    // Let N be the total number of documents indexed
    double N = docRefs.size();
    // Iterate through each of the tokens in the index
    Iterator mapEntries = tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      TokenInfo tokenInfo = (TokenInfo)entry.getValue();
      // Get the total number of documents in which this token occurs
      double numDocRefs = tokenInfo.occList.size();
      // Calculate the IDF factor for this token
      double idf = Math.log(N / numDocRefs);
      // System.out.println(token + " occurs in " + Math.round(numDocRefs) + " docs so IDF=" + idf);
      if (idf == 0.0)
        // If IDF is 0, then just remove this inconsequential token from the index
        mapEntries.remove();
      else {
        tokenInfo.idf = idf;
        // In order to compute document vector lengths, sum the
        // square of the weights (IDF * occurrence count) across
        // every token occurrence for each document.
        for (int i = 0; i < tokenInfo.occList.size(); i++) {
          TokenOccurrence occ = (TokenOccurrence)tokenInfo.occList.get(i);
          occ.docRef.length = occ.docRef.length + Math.pow(idf * occ.count, 2);
        }
      }
    }
    // At this point, every document length should be the sum of the squares of
    // its token weights. In order to calculate final lengths, just need to
    // set the length of every document reference to the square root of this sum.
    for (int i = 0; i < docRefs.size(); i++) {
      DocumentReference docRef = (DocumentReference)docRefs.get(i);
      docRef.length = Math.sqrt(docRef.length);
    }
  }

  /**
   * Print out an inverted index by listing each token and the documents it occurs in.
   * Include info on IDF factors, occurrence counts, and document vector lengths.
   */
  public void print() {
    // Iterate through each token in the index
    Iterator mapEntries = tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      // Print the token and its IDF factor
      System.out.println(token + " (IDF=" + ((TokenInfo)entry.getValue()).idf + ") occurs in:");
      ArrayList occList = ((TokenInfo)entry.getValue()).occList;
      // For each document referenced, print its name, occurrence count for this token, and
      // document vector length (|D|).
      for (int i = 0; i < occList.size(); i++) {
        TokenOccurrence occ = (TokenOccurrence)occList.get(i);
        System.out.println("   " + occ.docRef.file.getName() + " " + occ.count +
                           " times; |D|=" + occ.docRef.length);
      }
    }
  }

  /** Return the number of tokens indexed. */
  public int size() {
    return tokenHash.size();
  }

  /** Clear all documents from the inverted index. */
  public void clear() {
    docRefs.clear();
    tokenHash.clear();
  }

  /** Perform ranked retrieval on this input query. */
  public Retrieval[] retrieve(String input) {
    return retrieve(new TextStringDocument(input, stem));
  }

  /** Perform ranked retrieval on this input query Document. */
  public Retrieval[] retrieve(Document doc) {
    return retrieve(doc.hashMapVector());
  }

  /** Perform ranked retrieval on this input query Document vector. */
  public Retrieval[] retrieve(HashMapVector vector) {
    // Create a hashtable to store the retrieved documents. Keys
    // are docRefs and values are DoubleValues which indicate the
    // partial score accumulated for this document so far.
    // As each token in the query is processed, each document
    // it indexes is added to this hashtable and its retrieval
    // score (similarity to the query) is appropriately updated.
    HashMap retrievalHash = new HashMap();
    // Initialize a variable to store the length of the query vector
    double queryLength = 0.0;
    // Iterate through each token in the query input Document
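    // --- The original listing is truncated at this point. ---
    // What follows is a reconstruction sketched from the comments above, not
    // the author's verbatim code. It assumes DoubleValue (ir.utilities)
    // wraps a public double field `value` and that Retrieval pairs a
    // DocumentReference with a similarity score; both are assumptions.
    Iterator queryEntries = vector.iterator();
    while (queryEntries.hasNext()) {
      Map.Entry entry = (Map.Entry)queryEntries.next();
      String token = (String)entry.getKey();
      double count = ((Weight)entry.getValue()).getValue();
      TokenInfo tokenInfo = (TokenInfo)tokenHash.get(token);
      // Skip query tokens that index no documents
      if (tokenInfo == null)
        continue;
      // TF/IDF weight of this token in the query
      double queryWeight = count * tokenInfo.idf;
      // Accumulate the squared query vector length
      queryLength = queryLength + queryWeight * queryWeight;
      // Add this token's contribution to the dot product with every
      // document it occurs in.
      for (int i = 0; i < tokenInfo.occList.size(); i++) {
        TokenOccurrence occ = (TokenOccurrence)tokenInfo.occList.get(i);
        DoubleValue score = (DoubleValue)retrievalHash.get(occ.docRef);
        if (score == null) {
          score = new DoubleValue(0.0);
          retrievalHash.put(occ.docRef, score);
        }
        score.value = score.value + queryWeight * (occ.count * tokenInfo.idf);
      }
    }
    queryLength = Math.sqrt(queryLength);
    // Normalize each accumulated dot product by |Q||D| to obtain the cosine
    // similarity, then package the results. Callers would typically sort
    // these by decreasing score and present MAX_RETRIEVALS at a time.
    Retrieval[] retrievals = new Retrieval[retrievalHash.size()];
    Iterator hashEntries = retrievalHash.entrySet().iterator();
    int j = 0;
    while (hashEntries.hasNext()) {
      Map.Entry entry = (Map.Entry)hashEntries.next();
      DocumentReference docRef = (DocumentReference)entry.getKey();
      double score = ((DoubleValue)entry.getValue()).value / (queryLength * docRef.length);
      retrievals[j++] = new Retrieval(docRef, score);  // assumed constructor
    }
    return retrievals;
  }
}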