InvertedIndex.java
package ir.vsr;

import java.io.*;
import java.util.*;
import java.lang.*;

import ir.utilities.*;
import ir.classifiers.*;

/**
 * An inverted index for vector-space information retrieval. Contains
 * methods for creating an inverted index from a set of documents
 * and retrieving ranked matches to queries using standard TF/IDF
 * weighting and cosine similarity.
 *
 * @author Ray Mooney
 */
public class InvertedIndex {

  /** The maximum number of retrieved documents for a query to present to the user
   *  at a time. */
  public static final int MAX_RETRIEVALS = 10;

  /** A HashMap where tokens are indexed. Each indexed token maps to a TokenInfo. */
  public HashMap tokenHash = null;

  /** A list of all indexed documents. Elements are DocumentReference's. */
  public ArrayList docRefs = null;

  /** The directory from which the indexed documents come. */
  public File dirFile = null;

  /** The type of Documents (text, HTML). See docType in DocumentIterator. */
  public short docType = DocumentIterator.TYPE_TEXT;

  /** Whether tokens should be stemmed with the Porter stemmer. */
  public boolean stem = false;

  /** Whether relevance feedback using the Ide_regular algorithm is used. */
  public boolean feedback = false;

  /**
   * Create an inverted index of the documents in a directory.
   * @param dirFile  The directory of files to index.
   * @param docType  The type of documents to index (see docType in DocumentIterator).
   * @param stem     Whether tokens should be stemmed with the Porter stemmer.
   * @param feedback Whether relevance feedback should be used.
   */
  public InvertedIndex(File dirFile, short docType, boolean stem, boolean feedback) {
    this.dirFile = dirFile;
    this.docType = docType;
    this.stem = stem;
    this.feedback = feedback;
    tokenHash = new HashMap();
    docRefs = new ArrayList();
    indexDocuments();
  }

  /**
   * Create an inverted index of the documents in a List of Examples.
   * @param examples A List containing the Example objects for text categorization to index.
   */
  public InvertedIndex(List examples) {
    tokenHash = new HashMap();
    docRefs = new ArrayList();
    indexDocuments(examples);
  }

  /** Index the documents in dirFile. */
  protected void indexDocuments() {
    if (!tokenHash.isEmpty() || !docRefs.isEmpty()) {
      // Currently can only index one set of documents when an index is created
      System.out.println("\nCannot indexDocuments more than once in the same InvertedIndex");
      System.exit(1);
    }
    // Get an iterator for the documents
    DocumentIterator docIter = new DocumentIterator(dirFile, docType, stem);
    System.out.println("Indexing documents in " + dirFile);
    // Loop, processing each of the documents
    while (docIter.hasMoreDocuments()) {
      FileDocument doc = docIter.nextDocument();
      // Create a document vector for this document
      HashMapVector vector = doc.hashMapVector();
      indexDocument(doc, vector);
    }
    // Now that all documents have been processed, we can calculate the IDF weights for
    // all tokens and the resulting lengths of all weighted document vectors.
    computeIDFandDocumentLengths();
    System.out.println("Indexed " + docRefs.size() + " documents with " + size() + " unique terms.");
  }
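  /*
   * Usage sketch (not part of the original source): assuming a directory of
   * plain-text files named "corpus", an index over it could be built with
   *
   *   InvertedIndex index =
   *       new InvertedIndex(new File("corpus"), DocumentIterator.TYPE_TEXT,
   *                         false, false);  // no stemming, no feedback
   *
   * The constructor tokenizes every file, indexes each document vector via
   * indexDocument(), and then computes IDF weights and document vector
   * lengths once all files have been processed.
   */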
  /** Index the documents in the List of Examples for text categorization. */
  public void indexDocuments(List examples) {
    if (!tokenHash.isEmpty() || !docRefs.isEmpty()) {
      // Currently can only index one set of documents when an index is created
      System.out.println("\nCannot indexDocuments more than once in the same InvertedIndex");
      System.exit(1);
    }
    // Loop, processing each of the examples
    Iterator exampleIterator = examples.iterator();
    while (exampleIterator.hasNext()) {
      Example example = (Example)exampleIterator.next();
      FileDocument doc = example.getDocument();
      // Get the document vector for this example
      HashMapVector vector = example.getHashMapVector();
      indexDocument(doc, vector);
    }
    // Now that all documents have been processed, we can calculate the IDF weights for
    // all tokens and the resulting lengths of all weighted document vectors.
    computeIDFandDocumentLengths();
    System.out.println("Indexed " + docRefs.size() + " documents with " + size() + " unique terms.");
  }

  /** Index the given document using its corresponding vector. */
  protected void indexDocument(FileDocument doc, HashMapVector vector) {
    // Create a reference to this document
    DocumentReference docRef = new DocumentReference(doc);
    // Add this document to the list of documents indexed
    docRefs.add(docRef);
    // Iterate through each of the tokens in the document
    Iterator mapEntries = vector.iterator();
    while (mapEntries.hasNext()) {
      Map.Entry entry = (Map.Entry)mapEntries.next();
      // An entry in the HashMap maps a token to a Weight
      String token = (String)entry.getKey();
      // The count for the token is in the value of the Weight
      int count = (int)((Weight)entry.getValue()).getValue();
      // Add an occurrence of this token to the inverted index pointing to this document
      indexToken(token, count, docRef);
    }
  }

  /**
   * Add a token occurrence to the index.
   * @param token  The token to index.
   * @param count  The number of times it occurs in the document.
   * @param docRef A reference to the Document it occurs in.
   */
  protected void indexToken(String token, int count, DocumentReference docRef) {
    // Find this token in the index
    TokenInfo tokenInfo = (TokenInfo)tokenHash.get(token);
    if (tokenInfo == null) {
      // If this is a new token, create info for it to put in the hashtable
      tokenInfo = new TokenInfo();
      tokenHash.put(token, tokenInfo);
    }
    // Add a new occurrence for this token to its info
    tokenInfo.occList.add(new TokenOccurrence(docRef, count));
  }
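  /*
   * Worked example (not in the original source): with N = 3 indexed
   * documents and a token occurring in 2 of them, Math.log is the natural
   * log, so idf = ln(3/2) ~= 0.405. A document containing that token 4
   * times accumulates (0.405 * 4)^2 ~= 2.63 into its squared vector length,
   * which computeIDFandDocumentLengths() below finally square-roots. A
   * token occurring in all N documents gets idf = ln(1) = 0 and is removed
   * from the index entirely.
   */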
  /**
   * Compute the IDF factor for every token in the index and the length
   * of the document vector for every document referenced in the index.
   */
  protected void computeIDFandDocumentLengths() {
    // Let N be the total number of documents indexed
    double N = docRefs.size();
    // Iterate through each of the tokens in the index
    Iterator mapEntries = tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      TokenInfo tokenInfo = (TokenInfo)entry.getValue();
      // Get the total number of documents in which this token occurs
      double numDocRefs = tokenInfo.occList.size();
      // Calculate the IDF factor for this token
      double idf = Math.log(N / numDocRefs);
      // System.out.println(token + " occurs in " + Math.round(numDocRefs) + " docs so IDF=" + idf);
      if (idf == 0.0)
        // If IDF is 0, then just remove this inconsequential token from the index
        mapEntries.remove();
      else {
        tokenInfo.idf = idf;
        // In order to compute document vector lengths, sum the
        // square of the weights (IDF * occurrence count) across
        // every token occurrence for each document.
        for (int i = 0; i < tokenInfo.occList.size(); i++) {
          TokenOccurrence occ = (TokenOccurrence)tokenInfo.occList.get(i);
          occ.docRef.length = occ.docRef.length + Math.pow(idf * occ.count, 2);
        }
      }
    }
    // At this point, every document length should be the sum of the squares of
    // its token weights. In order to calculate final lengths, just need to
    // set the length of every document reference to the square root of this sum.
    for (int i = 0; i < docRefs.size(); i++) {
      DocumentReference docRef = (DocumentReference)docRefs.get(i);
      docRef.length = Math.sqrt(docRef.length);
    }
  }

  /**
   * Print out an inverted index by listing each token and the documents it occurs in.
   * Include info on IDF factors, occurrence counts, and document vector lengths.
   */
  public void print() {
    // Iterate through each token in the index
    Iterator mapEntries = tokenHash.entrySet().iterator();
    while (mapEntries.hasNext()) {
      // Get the token and the tokenInfo for each entry in the HashMap
      Map.Entry entry = (Map.Entry)mapEntries.next();
      String token = (String)entry.getKey();
      // Print the token and its IDF factor
      System.out.println(token + " (IDF=" + ((TokenInfo)entry.getValue()).idf + ") occurs in:");
      ArrayList occList = ((TokenInfo)entry.getValue()).occList;
      // For each document referenced, print its name, occurrence count for this token, and
      // document vector length (|D|).
      for (int i = 0; i < occList.size(); i++) {
        TokenOccurrence occ = (TokenOccurrence)occList.get(i);
        System.out.println("   " + occ.docRef.file.getName() + " " + occ.count +
                           " times; |D|=" + occ.docRef.length);
      }
    }
  }

  /** Return the number of tokens indexed. */
  public int size() {
    return tokenHash.size();
  }

  /** Clear all documents from the inverted index. */
  public void clear() {
    docRefs.clear();
    tokenHash.clear();
  }

  /** Perform ranked retrieval on this input query. */
  public Retrieval[] retrieve(String input) {
    return retrieve(new TextStringDocument(input, stem));
  }

  /** Perform ranked retrieval on this input query Document. */
  public Retrieval[] retrieve(Document doc) {
    return retrieve(doc.hashMapVector());
  }

  /** Perform ranked retrieval on this input query Document vector. */
  public Retrieval[] retrieve(HashMapVector vector) {
    // Create a hashtable to store the retrieved documents. Keys
    // are docRefs and values are DoubleValues which indicate the
    // partial score accumulated for this document so far.
    // As each token in the query is processed, each document
    // it indexes is added to this hashtable and its retrieval
    // score (similarity to the query) is appropriately updated.
    HashMap retrievalHash = new HashMap();
    // Initialize a variable to store the length of the query vector
    double queryLength = 0.0;
    // Iterate through each token in the query input Document
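    // --- The original listing is truncated at this point. ---
    // What follows is a reconstruction sketched from the comments above, not
    // the author's verbatim code. It assumes DoubleValue (ir.utilities)
    // wraps a public double field `value` and that Retrieval pairs a
    // DocumentReference with a similarity score; both are assumptions.
    Iterator queryEntries = vector.iterator();
    while (queryEntries.hasNext()) {
      Map.Entry entry = (Map.Entry)queryEntries.next();
      String token = (String)entry.getKey();
      double count = ((Weight)entry.getValue()).getValue();
      TokenInfo tokenInfo = (TokenInfo)tokenHash.get(token);
      // Skip query tokens that index no documents
      if (tokenInfo == null)
        continue;
      // TF/IDF weight of this token in the query
      double queryWeight = count * tokenInfo.idf;
      // Accumulate the squared query vector length
      queryLength = queryLength + queryWeight * queryWeight;
      // Add this token's contribution to the dot product with every
      // document it occurs in.
      for (int i = 0; i < tokenInfo.occList.size(); i++) {
        TokenOccurrence occ = (TokenOccurrence)tokenInfo.occList.get(i);
        DoubleValue score = (DoubleValue)retrievalHash.get(occ.docRef);
        if (score == null) {
          score = new DoubleValue(0.0);
          retrievalHash.put(occ.docRef, score);
        }
        score.value = score.value + queryWeight * (occ.count * tokenInfo.idf);
      }
    }
    queryLength = Math.sqrt(queryLength);
    // Normalize each accumulated dot product by |Q||D| to obtain the cosine
    // similarity, then package the results. Callers would typically sort
    // these by decreasing score and present MAX_RETRIEVALS at a time.
    Retrieval[] retrievals = new Retrieval[retrievalHash.size()];
    Iterator hashEntries = retrievalHash.entrySet().iterator();
    int j = 0;
    while (hashEntries.hasNext()) {
      Map.Entry entry = (Map.Entry)hashEntries.next();
      DocumentReference docRef = (DocumentReference)entry.getKey();
      double score = ((DoubleValue)entry.getValue()).value / (queryLength * docRef.length);
      retrievals[j++] = new Retrieval(docRef, score);  // assumed constructor
    }
    return retrievals;
  }
}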