Document.java
package ir.vsr;

import java.io.*;
import java.util.*;

import ir.utilities.*;

/**
 * Document is an abstract class that provides for tokenization
 * of a document with stop-word removal and an iterator-like interface
 * similar to StringTokenizer.
 * Also provides a method for converting a document into a
 * vector-space bag-of-words in the form of a HashMap of
 * tokens and their occurrence counts.
 *
 * @author Ray Mooney
 */
public abstract class Document {

  /** The file where a list of stopwords, one per line, is stored. */
  protected static final String stopWordsFile = "d:\\stopwords.txt";

  /** The number of stopwords in this file. */
  protected static final int numStopWords = 514;

  /** The hash set where stopwords are indexed. */
  protected static HashSet<String> stopWords = null;

  /** The Porter stemmer. */
  protected static Porter stemmer = new Porter();

  /** The next token in the document. */
  protected String nextToken = null;

  /** The number of tokens currently read from the document. */
  protected int numTokens = 0;

  /** Whether to stem tokens with the Porter stemmer. */
  protected boolean stem = false;

  /**
   * Creates a new Document, making sure that the stopwords
   * are loaded, indexed, and ready for use. Subclasses that create
   * concrete instances MUST call prepareNextToken before finishing
   * construction to ensure that the first token is precomputed
   * and available.
   */
  public Document(boolean stem) {
    this.stem = stem;
    if (stopWords == null)
      loadStopWords();
  }

  /** Returns true iff the document contains more tokens. */
  public boolean hasMoreTokens() {
    return nextToken != null;
  }

  /** Returns the next token in the document, or null if there are none. */
  public String nextToken() {
    String token = nextToken;
    if (token == null)
      return null;
    prepareNextToken();
    numTokens++;
    return token;
  }

  /**
   * Precomputes and stores the next token in the nextToken slot.
   * Performs stop-word removal of candidate tokens.
   */
  protected void prepareNextToken() {
    // Loop until a non-stopword token is found
    do {
      nextToken = getNextCandidateToken();
      if (nextToken == null)
        return; // reached end of document
      // Normalize token string case to lower case.
      nextToken = nextToken.toLowerCase();
      // Do not include a token found in the stopword list as indexed
      // in the stopwords hash set.
      // Also do not include tokens that are not all Unicode letters.
      if (stopWords.contains(nextToken) || !allLetters(nextToken))
        nextToken = null;
      else if (stem) {
        nextToken = stemmer.stripAffixes(nextToken);
        // The stemmed form may itself be a stopword; filter it too.
        if (stopWords.contains(nextToken))
          nextToken = null;
      }
    } while (nextToken == null);
  }

  /**
   * Checks whether this token consists entirely of Unicode letters,
   * to eliminate other bizarre tokens.
   */
  protected boolean allLetters(String token) {
    for (int i = 0; i < token.length(); i++) {
      if (!Character.isLetter(token.charAt(i)))
        return false;
    }
    return true;
  }

  /**
   * Returns the next possible token in the document. Each subclass must
   * implement this method to produce candidate tokens for subsequent
   * stop-word filtering.
   */
  protected abstract String getNextCandidateToken();

  /**
   * Returns the total number of tokens in the document, or -1 if there
   * are still more tokens to be read and the total count is not yet
   * available.
   */
  public int numberOfTokens() {
    if (nextToken == null)
      return numTokens;
    else
      return -1;
  }

  /** Loads the stopwords from file into the hash set where they are indexed. */
  protected static void loadStopWords() {
    // Size the hash set for the known number of stopwords in the file,
    // assuming the default 75% load factor, with 10 extra slots for
    // spare room.
    int hashSetSize = (int) (numStopWords / 0.75 + 10);
    stopWords = new HashSet<String>(hashSetSize);
    String line;
    try {
      // Open stopword file for reading
      BufferedReader in = new BufferedReader(new FileReader(stopWordsFile));
      // Read in stopwords, one per line, until the file is empty
      while ((line = in.readLine()) != null) {
        stopWords.add(line);
      }
      in.close();
    } catch (IOException e) {
      System.out.println("\nCould not load stopwords file: " + stopWordsFile);
      System.exit(1);
    }
  }

  /**
   * Returns a HashMapVector version of the term-vector (bag of words) for
   * this document, where each token is a key whose value is the number of
   * times it occurs in the document as stored in a Weight. Returns null
   * if tokens have already been read from the document.
   *
   * @see Weight
   */
  public HashMapVector hashMapVector() {
    // The vector can only be computed from a freshly opened document.
    if (numTokens != 0)
      return null;
    HashMapVector vector = new HashMapVector();
    // Process each token in the document and add it to the vector
    while (hasMoreTokens()) {
      String token = nextToken();
      vector.increment(token);
    }
    return vector;
  }

  /**
   * Computes and prints out (one line per term) the term-vector
   * (bag of words) for this document.
   */
  public void printVector() {
    hashMapVector().print();
  }
}
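To make the constructor contract concrete (subclasses MUST call prepareNextToken before finishing construction), here is a minimal sketch of a concrete subclass. StringDocument is a hypothetical name, not part of the original ir.vsr distribution, and its delimiter set is an illustrative assumption; it tokenizes an in-memory string with java.util.StringTokenizer and lets the abstract class handle lower-casing, stop-word removal, and optional stemming.

package ir.vsr;

import java.util.StringTokenizer;

/**
 * Hypothetical example subclass (not part of the original ir.vsr code):
 * tokenizes an in-memory string and relies on Document for filtering.
 */
public class StringDocument extends Document {

  /** Produces raw candidate tokens; Document filters stopwords and non-letters. */
  protected StringTokenizer tokenizer;

  public StringDocument(String text, boolean stem) {
    super(stem);
    // Split on whitespace and common punctuation; this delimiter set is
    // an assumption for illustration, not taken from the original code.
    tokenizer = new StringTokenizer(text, " \t\n\r\f.,;:!?'\"()");
    // Required by the Document contract: precompute the first token
    // before the constructor finishes.
    prepareNextToken();
  }

  /** Returns the next raw candidate token, or null at end of document. */
  protected String getNextCandidateToken() {
    if (tokenizer.hasMoreTokens())
      return tokenizer.nextToken();
    return null;
  }
}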
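Assuming the hypothetical StringDocument above, this usage sketch exercises both interfaces the class documentation describes: the StringTokenizer-style iteration and the bag-of-words conversion. Note that hashMapVector() returns null once any tokens have been consumed, so each Document instance supports a single pass; the example builds a fresh document for each use.

package ir.vsr;

/** Hypothetical usage sketch (not part of the original distribution). */
public class DocumentDemo {
  public static void main(String[] args) {
    String text = "Information retrieval systems retrieve relevant documents.";

    // Iterator-style access: stopwords removed, tokens lower-cased.
    Document doc = new StringDocument(text, false);
    while (doc.hasMoreTokens())
      System.out.println(doc.nextToken());

    // Bag-of-words access: needs a fresh document, since hashMapVector()
    // returns null once any tokens have already been read.
    Document doc2 = new StringDocument(text, false);
    HashMapVector vector = doc2.hashMapVector();
    vector.print();
  }
}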