📄 .#merg26221cvs
字号:
/*
* TermAnalysis.java
*
* Created on October 30, 2006, 2:01 PM
*
* To change this template, choose Tools | Template Manager
* and open the template in the editor.
*/
package TestIndexing;
import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeSet;
import java.util.regex.Pattern;
import javax.management.Query;
/**
 * Builds a term dictionary and an inverted index from a tagged text file and
 * evaluates tagged query files against that index.
 *
 * <p>Input files are read line by line. Lines starting with {@code .I} open a
 * new document (or query); lines starting with certain other dot-tags switch
 * term collection on, and any other line starting with {@code .} switches it
 * off. Text lines are split into tokens, lower-cased, filtered against the
 * loaded stop words and kept only when they consist solely of ASCII letters.
 *
 * <p>Files are decoded with the platform default charset, as
 * {@link FileReader} does.
 *
 * @author Alpha
 */
public class TermAnalysis {

    /** Initial capacity of the two index tables. */
    private static final int INITIAL_CAPACITY = 1000;

    /** Characters that separate tokens on a text line. */
    private static final String DELIMITERS = " \t\n\r\f.?!,;:()\"\'-";

    /** A token is accepted only when it consists solely of ASCII letters. */
    private static final Pattern WORD = Pattern.compile("[a-zA-Z]*");

    /** Maximum number of top-ranked terms averaged into the Zipf constant. */
    private static final int ZIPF_SAMPLE_SIZE = 1000;

    /** Document collection to index. */
    private File source;

    /** File listing one stop word per line. */
    private File stoppedWord = null;

    /**
     * Holds value of property queries.
     */
    private File queries;

    /** Term name -> dictionary entry. */
    private final Hashtable<String, Term> termTable;

    /** Term ID (as a string) -> postings of that term. */
    private final Hashtable<String, InverseTerm> inverseTermTable;

    /** Lower-cased stop words; a set so each token lookup is constant time. */
    private final Set<String> stopWords = new HashSet<String>();

    /** Number of {@code .I} lines seen by the last {@link #process()} run. */
    private int numDoc;

    /** Number of accepted tokens counted by the last {@link #process()} run. */
    private int numTerm;

    /** Creates a new instance of TermAnalysis without a source file. */
    public TermAnalysis() {
        this(null);
    }

    /**
     * Creates a new instance that will index the given file.
     *
     * @param f the document collection; may be {@code null} and set later
     *          through {@link #setSource(File)}
     */
    public TermAnalysis(File f) {
        source = f;
        termTable = new Hashtable<String, Term>(INITIAL_CAPACITY);
        inverseTermTable = new Hashtable<String, InverseTerm>(INITIAL_CAPACITY);
    }

    /**
     * Returns the live inverted index, keyed by term ID rendered as a string.
     *
     * @return the internal table itself (not a copy)
     */
    public Hashtable<String, InverseTerm> getInverseTermTable() {
        return this.inverseTermTable;
    }

    /**
     * Indexes the source file into the term table and the inverted index.
     *
     * <p>Terms are collected from text lines that follow a {@code .T},
     * {@code .K} or {@code .W} line. The document and term counters are reset
     * on every call, but the tables are not cleared, so calling this method
     * again adds to the existing index.
     *
     * <p>I/O errors are reported on standard error and end the run early. When
     * no source file is set, a message is printed and nothing is indexed.
     *
     * @throws NumberFormatException if a term is met while the last token of
     *         the current {@code .I} line is not an integer
     */
    public void process() {
        numDoc = 0;
        numTerm = 0;
        if (source == null) {
            System.err.println("TermAnalysis.process: no source file has been set");
            return;
        }
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(source));
            boolean collecting = false;
            String docLabel = "";
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith(".I")) {
                    numDoc++;
                    collecting = false;
                    // The document identifier is the last space-separated token.
                    String[] parts = line.split(" ");
                    docLabel = parts[parts.length - 1];
                } else if (line.startsWith(".T") || line.startsWith(".K") || line.startsWith(".W")) {
                    collecting = true;
                } else if (line.startsWith(".")) {
                    collecting = false;
                } else if (collecting) {
                    for (String name : tokenize(line)) {
                        indexTerm(name, docLabel);
                        numTerm++;
                    }
                }
            }
        } catch (FileNotFoundException ex) {
            ex.printStackTrace();
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            // Release the file handle even when reading or parsing fails.
            closeAndReport(reader);
        }
    }

    /** Records one occurrence of {@code name} in the document labelled {@code docLabel}. */
    private void indexTerm(String name, String docLabel) {
        int docId = Integer.parseInt(docLabel);
        Term term = termTable.get(name);
        if (term == null) {
            term = new Term(name, termTable.size());
            InverseTerm postings = new InverseTerm(term.getTermID());
            postings.addDocTime(docId);
            // NOTE(review): the two-argument Term constructor receives no document
            // label, so remember the creating document here; otherwise a repeat in
            // this same document is counted below as a new document. Confirm
            // against Term's initial document count.
            term.setLastOccurance(docLabel);
            termTable.put(name, term);
            inverseTermTable.put(String.valueOf(term.getTermID()), postings);
        } else {
            InverseTerm postings = inverseTermTable.get(String.valueOf(term.getTermID()));
            term.addOccurance();
            postings.addDocTime(docId);
            if (!docLabel.equals(term.getLastOccurance())) {
                term.addOccuranceDoc();
                term.setLastOccurance(docLabel);
            }
        }
    }

    /** Splits a text line into lower-cased, letters-only tokens that are not stop words. */
    private List<String> tokenize(String line) {
        List<String> words = new ArrayList<String>();
        StringTokenizer tokens = new StringTokenizer(line, DELIMITERS);
        while (tokens.hasMoreTokens()) {
            // A fixed locale keeps the ASCII-letter check independent of the
            // platform's default locale.
            String name = tokens.nextToken().toLowerCase(Locale.ENGLISH);
            if (stopWords.contains(name)) {
                continue;
            }
            if (!WORD.matcher(name).matches()) {
                continue;
            }
            words.add(name);
        }
        return words;
    }

    /** Closes the resource if it is non-null, reporting a failure on standard error. */
    private static void closeAndReport(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Sets the document collection to index.
     *
     * @param f the new source file
     */
    public void setSource(File f) {
        source = f;
    }

    /**
     * Returns the document collection to index.
     *
     * @return the source file, possibly {@code null}
     */
    public File getSource() {
        return source;
    }

    /**
     * Returns the number of {@code .I} lines seen by the last {@link #process()} run.
     *
     * @return the document count
     */
    public int getNumDoc() {
        return numDoc;
    }

    /**
     * Returns the number of accepted tokens counted by the last {@link #process()} run.
     *
     * @return the token count, repeats included
     */
    public int getNumTerm() {
        return numTerm;
    }

    /**
     * Returns the number of distinct terms in the dictionary.
     *
     * @return the size of the term table
     */
    public int getNumUniqueTerm() {
        return termTable.size();
    }

    /**
     * Returns every dictionary entry in no particular order.
     *
     * @return a new array holding all terms
     */
    public Term[] getAllTerms() {
        // The typed overload is required: the no-argument toArray() returns an
        // Object[] that cannot be cast to Term[].
        return termTable.values().toArray(new Term[0]);
    }

    /**
     * Returns every dictionary entry sorted by the natural ordering of {@code Term}.
     *
     * <p>An array sort is used instead of a sorted set so that terms comparing
     * as equal to each other are all retained.
     *
     * @return a new sorted array holding all terms
     */
    public Term[] getSortedTerms() {
        Term[] terms = getAllTerms();
        Arrays.sort(terms);
        return terms;
    }

    /**
     * Builds a plain-text statistics report over the sorted terms.
     *
     * <p>The report lists the first 21 terms and those at ranks 100, 500, 1000
     * and 10000, then the number of terms with an occurrence count of one and
     * with a document count of one. The Zipf constant is the mean of
     * {@code rank * occurrences} over at most the first
     * {@value #ZIPF_SAMPLE_SIZE} terms, with ranks starting at 1; the closing
     * lines divide its integer part by the rank.
     *
     * @return the report text
     */
    public String getFinalResult() {
        StringBuilder result = new StringBuilder();
        Term[] terms = getSortedTerms();
        int occurOnce = 0;
        int occurOneDoc = 0;
        float rankTimesOccurrences = 0.0f;
        int sampled = 0;
        for (int i = 0; i < terms.length; i++) {
            Term term = terms[i];
            int rank = i + 1;
            if (i <= 20 || rank == 100 || rank == 500 || rank == 1000 || rank == 10000) {
                result.append(rank).append("\t").append(term.getName())
                        .append(" df(").append(term.getOccuranceDoc())
                        .append(") and numOccurs(").append(term.getOccurance())
                        .append(")\n");
            }
            if (i < ZIPF_SAMPLE_SIZE) {
                // Multiply in float so a large rank * count cannot overflow int.
                rankTimesOccurrences += (float) rank * term.getOccurance();
                sampled++;
            }
            if (term.getOccurance() == 1) {
                occurOnce++;
            }
            if (term.getOccuranceDoc() == 1) {
                occurOneDoc++;
            }
        }
        // Average over the terms actually sampled so short dictionaries are not skewed.
        float zipfConstant = (sampled == 0) ? 0.0f : rankTimesOccurrences / sampled;
        result.append(occurOnce + " terms occur only once.\n");
        result.append(occurOneDoc + " terms occur in only one document.\n");
        result.append("Zipf Constant is " + zipfConstant + ".\n");
        result.append("100th term occur " + (int) zipfConstant / 100 + ".\n");
        result.append("500th term occur " + (int) zipfConstant / 500 + ".\n");
        result.append("1000th term occur " + (int) zipfConstant / 1000 + ".\n");
        result.append("10000th term occur " + (int) zipfConstant / 10000 + ".\n");
        return result.toString();
    }

    /**
     * Sets the stop-word file.
     *
     * @param f file with one stop word per line
     */
    public void setStoppedWord(File f) {
        this.stoppedWord = f;
    }

    /**
     * Returns the stop-word file.
     *
     * @return the stop-word file, possibly {@code null}
     */
    public File getStoppedWord() {
        return this.stoppedWord;
    }

    /**
     * Loads the stop-word file, adding each non-blank line, trimmed and
     * lower-cased, to the stop-word set.
     *
     * <p>Read errors after the file has been opened are reported on standard
     * error; words read up to that point are kept.
     *
     * @throws FileNotFoundException if no stop-word file is set or it cannot
     *         be opened
     */
    public void processStoppedWordFile() throws FileNotFoundException {
        if (stoppedWord == null) {
            throw new FileNotFoundException("No stop-word file has been set");
        }
        BufferedReader reader = new BufferedReader(new FileReader(stoppedWord));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                // Tokens never contain whitespace, so an untrimmed entry could never match.
                String word = line.trim().toLowerCase(Locale.ENGLISH);
                if (word.length() != 0) {
                    stopWords.add(word);
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            closeAndReport(reader);
        }
    }

    /**
     * Writes the string form of every term, in sorted order, to the named file.
     *
     * <p>I/O errors are reported on standard error.
     *
     * @param fileName path of the dictionary file to create or overwrite
     * @return the absolute path of that file
     */
    public String writeDict(String fileName) {
        File dict = new File(fileName);
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter(dict));
            for (Term term : getSortedTerms()) {
                writer.append(term.toString());
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            // Closing also flushes; a failure here is reported like any other I/O error.
            closeAndReport(writer);
        }
        return dict.getAbsolutePath();
    }

    /**
     * Writes the string form of every term's postings, in sorted term order,
     * to the named file. Terms without postings are skipped.
     *
     * <p>I/O errors are reported on standard error.
     *
     * @param fileName path of the postings file to create or overwrite
     * @return the absolute path of that file
     */
    public String writeInvDoc(String fileName) {
        File invDoc = new File(fileName);
        BufferedWriter writer = null;
        try {
            writer = new BufferedWriter(new FileWriter(invDoc));
            for (Term term : getSortedTerms()) {
                InverseTerm postings = inverseTermTable.get(String.valueOf(term.getTermID()));
                if (postings != null) {
                    writer.append(postings.toString());
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            closeAndReport(writer);
        }
        return invDoc.getAbsolutePath();
    }

    /**
     * Parses a tagged query file into one term list per query.
     *
     * <p>Each line starting with {@code ".I "} opens a new query. Terms are
     * collected from text lines that follow a {@code .W} or {@code .A} line;
     * a repeated term raises the occurrence count of the existing entry
     * instead of being added again. Text met before the first query header is
     * ignored.
     *
     * <p>I/O errors are reported on standard error; queries parsed up to that
     * point are returned.
     *
     * @param file the query file; when {@code null} the source file is read
     *             instead, which is what earlier versions always did
     * @return the parsed queries in file order, never {@code null}
     */
    public ArrayList<ArrayList<Term>> processQuery(File file) {
        ArrayList<ArrayList<Term>> parsed = new ArrayList<ArrayList<Term>>();
        File input = (file != null) ? file : source;
        if (input == null) {
            System.err.println("TermAnalysis.processQuery: no query file has been set");
            return parsed;
        }
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(input));
            boolean collecting = false;
            ArrayList<Term> query = null;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith(".I ")) {
                    collecting = false;
                    query = new ArrayList<Term>();
                    parsed.add(query);
                } else if (line.startsWith(".W") || line.startsWith(".A")) {
                    collecting = true;
                } else if (line.startsWith(".")) {
                    collecting = false;
                } else if (collecting) {
                    for (String name : tokenize(line)) {
                        addTerm(query, new Term(name));
                    }
                }
            }
        } catch (IOException ex) {
            ex.printStackTrace();
        } finally {
            closeAndReport(reader);
        }
        return parsed;
    }

    /**
     * Adds {@code term} to {@code query}, or bumps the occurrence count of the
     * entry with the same name when one is already present. Does nothing when
     * {@code query} is {@code null}.
     */
    private void addTerm(ArrayList<Term> query, Term term) {
        if (query == null) {
            return;
        }
        for (Term existing : query) {
            if (existing.getName().equals(term.getName())) {
                existing.addOccurance();
                return;
            }
        }
        query.add(term);
    }

    /**
     * Renders parsed queries as text: a {@code "Query n:"} line followed by a
     * line of {@code name[occurrences]+} entries for each query.
     *
     * @param queries the parsed queries, as returned by {@link #processQuery(File)}
     * @return the rendered text
     */
    public String printQueries(ArrayList<ArrayList<Term>> queries) {
        StringBuilder sb = new StringBuilder();
        int number = 0;
        for (ArrayList<Term> query : queries) {
            number++;
            sb.append("Query ").append(number).append(":\n");
            for (Term term : query) {
                sb.append(term.getName()).append("[").append(term.getOccurance()).append("]+");
            }
            sb.append("\n");
        }
        return sb.toString();
    }

    /**
     * Getter for property queries.
     * @return Value of property queries.
     */
    public File getQueries() {
        return this.queries;
    }

    /**
     * Setter for property queries.
     * @param queries New value of property queries.
     */
    public void setQueries(File queries) {
        this.queries = queries;
    }

    /**
     * Runs every query of the configured query file against the index and
     * merges the postings of all matching terms into a single result.
     *
     * <p>Query terms are matched to indexed terms by name, because terms
     * created while parsing a query are not given an index ID. For each match,
     * the indexed term's postings are merged together with the term's
     * occurrence count in the query.
     *
     * @return the merged result, or {@code null} when the index is empty or
     *         no query could be parsed
     */
    public InverseTerm startQuery() {
        if (this.inverseTermTable.isEmpty()) {
            return null;
        }
        ArrayList<ArrayList<Term>> parsedQueries = this.processQuery(this.getQueries());
        if (parsedQueries == null || parsedQueries.isEmpty()) {
            System.out.println("queries is empty");
            return null;
        }
        InverseTerm result = new InverseTerm();
        for (ArrayList<Term> query : parsedQueries) {
            for (Term queryTerm : query) {
                Term indexed = termTable.get(queryTerm.getName());
                if (indexed == null) {
                    // The term does not occur in any indexed document.
                    continue;
                }
                InverseTerm postings = inverseTermTable.get(String.valueOf(indexed.getTermID()));
                if (postings == null) {
                    continue;
                }
                result.plusDocTime(postings.getDocTimes(), queryTerm.getOccurance());
            }
        }
        return result;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -