📄 topicsignaturemodel.java
字号:
package dragon.ir.kngbase;
import dragon.ir.index.*;
import dragon.nlp.*;
import dragon.matrix.*;
import dragon.util.*;
import java.io.File;
import java.util.*;
/**
* <p>A program for topic signature model estimation</p>
* <p>This program finds a set of weighted terms to represent the semantics of a topic signature. A topic signature
* can be anything here including multiword phrases, concept pairs, and individual terms. There are two approaches
* to the model estimation. One is the maximum likelihood estimator. The other uses the EM algorithm if one can provide
* the distribution of individual terms on a corpus. See more details about the EM algorithm in our previous work.<br><br>
* Zhou, X., Zhang, X., and Hu, X., “Semantic Smoothing of Document Models for Agglomerative Clustering,”
* In the Twentieth International Joint Conference on Artificial Intelligence(IJCAI 07), Hyderabad, India, Jan 6-12,
* 2007, pp. 2928-2933<br><br>One should provide indices for topic signatures and individual terms, respectively, or
* give the occurrence matrix of topic signatures and individual terms.</p>
* <p>Copyright: Copyright (c) 2005</p>
* <p>Company: IST, Drexel University</p>
* @author Davis Zhou
* @version 1.0
*/
public class TopicSignatureModel{
// Statistics of the topic signatures; one entry per row handled by genTransMatrix.
private IRSignatureIndexList srcIndexList;
// Statistics of the individual (destination) terms; only assigned by the EM constructors.
private IRSignatureIndexList destIndexList;
// Signature-by-document occurrence matrix; genSignatureTranslation reads the non-zero columns of a signature's row.
private IntSparseMatrix srcSignatureDocMatrix;
// Document-by-term matrix for the destination terms; its column count becomes totalDestSignatureNum.
private IntSparseMatrix destDocSignatureMatrix;
// Co-occurrence matrix of topic signatures (rows) and destination terms (columns); an alternative input to the two doc matrices.
private IntSparseMatrix cooccurMatrix;
// Set to true by the doc-matrix constructors, left at the Java default (false) by the co-occurrence ones.
// NOTE(review): the code that consumes this flag is not visible in this part of the file.
private boolean useDocFrequency;
// Initialized to true by every constructor. NOTE(review): consumer not visible here.
private boolean useMeanTrim;
// When true, genSignatureTranslation post-processes the distribution with emTopicSignatureModel.
private boolean useEM;
// Initialized to 0.001 by every constructor. NOTE(review): presumably a cut-off on translation probabilities - confirm.
private double probThreshold;
// Initialized to 0.5 by every constructor. NOTE(review): presumably the background-model weight used by EM - confirm.
private double bkgCoeffi;
// Scratch array of length totalDestSignatureNum, allocated at the start of genTransMatrix.
private int[] buf;
// Initialized to 15 by every constructor; exposed as the "EM iteration number".
private int iterationNum;
// Number of columns of destDocSignatureMatrix or cooccurMatrix, whichever the constructor received.
private int totalDestSignatureNum;
// Assigned in genTransMatrix from computeDocThreshold; genSignatureTranslation compares a signature's
// document count against it to pick the array-based or the hash-based computation.
private int DOC_THRESH;
/**
 * Creates a model that uses the maximum likelihood estimator and works from
 * document-level occurrence matrices.
 * <p>Defaults: document frequency and mean trimming are enabled, the probability
 * threshold is 0.001, EM is disabled (EM iteration number 15 and background
 * coefficient 0.5 are still preset).</p>
 * @param srcIndexList the statistics of topic signatures in the collection.
 * @param srcSignatureDocMatrix the doc-term matrix for topic signatures.
 * @param destDocSignatureMatrix the doc-term matrix for the individual terms topic signatures will translate to.
 */
public TopicSignatureModel(IRSignatureIndexList srcIndexList, IntSparseMatrix srcSignatureDocMatrix, IntSparseMatrix destDocSignatureMatrix) {
    // Inputs.
    this.srcIndexList = srcIndexList;
    this.srcSignatureDocMatrix = srcSignatureDocMatrix;
    this.destDocSignatureMatrix = destDocSignatureMatrix;

    // Estimation options.
    this.useDocFrequency = true;
    this.useMeanTrim = true;
    this.probThreshold = 0.001;

    // EM settings (EM itself is off in this mode).
    this.useEM = false;
    this.iterationNum = 15;
    this.bkgCoeffi = 0.5;

    // Size of the destination vocabulary.
    this.totalDestSignatureNum = destDocSignatureMatrix.columns();
}
/**
 * Creates a model that uses the maximum likelihood estimator and works from a
 * precomputed co-occurrence matrix.
 * <p>Defaults: mean trimming is enabled, the probability threshold is 0.001, EM is
 * disabled (EM iteration number 15 and background coefficient 0.5 are still preset).
 * The document-frequency option is not assigned here and keeps its initial value.</p>
 * @param srcIndexList the statistics of topic signatures in the collection.
 * @param cooccurMatrix the co-occurrence matrix of topic signatures and individual terms.
 */
public TopicSignatureModel(IRSignatureIndexList srcIndexList, IntSparseMatrix cooccurMatrix) {
    // Inputs.
    this.srcIndexList = srcIndexList;
    this.cooccurMatrix = cooccurMatrix;

    // Estimation options.
    this.useMeanTrim = true;
    this.probThreshold = 0.001;

    // EM settings (EM itself is off in this mode).
    this.useEM = false;
    this.iterationNum = 15;
    this.bkgCoeffi = 0.5;

    // Size of the destination vocabulary.
    this.totalDestSignatureNum = cooccurMatrix.columns();
}
/**
 * Creates a model that uses the EM algorithm and works from a precomputed
 * co-occurrence matrix.
 * <p>Defaults: mean trimming is enabled, the probability threshold is 0.001, EM is
 * enabled with 15 iterations and a background coefficient of 0.5. The
 * document-frequency option is not assigned here and keeps its initial value.</p>
 * @param srcIndexList the statistics of topic signatures in the collection.
 * @param destIndexList the statistics of individual terms in the collection.
 * @param cooccurMatrix the co-occurrence matrix of topic signatures and individual terms.
 */
public TopicSignatureModel(IRSignatureIndexList srcIndexList, IRSignatureIndexList destIndexList, IntSparseMatrix cooccurMatrix) {
    // Inputs.
    this.srcIndexList = srcIndexList;
    this.destIndexList = destIndexList;
    this.cooccurMatrix = cooccurMatrix;

    // Estimation options.
    this.useMeanTrim = true;
    this.probThreshold = 0.001;

    // EM settings.
    this.useEM = true;
    this.iterationNum = 15;
    this.bkgCoeffi = 0.5;

    // Size of the destination vocabulary.
    this.totalDestSignatureNum = cooccurMatrix.columns();
}
/**
 * Creates a model that uses the EM algorithm and works from document-level
 * occurrence matrices.
 * <p>Defaults: document frequency and mean trimming are enabled, the probability
 * threshold is 0.001, EM is enabled with 15 iterations and a background
 * coefficient of 0.5.</p>
 * @param srcIndexList the statistics of topic signatures in the collection.
 * @param srcSignatureDocMatrix the doc-term matrix for topic signatures.
 * @param destIndexList the statistics of individual terms in the collection.
 * @param destDocSignatureMatrix the doc-term matrix for the individual terms topic signatures will translate to.
 */
public TopicSignatureModel(IRSignatureIndexList srcIndexList, IntSparseMatrix srcSignatureDocMatrix, IRSignatureIndexList destIndexList, IntSparseMatrix destDocSignatureMatrix) {
    // Inputs.
    this.srcIndexList = srcIndexList;
    this.srcSignatureDocMatrix = srcSignatureDocMatrix;
    this.destIndexList = destIndexList;
    this.destDocSignatureMatrix = destDocSignatureMatrix;

    // Estimation options.
    this.useDocFrequency = true;
    this.useMeanTrim = true;
    this.probThreshold = 0.001;

    // EM settings.
    this.useEM = true;
    this.iterationNum = 15;
    this.bkgCoeffi = 0.5;

    // Size of the destination vocabulary.
    this.totalDestSignatureNum = destDocSignatureMatrix.columns();
}
/**
 * Turns the EM refinement step of {@link #genSignatureTranslation(int)} on or off.
 * @param option true to run the EM step, false to return the raw distribution.
 */
public void setUseEM(boolean option) {
    useEM = option;
}
/**
 * Tells whether the EM refinement step is enabled.
 * @return the current value of the EM option.
 */
public boolean getUseEM() {
    return this.useEM;
}
/**
 * Sets the EM background coefficient (preset to 0.5 by the constructors).
 * No range check is performed on the value.
 * @param coeffi the new background coefficient.
 */
public void setEMBackgroundCoefficient(double coeffi) {
    bkgCoeffi = coeffi;
}
/**
 * Gets the EM background coefficient.
 * @return the current background coefficient.
 */
public double getEMBackgroundCoefficient() {
    return bkgCoeffi;
}
/**
 * Sets the EM iteration number (preset to 15 by the constructors).
 * No range check is performed on the value.
 * @param iterationNum the new iteration number.
 */
public void setEMIterationNum(int iterationNum) {
    // The parameter shadows the field, so the qualifier is required.
    this.iterationNum = iterationNum;
}
/**
 * Gets the EM iteration number.
 * @return the current iteration number.
 */
public int getEMIterationNum() {
    return iterationNum;
}
/**
 * Sets the document-frequency option (preset to true by the constructors that
 * take document-level matrices).
 * @param option the new value of the option.
 */
public void setUseDocFrequency(boolean option) {
    useDocFrequency = option;
}
/**
 * Gets the document-frequency option.
 * @return the current value of the option.
 */
public boolean getUseDocFrequency() {
    return this.useDocFrequency;
}
/**
 * Sets the mean-trim option (preset to true by the constructors).
 * @param option the new value of the option.
 */
public void setUseMeanTrim(boolean option) {
    useMeanTrim = option;
}
/**
 * Gets the mean-trim option.
 * @return the current value of the option.
 */
public boolean getUseMeanTrim() {
    return this.useMeanTrim;
}
/**
 * Sets the probability threshold (preset to 0.001 by the constructors).
 * No range check is performed on the value.
 * @param threshold the new probability threshold.
 */
public void setProbThreshold(double threshold) {
    probThreshold = threshold;
}
/**
 * Gets the probability threshold.
 * @return the current probability threshold.
 */
public double getProbThreshold() {
    return this.probThreshold;
}
/**
 * Generates the translation matrix of all topic signatures and stores it on disk,
 * together with its transpose.
 * <p>The matrix is written to <code>matrixPath/matrixKey.index</code> and
 * <code>matrixPath/matrixKey.matrix</code>; the transpose goes to
 * <code>matrixPath/matrixKeyt.index</code> and <code>matrixPath/matrixKeyt.matrix</code>.
 * Existing files with these names are deleted first.</p>
 * <p>A signature is skipped when its document frequency is below
 * <code>minDocFrequency</code> or, if a co-occurrence matrix is in use, when its row
 * has fewer than 5 non-zero entries. Progress is printed to standard output every
 * 1000 rows.</p>
 * @param minDocFrequency the minimum document frequency a topic signature needs in order to be translated.
 * @param matrixPath the folder the output files are written to.
 * @param matrixKey the base name of the output files.
 * @return always true.
 */
public boolean genTransMatrix(int minDocFrequency,String matrixPath, String matrixKey){
    DoubleSuperSparseMatrix transMatrix, transposedMatrix;
    ArrayList translation;
    Token token;
    File stale;
    String prefix;
    String indexFile, matrixFile, transposedIndexFile, transposedMatrixFile;
    String[] staleFiles;
    int signatureNum, pendingCells, translationSize;
    int row, k;

    prefix=matrixPath+"/"+matrixKey;
    indexFile=prefix+".index";
    matrixFile=prefix+".matrix";
    transposedIndexFile=prefix+"t.index";
    transposedMatrixFile=prefix+"t.matrix";

    // Remove the output of a previous run; the result of delete() is not checked.
    staleFiles=new String[]{matrixFile, indexFile, transposedMatrixFile, transposedIndexFile};
    for(k=0;k<staleFiles.length;k++){
        stale=new File(staleFiles[k]);
        if(stale.exists()){
            stale.delete();
        }
    }

    // Automatic flushing is effectively disabled; flushing is driven by pendingCells below.
    transMatrix=new DoubleSuperSparseMatrix(indexFile, matrixFile,false,false);
    transMatrix.setFlushInterval(Integer.MAX_VALUE);
    transposedMatrix=new DoubleSuperSparseMatrix(transposedIndexFile, transposedMatrixFile,false,false);
    transposedMatrix.setFlushInterval(Integer.MAX_VALUE);

    pendingCells=0;
    signatureNum=srcIndexList.size();
    buf=new int[totalDestSignatureNum];
    if(destDocSignatureMatrix!=null){
        // Only the document-matrix mode needs the array/hash switch-over point.
        this.DOC_THRESH=computeDocThreshold(destDocSignatureMatrix);
    }

    for(row=0;row<signatureNum;row++){
        if(row%1000==0){
            System.out.println((new java.util.Date()).toString()+" Processing Row#"+row);
        }
        // Skip signatures that are too rare or have too few co-occurring terms.
        if(srcIndexList.getIRSignature(row).getDocFrequency()<minDocFrequency
            || (cooccurMatrix!=null && cooccurMatrix.getNonZeroNumInRow(row)<5)){
            continue;
        }

        translation=genSignatureTranslation(row);
        translationSize=translation.size();
        for(k=0;k<translationSize;k++){
            token=(Token)translation.get(k);
            transMatrix.add(row,token.getIndex(),token.getWeight());
            transposedMatrix.add(token.getIndex(), row, token.getWeight());
        }
        pendingCells+=translationSize;
        translation.clear();

        // Write both matrices out once enough cells have piled up in memory.
        if(pendingCells>=5000000){
            transposedMatrix.flush();
            transMatrix.flush();
            pendingCells=0;
        }
    }

    transposedMatrix.finalizeData();
    transposedMatrix.close();
    transMatrix.finalizeData();
    transMatrix.close();
    return true;
}
/**
 * Computes the translation (a list of weighted tokens) of one topic signature.
 * <p>If a signature-document matrix is available, the documents containing the
 * signature are looked up and the distribution is computed either by array (more
 * documents than the document threshold) or by hash (otherwise). Without that
 * matrix the co-occurrence matrix is used. When EM is enabled the result is passed
 * through the EM step before it is returned.</p>
 * @param srcSignatureIndex the index of the topic signature.
 * @return the list of tokens the topic signature translates to.
 */
public ArrayList genSignatureTranslation(int srcSignatureIndex){
    ArrayList translation;

    if(srcSignatureDocMatrix==null){
        translation=computeDistributionByCooccurMatrix(srcSignatureIndex);
    }
    else{
        int[] docs=srcSignatureDocMatrix.getNonZeroColumnsInRow(srcSignatureIndex);
        if(docs.length<=DOC_THRESH){
            translation=computeDistributionByHash(docs);
        }
        else{
            translation=computeDistributionByArray(docs);
        }
    }

    if(!useEM){
        return translation;
    }
    return emTopicSignatureModel(translation);
}
/**
 * Derives the document-count threshold used to choose between the array-based and
 * the hash-based distribution computation: the number of columns of the matrix
 * divided by the value of computeAvgTermNum and then by 8, truncated to an int.
 * @param doctermMatrix the document-term matrix the threshold is derived from.
 * @return the threshold.
 */
private int computeDocThreshold(IntSparseMatrix doctermMatrix){
    int columnNum=doctermMatrix.columns();
    double avgTermNum=computeAvgTermNum(doctermMatrix);
    double threshold=columnNum/avgTermNum/8.0;
    return (int)threshold;
}
private double computeAvgTermNum(IntSparseMatrix doctermMatrix){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -