📄 topicsignaturemodel.java
字号:
Random random;
int i, num, index;
double sum;
random=new Random();
num=Math.min(50,doctermMatrix.rows());
sum=0;
for(i=0;i<num;i++){
index=random.nextInt(doctermMatrix.rows());
sum+=doctermMatrix.getNonZeroNumInRow(index);
}
return sum/num;
}
/**
 * Builds a weighted token list from one row of the co-occurrence matrix.
 *
 * The non-zero columns and integer scores of row {@code signatureIndex} are read
 * from {@code cooccurMatrix}. A cutoff is derived from the row: the mean score when
 * {@code useMeanTrim} is set, otherwise 0.5; the cutoff is then raised to
 * {@code rowTotal * getMinInitProb()} if that value is larger. Entries whose score
 * reaches the cutoff become {@code Token}s, and each token's weight is set to its
 * frequency divided by the sum of the retained scores.
 *
 * @param signatureIndex row of the co-occurrence matrix to read
 * @return list of retained {@code Token} objects with weights assigned
 */
private ArrayList computeDistributionByCooccurMatrix(int signatureIndex){
    int[] columns=cooccurMatrix.getNonZeroColumnsInRow(signatureIndex);
    int[] scores=cooccurMatrix.getNonZeroIntScoresInRow(signatureIndex);

    // Sum of all scores in the row.
    double total=0;
    for(int k=0;k<scores.length;k++)
        total+=scores[k];

    // Trimming cutoff, never lower than the minimum-probability floor.
    double cutoff;
    if(useMeanTrim)
        cutoff=total/scores.length;
    else
        cutoff=0.5;
    double floor=total*getMinInitProb();
    if(cutoff<floor)
        cutoff=floor;

    // Keep only the entries that reach the cutoff, tracking their combined score.
    ArrayList kept=new ArrayList();
    double keptTotal=0;
    for(int k=0;k<scores.length;k++){
        if(scores[k]>=cutoff){
            kept.add(new Token(columns[k],scores[k]));
            keptTotal+=scores[k];
        }
    }

    // Normalize the retained frequencies into weights.
    for(int k=0;k<kept.size();k++){
        Token token=(Token)kept.get(k);
        token.setWeight(token.getFrequency()/keptTotal);
    }
    return kept;
}
/**
 * Aggregates signature counts over a set of documents using a dense int array
 * and converts the result into a weighted token list.
 *
 * For every row index in {@code arrDoc}, the non-zero columns of
 * {@code destDocSignatureMatrix} are accumulated into the shared {@code buf} array:
 * each column adds 1 when {@code useDocFrequency} is set, otherwise it adds the
 * row's integer score for that column. The cutoff is the mean over the positive
 * slots when {@code useMeanTrim} is set, otherwise 0.5, and is raised to
 * {@code total * getMinInitProb()} if that is larger. Slots reaching the cutoff
 * become {@code Token}s whose weight is their frequency divided by the sum of the
 * retained counts.
 *
 * @param arrDoc row indices of {@code destDocSignatureMatrix} to aggregate
 * @return list of retained {@code Token} objects with weights assigned
 */
private ArrayList computeDistributionByArray(int[] arrDoc){
    // The accumulator is allocated on first use and zeroed on every call.
    if(buf==null)
        buf=new int[totalDestSignatureNum];
    MathUtil.initArray(buf,0);

    for(int d=0;d<arrDoc.length;d++){
        int[] columns=destDocSignatureMatrix.getNonZeroColumnsInRow(arrDoc[d]);
        if(useDocFrequency){
            for(int c=0;c<columns.length;c++)
                buf[columns[c]]+=1;
        }
        else{
            int[] scores=destDocSignatureMatrix.getNonZeroIntScoresInRow(arrDoc[d]);
            for(int c=0;c<columns.length;c++)
                buf[columns[c]]+=scores[c];
        }
    }

    // Total and number of the positive slots.
    int positives=0;
    double total=0;
    for(int s=0;s<buf.length;s++){
        if(buf[s]>0){
            positives++;
            total+=buf[s];
        }
    }

    // Trimming cutoff, never lower than the minimum-probability floor.
    double cutoff;
    if(useMeanTrim)
        cutoff=total/positives;
    else
        cutoff=0.5;
    double floor=total*getMinInitProb();
    if(cutoff<floor)
        cutoff=floor;

    // Keep the slots that reach the cutoff; the slot index becomes the token index.
    ArrayList kept=new ArrayList();
    double keptTotal=0;
    for(int s=0;s<buf.length;s++){
        if(buf[s]>=cutoff){
            kept.add(new Token(s,buf[s]));
            keptTotal+=buf[s];
        }
    }

    // Normalize the retained frequencies into weights.
    for(int k=0;k<kept.size();k++){
        Token token=(Token)kept.get(k);
        token.setWeight(token.getFrequency()/keptTotal);
    }
    return kept;
}
/**
 * Aggregates signature counts over a set of documents via
 * {@code countTokensByHashMap} and converts the result into a weighted token list.
 *
 * Trimming is applied only when {@code useMeanTrim} is set or when
 * {@code total * getMinInitProb()} exceeds 1. In that case the cutoff is the mean
 * frequency (with {@code useMeanTrim}) or 0.5, raised to
 * {@code total * getMinInitProb()} if that is larger; tokens reaching the cutoff are
 * copied to a new list and the intermediate list is cleared. Otherwise the counted
 * list is used as is. Each returned token's weight is its frequency divided by the
 * sum of the frequencies in the returned list.
 *
 * @param arrDoc row indices passed on to {@code countTokensByHashMap}
 * @return list of {@code Token} objects with weights assigned
 */
private ArrayList computeDistributionByHash(int[] arrDoc){
    ArrayList counted=countTokensByHashMap(arrDoc);

    double normalizer=0;
    for(int k=0;k<counted.size();k++)
        normalizer+=((Token)counted.get(k)).getFrequency();

    ArrayList result;
    if(!useMeanTrim && !(normalizer*getMinInitProb()>1)){
        // No trimming required: every counted token is kept.
        result=counted;
    }
    else{
        double cutoff;
        if(useMeanTrim)
            cutoff=normalizer/counted.size();
        else
            cutoff=0.5;
        double floor=normalizer*getMinInitProb();
        if(cutoff<floor)
            cutoff=floor;

        // Re-accumulate the normalizer over the retained tokens only.
        result=new ArrayList();
        normalizer=0;
        for(int k=0;k<counted.size();k++){
            Token candidate=(Token)counted.get(k);
            if(candidate.getFrequency()>=cutoff){
                result.add(candidate);
                normalizer+=candidate.getFrequency();
            }
        }
        counted.clear();
    }

    // Normalize frequencies into weights.
    for(int k=0;k<result.size();k++){
        Token token=(Token)result.get(k);
        token.setWeight(token.getFrequency()/normalizer);
    }
    return result;
}
/**
 * Counts signatures over a set of documents with a hash map keyed by {@code Token}.
 *
 * For every row index in {@code arrDoc} that has at least one non-zero entry in
 * {@code destDocSignatureMatrix}, each non-zero column contributes 1 when
 * {@code useDocFrequency} is set, otherwise the row's integer score for that column.
 * Contributions for tokens that compare equal as map keys are summed in a
 * {@code Counter}. Finally each key token receives its accumulated count as its
 * frequency and is added to the returned list.
 *
 * @param arrDoc row indices of {@code destDocSignatureMatrix} to aggregate
 * @return list of {@code Token} objects carrying the accumulated frequencies
 */
private ArrayList countTokensByHashMap(int[] arrDoc){
    HashMap tally=new HashMap();

    for(int d=0;d<arrDoc.length;d++){
        int row=arrDoc[d];
        int termNum=destDocSignatureMatrix.getNonZeroNumInRow(row);
        if(termNum==0)
            continue;

        int[] terms=destDocSignatureMatrix.getNonZeroColumnsInRow(row);
        int[] freqs=null;
        if(!useDocFrequency)
            freqs=destDocSignatureMatrix.getNonZeroIntScoresInRow(row);

        for(int t=0;t<termNum;t++){
            Token key;
            if(useDocFrequency)
                key=new Token(terms[t],1);
            else
                key=new Token(terms[t],freqs[t]);

            Counter counter=(Counter)tally.get(key);
            if(counter!=null)
                counter.addCount(key.getFrequency());
            else
                tally.put(key,new Counter(key.getFrequency()));
        }
    }

    // Transfer the accumulated counts onto the key tokens and collect them.
    ArrayList merged=new ArrayList(tally.size());
    for(Iterator it=tally.keySet().iterator();it.hasNext();){
        Token key=(Token)it.next();
        Counter counter=(Counter)tally.get(key);
        key.setFrequency(counter.getCount());
        merged.add(key);
    }
    tally.clear();
    return merged;
}
/**
 * Returns the minimum initial probability that the distribution builders multiply
 * by a row total to obtain a lower bound for their trimming cutoff.
 * Currently this is simply {@code probThreshold}.
 *
 * @return the value of {@code probThreshold}
 */
private double getMinInitProb(){
/*
Disabled variant, kept for reference: cap the threshold at 0.0001 when EM is used.
if(useEM)
return Math.min(0.0001,probThreshold);
else
return probThreshold;*/
return probThreshold;
}
/**
 * Re-estimates the token weights in {@code list} with an iterative update against
 * a background distribution.
 *
 * The background value of each token is taken from {@code destIndexList}: the
 * signature's document frequency when {@code useDocFrequency} is set, otherwise its
 * frequency; the values are normalized to sum to 1 over the tokens in the list.
 * Then, {@code iterationNum} times, every token gets the value
 * {@code (1-bkgCoeffi)*w / ((1-bkgCoeffi)*w + bkgCoeffi*background) * frequency},
 * where {@code w} is its current weight, and the weights are replaced by these
 * values normalized to sum to 1.
 *
 * @param list {@code Token} objects whose weights are updated in place
 * @return the same list instance that was passed in
 */
private ArrayList emTopicSignatureModel(ArrayList list){
    int size=list.size();
    double[] estimate=new double[size];

    // Build the background model restricted to the tokens in the list.
    double[] background=new double[size];
    double backgroundTotal=0;
    for(int t=0;t<size;t++){
        Token token=(Token)list.get(t);
        if(useDocFrequency)
            background[t]=destIndexList.getIRSignature(token.getIndex()).getDocFrequency();
        else
            background[t]=destIndexList.getIRSignature(token.getIndex()).getFrequency();
        backgroundTotal+=background[t];
    }
    for(int t=0;t<size;t++)
        background[t]=background[t]/backgroundTotal;

    // Iterative re-estimation of the weights.
    for(int round=0;round<iterationNum;round++){
        double mass=0;
        for(int t=0;t<size;t++){
            Token token=(Token)list.get(t);
            double topicPart=(1 - bkgCoeffi) * token.getWeight();
            estimate[t]=topicPart / (topicPart + bkgCoeffi * background[t]) * token.getFrequency();
            mass+=estimate[t];
        }
        for(int t=0;t<size;t++){
            Token token=(Token)list.get(t);
            token.setWeight(estimate[t]/mass);
        }
    }
    // No post-filtering by probThreshold is applied; the input list is returned.
    return list;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -