⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 kldivdocdistance.java

📁 dragontoolkit用于机器学习
💻 JAVA
字号:
package dragon.ir.clustering.docdistance;

import dragon.ir.index.*;
import dragon.matrix.*;

/**
 * <p>Document distance measure using KL divergence</p>
 * <p></p>
 * <p>Copyright: Copyright (c) 2005</p>
 * <p>Company: IST, Drexel University</p>
 * @author Davis Zhou
 * @version 1.0
 */

public class KLDivDocDistance extends AbstractDocDistance{
    /** Background weight every constructor of this class installs. */
    private static final double DEFAULT_BKG_COEFFICIENT=0.1;

    /** Share of each smoothed probability that comes from the background model. */
    private double bkgCoefficient;
    /** Distance at or above which the normalized result is reported as 1; 0 when normalization is off. */
    private double normThreshold;
    /** Background value per matrix column, already multiplied by bkgCoefficient. */
    private double[] arrBkgModel;
    /** True when getDistance rescales its result by normThreshold. */
    private boolean norm;

    /**
     * Builds the measure with a uniform background model and no normalization.
     * Equivalent to calling the two-argument constructor with a threshold of 0.
     *
     * @param docModelMatrix matrix whose rows are looked up by {@code IRDoc.getIndex()}
     */
    public KLDivDocDistance(DoubleSparseMatrix docModelMatrix){
        this(docModelMatrix,0);
    }

    /**
     * Builds the measure with a uniform background model: every column of the
     * matrix receives {@code bkgCoefficient / columns}.
     *
     * @param doctermMatrix matrix whose rows are looked up by {@code IRDoc.getIndex()}
     * @param normThreshold when greater than 0, distances are divided by this value and
     *        capped at 1; any other value disables normalization
     */
    public KLDivDocDistance(DoubleSparseMatrix doctermMatrix, double normThreshold){
        super(doctermMatrix);

        this.bkgCoefficient=DEFAULT_BKG_COEFFICIENT;
        int columnCount=matrix.columns();
        arrBkgModel=new double[columnCount];
        // Same value for every column, so compute it once.
        double uniformShare=bkgCoefficient/columnCount;
        for(int col=0;col<columnCount;col++){
            arrBkgModel[col]=uniformShare;
        }
        configureNormalization(normThreshold);
    }

    /**
     * Builds the measure with a background model taken from the indexed collection
     * and no normalization. Equivalent to calling the three-argument constructor
     * with a threshold of 0.
     *
     * @param indexReader source of the per-term and collection-wide counts
     * @param docModelMatrix matrix whose rows are looked up by {@code IRDoc.getIndex()}
     */
    public KLDivDocDistance(IndexReader indexReader, DoubleSparseMatrix docModelMatrix){
        this(indexReader,docModelMatrix,0);
    }

    /**
     * Builds the measure with a background model taken from the indexed collection:
     * term {@code i} receives its frequency divided by the collection term count,
     * multiplied by {@code bkgCoefficient}.
     *
     * @param indexReader source of the per-term and collection-wide counts
     * @param doctermMatrix matrix whose rows are looked up by {@code IRDoc.getIndex()}
     * @param normThreshold when greater than 0, distances are divided by this value and
     *        capped at 1; any other value disables normalization
     */
    public KLDivDocDistance(IndexReader indexReader, DoubleSparseMatrix doctermMatrix, double normThreshold) {
        super(doctermMatrix);

        this.bkgCoefficient=DEFAULT_BKG_COEFFICIENT;
        int termNum=indexReader.getCollection().getTermNum();
        arrBkgModel=new double[termNum];
        // Held as a double so the per-term division below is floating-point.
        double totalTermCount=indexReader.getCollection().getTermCount();
        for(int term=0;term<termNum;term++){
            arrBkgModel[term]=indexReader.getIRTerm(term).getFrequency()/totalTermCount*bkgCoefficient;
        }
        configureNormalization(normThreshold);
    }

    /**
     * Records the normalization settings shared by the constructors: a positive
     * threshold switches normalization on, anything else switches it off and
     * stores 0 as the threshold.
     */
    private void configureNormalization(double threshold){
        norm=threshold>0;
        normThreshold=norm ? threshold : 0;
    }

    /**
     * Computes the divergence of {@code first} from {@code second} as the sum, over
     * the non-zero columns of the first document's row, of {@code p1 * log(p1 / p2)}.
     * Each probability is {@code (1 - bkgCoefficient) * score + background}; a column
     * that is absent from the second document's row uses the background value alone.
     * Only the first document's columns are visited, so swapping the arguments
     * generally gives a different result.
     *
     * @param first document whose non-zero columns drive the sum
     * @param second document the first one is compared against
     * @return the raw sum when normalization is off; otherwise 1 if the sum reaches
     *         the threshold, else the sum divided by the threshold
     */
    public double getDistance(IRDoc first, IRDoc second){
        int firstRow=first.getIndex();
        int[] firstCols=matrix.getNonZeroColumnsInRow(firstRow);
        double[] firstScores=matrix.getNonZeroDoubleScoresInRow(firstRow);
        int firstLen=matrix.getNonZeroNumInRow(firstRow);

        int secondRow=second.getIndex();
        int[] secondCols=matrix.getNonZeroColumnsInRow(secondRow);
        double[] secondScores=matrix.getNonZeroDoubleScoresInRow(secondRow);
        int secondLen=matrix.getNonZeroNumInRow(secondRow);

        double docWeight=1-bkgCoefficient;
        double divergence=0;
        // Cursor into the second row; it only ever moves forward, so matches are
        // found only if both column arrays are in ascending order.
        // NOTE(review): presumably the matrix returns sorted columns - confirm.
        int cursor=0;
        for(int pos=0;pos<firstLen;pos++){
            int column=firstCols[pos];
            while(cursor<secondLen && secondCols[cursor]<column){
                cursor++;
            }
            double background=arrBkgModel[column];
            // Default to background only; upgraded below when the second row has this column.
            double secondProb=background;
            if(cursor<secondLen && secondCols[cursor]==column){
                secondProb=docWeight*secondScores[cursor]+background;
            }
            double firstProb=docWeight*firstScores[pos]+background;
            divergence+=firstProb*Math.log(firstProb/secondProb);
        }

        if(!norm){
            return divergence;
        }
        return divergence>=normThreshold ? 1 : divergence/normThreshold;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -