⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 topicsignaturemodel.java

📁 dragontoolkit用于机器学习
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
        Random random;
        int i, num, index;
        double sum;

        random=new Random();
        num=Math.min(50,doctermMatrix.rows());
        sum=0;
        for(i=0;i<num;i++){
            index=random.nextInt(doctermMatrix.rows());
            sum+=doctermMatrix.getNonZeroNumInRow(index);
        }
        return sum/num;
    }

    /**
     * Builds the initial token distribution for one signature from its row of the
     * co-occurrence matrix.
     *
     * Entries whose count is below the trimming cutoff are discarded; each surviving
     * token's weight is its count divided by the sum of the surviving counts.
     *
     * @param signatureIndex row of the co-occurrence matrix to read
     * @return list of Token objects (column index, count) with their weights set
     */
    private ArrayList computeDistributionByCooccurMatrix(int signatureIndex){
        int[] columns=cooccurMatrix.getNonZeroColumnsInRow(signatureIndex);
        int[] counts=cooccurMatrix.getNonZeroIntScoresInRow(signatureIndex);

        double total=0;
        for(int pos=0;pos<counts.length;pos++){
            total+=counts[pos];
        }

        // Cutoff is the mean count when mean trimming is enabled, 0.5 otherwise,
        // and is raised to total*minInitProb when it would fall below that.
        double cutoff=useMeanTrim ? total/counts.length : 0.5;
        double floor=total*getMinInitProb();
        if(cutoff<floor){
            cutoff=floor;
        }

        ArrayList kept=new ArrayList();
        double keptTotal=0;
        for(int pos=0;pos<counts.length;pos++){
            // Keep the ">=" form: a NaN cutoff (empty row) must reject every entry.
            if(counts[pos]>=cutoff){
                kept.add(new Token(columns[pos],counts[pos]));
                keptTotal+=counts[pos];
            }
        }

        // Normalize over the surviving tokens only.
        int pos=0;
        while(pos<kept.size()){
            Token token=(Token)kept.get(pos);
            token.setWeight(token.getFrequency()/keptTotal);
            pos++;
        }
        return kept;
    }

    /**
     * Builds the initial token distribution for a set of documents by accumulating
     * signature counts into a dense, reusable integer buffer.
     *
     * For every document row the non-zero columns of destDocSignatureMatrix are added
     * to the buffer: by 1 per document when useDocFrequency is set, by the stored
     * score otherwise. Entries below the trimming cutoff are discarded and the
     * remaining tokens are weighted by count divided by the surviving total.
     *
     * @param arrDoc row indices of the documents to aggregate
     * @return list of Token objects (signature index, count) with their weights set
     */
    private ArrayList computeDistributionByArray(int[] arrDoc){
        // The buffer is allocated once and zeroed on every call.
        if(buf==null){
            buf=new int[totalDestSignatureNum];
        }
        MathUtil.initArray(buf,0);

        for(int d=0;d<arrDoc.length;d++){
            int[] columns=destDocSignatureMatrix.getNonZeroColumnsInRow(arrDoc[d]);
            if(useDocFrequency){
                for(int c=0;c<columns.length;c++){
                    buf[columns[c]]+=1;
                }
            }
            else{
                int[] scores=destDocSignatureMatrix.getNonZeroIntScoresInRow(arrDoc[d]);
                for(int c=0;c<columns.length;c++){
                    buf[columns[c]]+=scores[c];
                }
            }
        }

        // Total mass and number of populated slots.
        double total=0;
        int populated=0;
        for(int slot=0;slot<buf.length;slot++){
            if(buf[slot]>0){
                populated++;
                total+=buf[slot];
            }
        }

        // Cutoff is the mean populated count when mean trimming is enabled, 0.5
        // otherwise, and is raised to total*minInitProb when it would fall below that.
        double cutoff=useMeanTrim ? total/populated : 0.5;
        double floor=total*getMinInitProb();
        if(cutoff<floor){
            cutoff=floor;
        }

        ArrayList kept=new ArrayList();
        double keptTotal=0;
        for(int slot=0;slot<buf.length;slot++){
            // Keep the ">=" form: a NaN cutoff (nothing populated) must reject every slot.
            if(buf[slot]>=cutoff){
                kept.add(new Token(slot,buf[slot]));
                keptTotal+=buf[slot];
            }
        }

        // Normalize over the surviving tokens only.
        int pos=0;
        while(pos<kept.size()){
            Token token=(Token)kept.get(pos);
            token.setWeight(token.getFrequency()/keptTotal);
            pos++;
        }
        return kept;
    }

    /**
     * Builds the initial token distribution for a set of documents using the
     * hash-based counter (countTokensByHashMap).
     *
     * Trimming is applied only when mean trimming is enabled or when
     * total*minInitProb exceeds 1; otherwise every counted token is kept. Each
     * returned token's weight is its count divided by the total count of the
     * returned tokens.
     *
     * @param arrDoc row indices of the documents to aggregate
     * @return list of Token objects with their weights set
     */
    private ArrayList computeDistributionByHash(int[] arrDoc){
        ArrayList counted=countTokensByHashMap(arrDoc);

        double total=0;
        for(int n=0;n<counted.size();n++){
            total+=((Token)counted.get(n)).getFrequency();
        }

        double floor=total*getMinInitProb();
        boolean trim=useMeanTrim || floor>1;

        ArrayList result;
        if(trim){
            // Cutoff is the mean count when mean trimming is enabled, 0.5 otherwise,
            // and is raised to total*minInitProb when it would fall below that.
            double cutoff=useMeanTrim ? total/counted.size() : 0.5;
            if(cutoff<floor){
                cutoff=floor;
            }

            result=new ArrayList();
            total=0;
            for(int n=0;n<counted.size();n++){
                Token candidate=(Token)counted.get(n);
                if(candidate.getFrequency()>=cutoff){
                    result.add(candidate);
                    total+=candidate.getFrequency();
                }
            }
            counted.clear();
        }
        else{
            result=counted;
        }

        // Normalize over the tokens that are actually returned.
        int pos=0;
        while(pos<result.size()){
            Token token=(Token)result.get(pos);
            token.setWeight(token.getFrequency()/total);
            pos++;
        }
        return result;
    }

    /**
     * Counts signature occurrences over a set of documents with a hash map keyed by Token.
     *
     * Each non-zero column of a document row contributes 1 when useDocFrequency is
     * set, or its stored score otherwise. The first Token created for a given key is
     * the one kept in the map; its frequency is overwritten with the accumulated
     * count before it is returned.
     *
     * @param arrDoc row indices of the documents to aggregate
     * @return list of Token objects whose frequency is the accumulated count
     */
    private ArrayList countTokensByHashMap(int[] arrDoc){
        HashMap counts=new HashMap();

        for(int d=0;d<arrDoc.length;d++){
            int row=arrDoc[d];
            int termNum=destDocSignatureMatrix.getNonZeroNumInRow(row);
            if(termNum!=0){
                int[] terms=destDocSignatureMatrix.getNonZeroColumnsInRow(row);
                int[] freqs=null;
                if(!useDocFrequency){
                    freqs=destDocSignatureMatrix.getNonZeroIntScoresInRow(row);
                }

                for(int t=0;t<termNum;t++){
                    Token key=new Token(terms[t], useDocFrequency ? 1 : freqs[t]);
                    Counter seen=(Counter)counts.get(key);
                    if(seen!=null){
                        seen.addCount(key.getFrequency());
                    }
                    else{
                        counts.put(key,new Counter(key.getFrequency()));
                    }
                }
            }
        }

        // Copy the accumulated totals back onto the key tokens and collect them.
        ArrayList result=new ArrayList(counts.size());
        for(Iterator keys=counts.keySet().iterator();keys.hasNext();){
            Token key=(Token)keys.next();
            Counter tally=(Counter)counts.get(key);
            key.setFrequency(tally.getCount());
            result.add(key);
        }
        counts.clear();
        return result;
    }

    /**
     * Returns the minimum initial probability used to derive the trimming floor
     * (total count multiplied by this value) in the distribution builders.
     *
     * @return the current value of probThreshold
     */
    private double getMinInitProb(){
        // A disabled variant returned Math.min(0.0001,probThreshold) when useEM is set;
        // the active behavior always returns probThreshold unchanged.
        return probThreshold;
    }

    /**
     * Re-estimates the weights of the given tokens with an EM loop that mixes the
     * current token weights with a background model.
     *
     * The background probability of each token is its collection count (document
     * frequency when useDocFrequency is set, frequency otherwise) taken from
     * destIndexList and normalized over the tokens in the list. Each of the
     * iterationNum rounds computes, per token,
     * (1-bkgCoeffi)*weight / ((1-bkgCoeffi)*weight + bkgCoeffi*background) * frequency,
     * then renormalizes these values into the new weights.
     *
     * The tokens are updated in place and no token is removed; a disabled variant
     * dropped tokens whose final weight was below probThreshold.
     *
     * @param list Token objects with initial weights and frequencies
     * @return the same list instance, with updated token weights
     */
    private ArrayList emTopicSignatureModel(ArrayList list){
        int size=list.size();
        double[] estimate=new double[size];
        double[] background=new double[size];

        // Background model: collection counts normalized over the listed tokens.
        double norm=0;
        for(int t=0;t<size;t++){
            Token token=(Token)list.get(t);
            if(useDocFrequency){
                background[t]=destIndexList.getIRSignature(token.getIndex()).getDocFrequency();
            }
            else{
                background[t]=destIndexList.getIRSignature(token.getIndex()).getFrequency();
            }
            norm+=background[t];
        }
        for(int t=0;t<size;t++){
            background[t]/=norm;
        }

        // EM rounds.
        for(int round=0;round<iterationNum;round++){
            double mass=0;
            for(int t=0;t<size;t++){
                Token token=(Token)list.get(t);
                double topicPart=(1 - bkgCoeffi) * token.getWeight();
                double mixture=topicPart + bkgCoeffi * background[t];
                estimate[t]=topicPart / mixture * token.getFrequency();
                mass+=estimate[t];
            }
            for(int t=0;t<size;t++){
                ((Token)list.get(t)).setWeight(estimate[t]/mass);
            }
        }
        return list;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -