umlsambiguityontology.java

来自「dragontoolkit用于机器学习」· Java 代码 · 共 506 行 · 第 1/2 页

JAVA
506
字号
        double[] arrCandidateScore;
        double score;

        candidateNum=canTerm.getCandidateCUINum();
        arrCandidateCUI=new int[candidateNum];
        arrCandidateScore=new double[candidateNum];
        for(i=0;i<candidateNum;i++)
        {
            arrCandidateCUI[i] = canTerm.getCandidateCUI(i).getIndex();
            arrCandidateScore[i] = canTerm.getCandidateCUI(i).getScore();
        }
        contextList = generateContextWindow(canTerm.getStartingWord().getParent(), canTerm.getStartingWord(), canTerm.getEndingWord());

        for(i=0;i<contextList.size();i++)
        {
            curWord=(Word)contextList.get(i);
            index = getIndexInTokenList(curWord);
            if (index<0) continue;
            narrowedNum = 0;
            for (j = 0; j < candidateNum; j++) {
                if ( (score = wtMatrix.getDouble(index, arrCandidateCUI[j])) > 0) {
                    arrCandidateCUI[narrowedNum] = arrCandidateCUI[j];
                    arrCandidateScore[narrowedNum] = arrCandidateScore[j] + score;
                    narrowedNum += 1;
                }
            }
            if (narrowedNum > 0) candidateNum = narrowedNum;
        }
        if(candidateNum<canTerm.getCandidateCUINum())
            canTerm=buildCandidateTerm(canTerm.getStartingWord(),canTerm.getEndingWord(),arrCandidateCUI,arrCandidateScore,candidateNum,minScore);
        contextList.clear();
        return canTerm;
    }

    /**
     * Collects the candidate terms that begin at <code>start</code>.
     *
     * The candidate CUIs are first taken from the non-zero columns of the matrix row of the
     * starting word. The method then walks to the right, and every following word that still
     * shares at least one CUI with the current candidates narrows the candidate set and
     * produces one more (longer) candidate term.
     *
     * @param start    first word of every candidate term
     * @param end      right-most word the term may reach; when null, a boundary is derived by
     *                 scanning right from <code>start</code>
     * @param minScore minimum score, passed through to buildCandidateTerm
     * @return null when the starting word has no token index or its matrix row has no non-zero
     *         column; otherwise the (possibly empty) list of CandidateTerm objects
     */
    private ArrayList searchAllCandidates(Word start, Word end, double minScore){
        ArrayList canTermList;
        CandidateTerm  canTerm;
        Sentence sent;
        Word curWord;
        int candidateNum, narrowedNum;
        int j, index;
        int[] arrCandidateCUI;
        double[] arrCandidateScore;
        double score;
        int skippedWords;

        if((index=getIndexInTokenList(start))<0) return null;
        sent=start.getParent();

        //set the right boundary of the possible term.
        curWord=start.next;
        if(end==null){
            //j counts the non-punctuation words scanned; the scan stops after four of them,
            //at the end of the word chain, or as soon as a boundary word is met.
            j = 0;
            while (j <4 && curWord != null && end==null) {
                if(isBoundaryWord(curWord))
                    end = curWord.prev;
                if (!curWord.isPunctuation())
                    j++;
                curWord = curWord.next;
            }
            if (curWord == null)
                curWord = sent.getLastWord();
            if (end == null)
                end = curWord;
        }

        //get the number of non-zero columns
        candidateNum = wtMatrix.getNonZeroNumInRow(index);
        //if none of them contain it, give up
        if (candidateNum <= 0) return null;
        //get array of non-zero column indices, that is the CUIs that contain the word.
        //The arrays are compacted in place below, so work on private copies: if the matrix
        //hands back its internal row storage (not verifiable from here), narrowing the
        //originals would corrupt the matrix for later lookups.
        arrCandidateCUI = (int[]) wtMatrix.getNonZeroColumnsInRow(index).clone();
        //get array of non-zero column scores (copied for the same reason)
        arrCandidateScore = (double[]) wtMatrix.getNonZeroDoubleScoresInRow(index).clone();

        canTermList=new ArrayList(3);
        if((canTerm=buildCandidateTerm(start,start,arrCandidateCUI,arrCandidateScore,candidateNum,minScore))!=null)
            canTermList.add(canTerm);

        //extract tokens within window
        curWord=start.next;
        skippedWords=0;
        while (curWord!=null && skippedWords<=maxSkippedWords && curWord.getPosInSentence() <=end.getPosInSentence()) {
            //words that are not useful for a term are passed over without counting as skipped
            if (! isUsefulForTerm(curWord)) {
                curWord = curWord.next;
                continue;
            }

            //a word without a token index counts as a skipped word
            index=getIndexInTokenList(curWord);
            if(index<0) {
                curWord = curWord.next;
                skippedWords++;
                continue;
            }
            //keep only the candidates that have a positive score for this word too,
            //compacting them to the front of the arrays and accumulating their scores
            narrowedNum = 0;
            for (j = 0; j < candidateNum; j++) {
                if ( (score = wtMatrix.getDouble(index,arrCandidateCUI[j])) > 0) {
                    arrCandidateCUI[narrowedNum] = arrCandidateCUI[j];
                    arrCandidateScore[narrowedNum] = arrCandidateScore[j] + score;
                    narrowedNum += 1;
                }
            }
            if (narrowedNum > 0)
            {
                //the word extends the term: record the longer term and reset the skip counter
                candidateNum = narrowedNum;
                if((canTerm=buildCandidateTerm(start,curWord,arrCandidateCUI,arrCandidateScore,candidateNum,minScore))!=null)
                    canTermList.add(canTerm);
                skippedWords=0;
            }
            else{
                //no candidate contains this word: leave the candidate set untouched
                skippedWords++;
            }
            curWord = curWord.next;
        }
        return canTermList;
    }

    /*The following function generates the contextual words for a term specified by its starting word and ending word.
    We take up to three left words (noun or adjective) before the starting word and up to three right words
    (noun or adjective) after the ending word as the window. The sent parameter is not used by this method.*/
    private ArrayList generateContextWindow(Sentence sent, Word start, Word end) {
        ArrayList contexts;
        Word cur;
        int i;

        contexts = new ArrayList(6);

        //left context: walk backwards from the word preceding the term
        cur = start.prev;
        i = 0;
        while (i < 3 && cur != null) {
            if (cur.getPOSIndex() == Tagger.POS_NOUN || cur.getPOSIndex() == Tagger.POS_ADJECTIVE) {
                contexts.add(cur);
                i = i + 1;
            }
            cur = cur.prev;
        }

        //right context: walk forwards from the word following the term. Starting at end.next
        //(rather than start.next) keeps the words of a multi-word term out of its own context.
        //Fall back to the starting word if no ending word was supplied.
        cur = (end != null ? end : start).next;
        i = 0;
        while (i < 3 && cur != null) {
            if (cur.getPOSIndex() == Tagger.POS_NOUN || cur.getPOSIndex() == Tagger.POS_ADJECTIVE) {
                contexts.add(cur);
                i = i + 1;
            }
            cur = cur.next;
        }

        return contexts;
    }

    /**
     * Creates a candidate term spanning <code>starting</code>..<code>ending</code> from the first
     * <code>candidateNum</code> entries of the two parallel arrays.
     *
     * Returns null when the term is rejected:
     * - it ends with an adjective and either adjective terms are disabled or the term is longer
     *   than that single word;
     * - it is a single word tagged as a number;
     * - 1/candidateNum falls below minSelectivity;
     * - no candidate reaches <code>minScore</code>;
     * - it ends with an adjective and the first stored candidate scores below 1.
     */
    private CandidateTerm buildCandidateTerm(Word starting, Word ending, int[] arrCandidateCUI, double[] arrCandidateScore, int candidateNum,double minScore){
        final int endPos = ending.getPOSIndex();
        final boolean endsWithAdjective = (endPos == Tagger.POS_ADJECTIVE);

        if (endsWithAdjective) {
            if (!getAdjectiveTermOption() || !ending.equals(starting)) {
                return null;
            }
        }
        if (endPos == Tagger.POS_NUM && ending.equals(starting)) {
            return null;
        }
        if (1.0/candidateNum < minSelectivity) {
            return null;
        }

        CandidateTerm term = new CandidateTerm(starting, ending);
        for (int k = 0; k < candidateNum; k++) {
            double candidateScore = arrCandidateScore[k];
            if (candidateScore >= minScore) {
                term.addCandidateCUI(new CandidateCUI(arrCandidateCUI[k], candidateScore));
            }
        }

        if (term.getCandidateCUINum() <= 0) {
            return null;
        }
        if (endsWithAdjective && term.getCandidateCUI(0).getScore() < 1) {
            return null;
        }
        return term;
    }

    /**
     * Returns the token-list index stored on the word, resolving it first if necessary.
     *
     * An index of Integer.MIN_VALUE is treated as "not looked up yet": the word's lemma is then
     * searched in tokenList and the result (or -1 when the lemma is absent) is stored on the
     * word, so later calls return the stored value directly.
     */
    private int getIndexInTokenList(Word word){
        if (word.getIndex() != Integer.MIN_VALUE) {
            return word.getIndex();
        }

        Token found = tokenList.lookup(getLemma(word));
        if (found != null) {
            word.setIndex(found.getIndex());
        }
        else {
            word.setIndex(-1);
        }
        return word.getIndex();
    }

    private class CandidateCUI implements Comparable{
        private int index;
        private double score;

        public CandidateCUI(int index, double score){
            this.index=index;
            this.score=score;
        }

        public double getScore(){
            return score;
        }

        public int getIndex(){
            return index;
        }

        public int compareTo(Object obj){
            double objScore;
            int objIndex;

            objScore=((CandidateCUI)obj).getScore();
            if(score>objScore)
                return -1;
            else if(score<objScore)
                return 1;
            else
            {
                objIndex=((CandidateCUI)obj).getIndex();
                if(index>objIndex)
                    return 1;
                else if(index<objIndex)
                    return -1;
                else
                    return 0;
            }
        }
    }

    private class CandidateTerm {
        Word starting, ending;
        SortedArray candidates;

        public CandidateTerm(Word starting, Word ending){
            this.starting=starting;
            this.ending=ending;
            candidates=new SortedArray();
        }

        public void addCandidateCUI(CandidateCUI cui){
            candidates.add(cui);
        }

        public Word getStartingWord(){
            return starting;
        }

        public Word getEndingWord(){
            return ending;
        }

        public CandidateCUI getCandidateCUI(int index){
            return (CandidateCUI)candidates.get(index);
        }

        public int getCandidateCUINum(){
            return candidates.size();
        }
    }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?