coordinatingchecker.java

来自「dragontoolkit用于机器学习」· Java 代码 · 共 238 行
JAVA
238 行
package dragon.nlp.extract;

import dragon.nlp.*;
import dragon.nlp.tool.*;
import java.util.ArrayList;

/**
 * <p>Coordinating Component Identification</p>
 * <p>Looks for runs of closely spaced commas in a sentence, labels the words of each accepted
 * run with a parallel group number, and can add predicted terms for the stretches of a
 * group that lie next to an already known term.</p>
 * <p>Copyright: Copyright (c) 2003</p>
 * <p>Company: IST, Drexel University</p>
 * @author Davis Zhou
 * @version 1.0
 */

public class CoordinatingChecker {
    /** Maximum number of words allowed between two consecutive commas of the same run. */
    private final int threshold;
    /** Minimum number of commas a run must keep to be accepted as a parallel group. */
    private final int minCommaNum;

    /**
     * Creates a checker that allows at most 4 words between commas and requires at least 2 commas.
     */
    public CoordinatingChecker() {
        threshold=4;
        minCommaNum=2;
    }

    /**
     * Demo entry point: parses and tags one sample sentence, identifies its parallel groups and
     * prints every word followed by its parallel group number, one word per line.
     *
     * @param args not used
     */
    public static void main(String[] args){
        Sentence sent;
        Tagger tagger;
        CoordinatingChecker checker=new CoordinatingChecker();
        Word curWord;

        sent=(new EngDocumentParser()).parseSentence("Obesity and type 2 diabetes mellitus are associated with many metabolic disorders including insulin resistance, dyslipidemia, hypertension or atherosclerosis");
        tagger=new MedPostTagger(System.getProperty("user.dir"));
        tagger.tag(sent);
        checker.identifyParaElements(sent);
        for(curWord=sent.getFirstWord(); curWord!=null; curWord=curWord.next)
        {
            System.out.print(curWord.getContent());
            System.out.print(" ");
            System.out.print(curWord.getParallelGroup());
            System.out.print("\r\n");
        }
    }

    /**
     * Walks the sentence once, collecting runs of commas in which consecutive commas are separated
     * by at most {@code threshold} words, and hands every run with at least {@code minCommaNum}
     * commas to {@link #processParallelGroup}. Accepted groups are numbered from 0 in the order
     * they are found.
     *
     * @param sent the sentence to analyse; its words are labelled in place
     * @return the number of parallel groups that were labelled
     */
    public int identifyParaElements(Sentence sent){
        int groupNo,commaNum,offset;
        int firstComma,lastComma;
        Word cur;

        commaNum=0;
        groupNo=0;
        offset=0;
        firstComma=-1;
        lastComma=-1;
        for(cur=sent.getFirstWord(); cur!=null; cur=cur.next, offset=offset+1){
            if(!cur.isPunctuation() || !cur.getContent().equalsIgnoreCase(","))
                continue;

            if(commaNum>0 && offset<=lastComma+threshold+1){
                // close enough to the previous comma: the current run continues
                commaNum=commaNum+1;
                lastComma=offset;
                continue;
            }

            // The gap closed the previous run (if any); submit it when it has enough commas.
            if(commaNum>=minCommaNum && processParallelGroup(sent,groupNo,firstComma,lastComma,commaNum))
                groupNo=groupNo+1;

            // Always restart at the current comma. Keeping the counters of a rejected run would
            // make every later comma re-submit the same rejected span and hide later groups.
            commaNum=1;
            firstComma=offset;
            lastComma=offset;
        }
        if(commaNum>=minCommaNum)
            if(processParallelGroup(sent,groupNo,firstComma,lastComma,commaNum))
                groupNo=groupNo+1;
        return groupNo;
    }

    /**
     * Determines the first and last word of a candidate group around a run of commas and, if the
     * run still has at least {@code minCommaNum} commas, labels every word from the first to the
     * last (inclusive) with {@code groupNo}.
     * <p>The group is extended to the left of the first comma and to the right of the last comma
     * until a boundary word is met. When a boundary word sits directly next to the outer comma,
     * that comma is not counted and the group starts after it (or ends before it).</p>
     *
     * @param sent       the sentence containing the run
     * @param groupNo    the number to assign to the words of the group
     * @param firstComma word offset of the first comma of the run
     * @param lastComma  word offset of the last comma of the run
     * @param commaNum   number of commas in the run
     * @return true if the words were labelled, false if the candidate was rejected
     */
    private boolean processParallelGroup(Sentence sent, int groupNo,int firstComma, int lastComma, int commaNum){
        Word start, end, cur, comma;
        int step;

        //get starting word
        if(firstComma<=0) return false; //in some strange case, the first word may be comma.

        cur=sent.getWord(firstComma-1);
        if(cur==null) return false; //no word in front of the first comma, nothing to anchor on
        step=0;
        while(cur!=null && !isLeftBoundary(cur)){
            cur=cur.prev;
            step=step+1;
        }
        if(step==0)
        {
            // boundary word directly before the first comma: skip that comma and do not count it
            comma=cur.next;
            start=(comma==null)?null:comma.next;
            commaNum=commaNum-1;
        }
        else if(cur==null)
            start=sent.getFirstWord();
        else
            start=cur.next;

        //get ending word
        cur=sent.getWord(lastComma+1);
        step=0;
        while(cur!=null && !isRightBoundary(cur)){
            cur=cur.next;
            step=step+1;
        }
        if(step==0)
        {
            // A boundary word, or nothing at all, directly follows the last comma: the group ends
            // on the word before that comma and the comma is not counted.
            if(cur!=null)
                end=cur.prev.prev;
            else{
                comma=sent.getWord(lastComma);
                end=(comma==null)?null:comma.prev;
            }
            commaNum=commaNum-1;
        }
        else if(cur==null)
            end=sent.getLastWord();
        else
            end=cur.prev;

        if(commaNum<minCommaNum) return false;
        if(start==null || end==null) return false;

        cur=start;
        while(!cur.equals(end)){
            cur.setParallelGroup(groupNo);
            cur=cur.next;
        }
        end.setParallelGroup(groupNo);

        return true;
    }

    /**
     * Tells whether a POS index stops the extension of a group in either direction: verb, adverb,
     * {@code Tagger.POS_IN}, or index 0.
     * NOTE(review): index 0 is presumably the tag for untagged/other tokens - confirm against Tagger.
     */
    private static boolean isHardBoundary(int pos){
        return pos==Tagger.POS_VERB || pos==Tagger.POS_ADVERB || pos==Tagger.POS_IN || pos==0;
    }

    /** Left-hand boundary test: any hard boundary or any word tagged {@code Tagger.POS_CC}. */
    private static boolean isLeftBoundary(Word word){
        int pos=word.getPOSIndex();
        return isHardBoundary(pos) || pos==Tagger.POS_CC;
    }

    /**
     * Right-hand boundary test: any hard boundary, or a {@code Tagger.POS_CC} word other than
     * "and"/"or" (these two may join the last element of a list and therefore do not stop the scan).
     */
    private static boolean isRightBoundary(Word word){
        int pos=word.getPOSIndex();
        if(isHardBoundary(pos)) return true;
        return pos==Tagger.POS_CC
               && !word.getContent().equalsIgnoreCase("and")
               && !word.getContent().equalsIgnoreCase("or");
    }

    /**
     * For every term whose starting word belongs to a parallel group (group number not negative),
     * scans the group backwards and forwards from that word and inserts a predicted term for each
     * stretch of words between separators (",", "and", "or") that is reached before leaving the
     * group or meeting a word that already has an associated concept. Predicted terms are inserted
     * next to the originating term in sentence order and are not processed again by this call.
     *
     * @param termList list of {@code Term} objects; modified in place
     * @return the same list instance, including the predicted terms
     */
    public ArrayList parallelTermPredict(ArrayList termList) {
        Term curTerm;
        Word curWord, prevWord, endWord, startWord;
        int curParaGroup, insertPos;
        int i;

        for (i = 0; i < termList.size(); i++) {
            curTerm = (Term) termList.get(i);
            curParaGroup = curTerm.getStartingWord().getParallelGroup();
            if (curParaGroup < 0) {
                continue;
            }

            //go back
            curWord = curTerm.getStartingWord().prev;
            endWord = curWord;
            prevWord = curTerm.getStartingWord();
            // Every backward find lies earlier in the sentence than the previous one, so always
            // inserting at the original index keeps the list in sentence order.
            insertPos = i;

            while (endWord != null && curWord != null && curWord.getParallelGroup() == curParaGroup &&
                   curWord.getAssociatedConcept() == null) {
                if (isSeparator(curWord)) {
                    if (!curWord.equals(endWord)) { //find a new term
                        addPredictedTerm(termList, insertPos, prevWord, endWord, prevWord);
                        i = i + 1; // curTerm moved one slot to the right
                    }
                    endWord = curWord.prev;
                }
                prevWord = curWord;
                curWord = curWord.prev;
            }
            // The scan left the group (or the sentence): the words collected since the last
            // separator form the first element of the group.
            if (curWord == null || curWord.getParallelGroup() != curParaGroup) {
                if (endWord != null && prevWord.getPosInSentence() <= endWord.getPosInSentence()) {
                    addPredictedTerm(termList, insertPos, prevWord, endWord, prevWord);
                    i = i + 1;
                }
            }

            //go forth
            // NOTE(review): the scan begins right after the term's first word, not after its last
            // word; whether the remaining words of a multi-word term are skipped depends on them
            // carrying an associated concept - confirm against Term/Word.
            curWord = curTerm.getStartingWord().next;
            startWord = curWord;
            prevWord = curTerm.getStartingWord();

            while (startWord != null && curWord != null && curWord.getParallelGroup() == curParaGroup &&
                   curWord.getAssociatedConcept() == null) {
                if (isSeparator(curWord)) {
                    if (!curWord.equals(startWord)) { //find a new term
                        // i indexes the most recently placed term, so i + 1 appends after it and
                        // keeps forward finds in sentence order.
                        addPredictedTerm(termList, i + 1, startWord, prevWord, prevWord);
                        i = i + 1;
                    }
                    startWord = curWord.next;
                }
                prevWord = curWord;
                curWord = curWord.next;
            }
            if (curWord == null || curWord.getParallelGroup() != curParaGroup) {
                if (startWord != null && prevWord.getPosInSentence() >= startWord.getPosInSentence()) {
                    addPredictedTerm(termList, i + 1, startWord, prevWord, prevWord);
                    i = i + 1;
                }
            }
        }
        return termList;
    }

    /** Tells whether a word separates two elements of a parallel group: ",", "and" or "or". */
    private static boolean isSeparator(Word word){
        String content = word.getContent();
        return content.equalsIgnoreCase(",") || content.equalsIgnoreCase("and") || content.equalsIgnoreCase("or");
    }

    /**
     * Creates a predicted term spanning {@code first}..{@code last}, inserts it into
     * {@code termList} at {@code index} and records it as the associated concept of {@code marker}.
     * Both scans pass the word of the new term that lies farthest from the originating term as
     * the marker.
     */
    private static void addPredictedTerm(ArrayList termList, int index, Word first, Word last, Word marker){
        Term predicted = new Term(first, last);
        predicted.setPredictedTerm(true);
        termList.add(index, predicted);
        marker.setAssociatedConcept(predicted);
    }
}
coordinatingchecker.java - 源码说明

本页面展示了「dragontoolkit用于机器学习」中的 coordinatingchecker.java 源码文件，采用 Java 编程语言编写，共 238 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与dragontoolkit相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?