⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 engwordpairexpand.java

📁 dragontoolkit用于机器学习
💻 JAVA
字号:
package dragon.nlp.tool.xtract;

import dragon.nlp.*;
import dragon.util.*;
import dragon.nlp.tool.*;
import dragon.matrix.*;
import dragon.nlp.compare.*;
import java.util.*;

/**
 * <p>Expands a word pair (not necessarily consecutive) into a noun phrase.</p>
 * <p> </p>
 * <p>Copyright: Copyright (c) 2005</p>
 * <p>Company: IST, Drexel University</p>
 * @author Davis Zhou
 * @version 1.0
 */

public class EngWordPairExpand implements WordPairExpand{
    /**
     * Sentence matrix. A row is addressed by sentence index; the k-th non-zero cell of a row is read
     * as the word at position k (column = word key, int score = part-of-speech index).
     */
    protected IntSparseMatrix sentMatrix;
    /** Word list loaded from "wordkey.list"; used to turn a word key into its text. */
    protected SimpleElementList wordList;
    /** Number of pair-sentence matrices loaded per side, and the number of positions tried when expanding left/right. */
    protected int maxSpan;
    /** Folder holding the index files read by the constructor. */
    protected String indexFolder;
    /**
     * Pair-to-sentence matrices. Element [k-1] of the left array is used for span -k, element [k-1]
     * of the right array for span +k. Rows are addressed by word pair index, columns are sentence indices.
     */
    protected IntSparseMatrix[] arrPairSentLeftMatrix, arrPairSentRightMatrix;
    /** Minimum fraction of the pair's sentences that must share a word at a position for it to join the phrase. */
    protected double threshold;

    /**
     * Loads the word list, the sentence matrix and, for every span 1..maxSpan, the right and left
     * pair-sentence matrices from the given index folder.
     *
     * @param maxSpan     largest absolute span for which pair-sentence matrices exist
     * @param indexFolder folder containing wordkey.list, sentencebase.* and pairsentr/pairsentl files
     * @param threshold   minimum share of sentences (see {@link #expandSecion}) for accepting a word
     */
    public EngWordPairExpand(int maxSpan, String indexFolder, double threshold) {
        this.maxSpan = maxSpan;
        this.threshold = threshold;
        this.indexFolder = indexFolder;

        wordList = new SimpleElementList(indexFolder + "/wordkey.list", false);
        arrPairSentRightMatrix = new IntSparseMatrix[maxSpan];
        sentMatrix = new IntSuperSparseMatrix(indexFolder + "/sentencebase.index",indexFolder + "/sentencebase.matrix");
        for (int i = 1; i <= maxSpan; i++) {
            arrPairSentRightMatrix[i - 1] = new IntGiantSparseMatrix(indexFolder + "/pairsentr" + i +".index",
                                              indexFolder + "/pairsentr" + i +".matrix");
        }
        arrPairSentLeftMatrix = new IntSparseMatrix[maxSpan];
        for (int i = 1; i <= maxSpan; i++) {
            arrPairSentLeftMatrix[i -1] = new IntGiantSparseMatrix(indexFolder + "/pairsentl" + i + ".index",
                                              indexFolder + "/pairsentl" + i + ".matrix");
        }
    }

    /**
     * Expands a word pair into a phrase: first the gap between the two words is filled (when
     * |span| &gt; 1), then the phrase is grown to the left and finally to the right.
     *
     * @param wordPairStat the word pair (first word, second word and pair index)
     * @param span         offset of the second word relative to the first word; negative when the
     *                     second word precedes the first one
     * @return a list holding one Token whose name is the expanded phrase, or null when the gap
     *         between the two words cannot be filled or when any exception is raised
     */
    public ArrayList expand(WordPairStat wordPairStat, int span) {
        ArrayList sentList, phraseList;
        Token token;
        String expandStr;
        int sentNum, firstWord, secondWord;

        try{
            firstWord = wordPairStat.getFirstWord();
            secondWord = wordPairStat.getSecondWord();

            //all sentences containing the word pair, each with the position of the first word in that sentence
            sentList = getSentenceList(wordPairStat, span);
            sentNum = sentList.size();

            //expand middle
            if (span > 1 || span < -1) {
                if (span > 1)
                    token = expandSecion(1, span - 1, sentNum, false, 0, sentList);
                else
                    token = expandSecion(1, -span - 1, sentNum, true, 0, sentList);
                //every position of the gap must be filled, otherwise there is no phrase
                if (token == null)
                    return null;
                sentList = (ArrayList) token.getMemo();
                if (span > 1)
                    expandStr = getWordContent(firstWord) + " " + token.getName().trim() + " " + getWordContent(secondWord);
                else
                    expandStr = getWordContent(secondWord) + " " + token.getName().trim() + " " + getWordContent(firstWord);
            }
            else {
                //adjacent words: just put them in sentence order
                if (span == 1)
                    expandStr = (getWordContent(firstWord) + " " + getWordContent(secondWord)).trim();
                else
                    expandStr = (getWordContent(secondWord) + " " + getWordContent(firstWord)).trim();
            }

            //expand left; offsets are relative to the first word, so for a negative span start beyond the second word
            if (span > 0)
                token = expandSecion(1, maxSpan, sentNum, true, -1, sentList);
            else
                token = expandSecion( -span + 1, maxSpan - span, sentNum, true, -1, sentList);
            if (token != null) {
                sentList = (ArrayList) token.getMemo();
                expandStr = token.getName().trim() + " " + expandStr;
            }

            //expand right; for a positive span start beyond the second word
            if (span > 0)
                token = expandSecion(span + 1, span + maxSpan, sentNum, false, 1, sentList);
            else
                token = expandSecion(1, maxSpan, sentNum, false, 1, sentList);
            if (token != null)
                expandStr = expandStr + " " + token.getName().trim();

            phraseList = new ArrayList(1);
            phraseList.add(new Token(expandStr.trim()));
            return phraseList;
        }
        catch(Exception e){
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Collects the sentences in which the second word occurs exactly <code>span</code> positions
     * away from the first word.
     *
     * @param wordPairStat the word pair
     * @param span         offset of the second word relative to the first word (non-zero)
     * @return a list of Tokens, one per matching sentence: name = sentence index, index = position
     *         of the first word, frequency = int score stored for the second word's position
     */
    protected ArrayList getSentenceList(WordPairStat wordPairStat, int span){
        IntSparseMatrix pairSentMatrix;
        ArrayList sentList;
        Token sentToken;
        int i, j, firstWord, secondWord, sentIndex, sentLength, sentNum, pairIndex, secondPos;

        pairIndex = wordPairStat.getIndex();
        firstWord = wordPairStat.getFirstWord();
        secondWord = wordPairStat.getSecondWord();

        if (span < 0)
            pairSentMatrix = arrPairSentLeftMatrix[ -span - 1];
        else
            pairSentMatrix = arrPairSentRightMatrix[span - 1];
        sentNum = pairSentMatrix.getNonZeroNumInRow(pairIndex);
        sentList = new ArrayList(sentNum);

        for (i = 0; i < sentNum; i++) {
            sentIndex = pairSentMatrix.getNonZeroColumnInRow(pairIndex, i);
            sentLength = sentMatrix.getNonZeroNumInRow(sentIndex);
            for (j = 0; j < sentLength; j++) {
                if (sentMatrix.getNonZeroColumnInRow(sentIndex, j) != firstWord)
                    continue;
                secondPos = j + span;
                if (secondPos >= 0 && secondPos < sentLength &&
                    sentMatrix.getNonZeroColumnInRow(sentIndex, secondPos) == secondWord) {
                    sentToken = new Token(String.valueOf(sentIndex));
                    sentToken.setIndex(j);
                    sentToken.setFrequency(sentMatrix.getNonZeroIntScoreInRow(sentIndex, secondPos));
                    sentList.add(sentToken);
                    //one entry per sentence; stop only once the pair has been located, so that a
                    //sentence is not lost when an earlier occurrence of the first word does not match
                    break;
                }
            }
        }
        return sentList;
    }

    /**
     * Grows a phrase section position by position. At each offset <code>start..end</code> (negated
     * when <code>inverse</code> is set) the top-ranked word from {@link #checkSentPos} is accepted if
     * its share of <code>sentNum</code> reaches the threshold and, outside the middle section, it
     * passes {@link #checkValidation}. Expansion stops at the first position that fails.
     *
     * @param start     first absolute offset from the first word
     * @param end       last absolute offset from the first word
     * @param sentNum   number of sentences containing the original word pair (denominator of the share)
     * @param inverse   true to walk to the left of the first word (words are then prepended)
     * @param direction 0: middle, -1: left, 1: right
     * @param sentList  sentences still compatible with the phrase built so far
     * @return a Token whose name is the section text and whose memo is the list of sentences
     *         containing exactly that text at those positions; null if nothing was accepted, if a
     *         middle section could not be filled completely, or if the only accepted word is not a
     *         valid phrase boundary
     */
    protected Token expandSecion(int start, int end, int sentNum, boolean inverse, int direction, ArrayList sentList) {
        Token token;
        ArrayList prevSentList;
        String expandStr, word, marginalWord;
        int posIndex, marginalPOS;
        int i, pos;
        boolean outward;

        expandStr="";
        marginalWord=null;
        marginalPOS=-1;
        prevSentList=sentList;
        //true when the most recently accepted word is the outer boundary of the whole phrase
        outward = (direction==1 && !inverse) || (direction==-1 && inverse);

        for (i=start; i<= end; i++) {
            token = checkSentPos(inverse ? -i : i, sentList);
            if (token == null)
                break;
            if (!((token.getFrequency() / (double) sentNum) >= threshold))
                break;

            //if the direction is not middle, check if the word is valid as a part of the phrase
            word = getWordContent(Integer.parseInt(token.getName()));
            posIndex = token.getIndex();
            if (direction != 0 && !checkValidation(word, posIndex))
                break;

            //narrow the sentence list only now that the word is accepted; narrowing on a rejected
            //word would hand the caller a list that does not match the returned text
            prevSentList = sentList;
            sentList = (ArrayList) token.getMemo();
            if (inverse)
                expandStr = word + " " + expandStr;
            else
                expandStr = expandStr + " " + word;
            expandStr = expandStr.trim();
            if (outward) {
                marginalWord = word;
                marginalPOS = posIndex;
            }
        }

        //a middle section must cover every position between the two words
        if (i <= end && direction == 0)
            return null;
        if (expandStr.equals(""))
            return null;

        if (direction==1 && !inverse && !checkEndingWordValidation(marginalWord, marginalPOS)) {
            //drop the invalid last word
            pos = expandStr.lastIndexOf(' ');
            if (pos < 0)
                return null;
            expandStr = expandStr.substring(0, pos);
            sentList = prevSentList; //sentences as they were before the dropped word was accepted
        }
        else if (direction==-1 && inverse && !checkStartingWordValidation(marginalWord, marginalPOS)) {
            //drop the invalid first word
            pos = expandStr.indexOf(' ');
            if (pos < 0)
                return null;
            expandStr = expandStr.substring(pos + 1);
            sentList = prevSentList; //sentences as they were before the dropped word was accepted
        }

        token = new Token(expandStr);
        token.setMemo(sentList);
        return token;
    }

    /**
     * Groups the given sentences by the word found at a fixed offset from the first word.
     *
     * @param spanFromFirstWord offset (may be negative) from the first word's position
     * @param sentList          sentence Tokens as produced by {@link #getSentenceList}
     * @return the word Token ranked first after ordering with <code>FrequencyComparator(true)</code>
     *         (presumably the most frequent word -- confirm against FrequencyComparator): name = word
     *         key, frequency = number of sentences with that word at the offset, index = part of
     *         speech taken from the first such sentence, memo = list of those sentence Tokens;
     *         null if no sentence has a word at that offset
     */
    protected Token checkSentPos(int spanFromFirstWord, ArrayList sentList) {
        SortedArray tokenList;
        ArrayList sList;
        Token wordToken, sentToken;
        int i, sentIndex, sentLength, wordPos, wordKey, tokenIndex;

        tokenList = new SortedArray();

        for (i = 0; i < sentList.size(); i++) {
            sentToken = (Token) sentList.get(i);
            sentIndex = Integer.parseInt(sentToken.getName());
            sentLength = sentMatrix.getNonZeroNumInRow(sentIndex);
            wordPos = sentToken.getIndex() + spanFromFirstWord;

            //the offset falls outside this sentence
            if (wordPos < 0 || wordPos >= sentLength)
                continue;

            wordKey = sentMatrix.getNonZeroColumnInRow(sentIndex, wordPos);
            wordToken = new Token(String.valueOf(wordKey));
            tokenIndex = tokenList.binarySearch(wordToken);
            if (tokenIndex < 0) {
                //first sentence showing this word at the offset
                sList = new ArrayList();
                sList.add(sentToken);
                wordToken.setFrequency(1);
                wordToken.setIndex(sentMatrix.getNonZeroIntScoreInRow(sentIndex, wordPos)); //the part of speech
                wordToken.setMemo(sList);
                tokenList.add(wordToken);
            }
            else {
                wordToken = (Token) tokenList.get(tokenIndex);
                wordToken.addFrequency(1);
                sList = (ArrayList) wordToken.getMemo();
                sList.add(sentToken);
            }
        }

        if (tokenList.size() == 0)
            return null;

        tokenList.setComparator(new FrequencyComparator(true));
        wordToken = (Token) tokenList.get(0);
        tokenList.clear();
        return wordToken;
    }

    /**
     * Looks up the text of a word.
     *
     * @param index the word key
     * @return the trimmed word text
     */
    protected String getWordContent(int index){
        return wordList.search(index).trim();
    }

    /**
     * Tells whether a word may be part of the left or right extension of a phrase.
     *
     * @return true for adjectives, nouns, and a "-" whose part-of-speech index is 0
     */
    protected boolean checkValidation(String word, int posIndex){
        return posIndex==Tagger.POS_ADJECTIVE || posIndex==Tagger.POS_NOUN || (posIndex==0 && word.equals("-"));
    }

    /**
     * Tells whether a word may be the last word of a phrase.
     *
     * @return true only for nouns
     */
    protected boolean checkEndingWordValidation(String word, int posIndex){
        return posIndex==Tagger.POS_NOUN;
    }

    /**
     * Tells whether a word may be the first word of a phrase.
     *
     * @return true for nouns and adjectives
     */
    protected boolean checkStartingWordValidation(String word, int posIndex){
        return posIndex==Tagger.POS_NOUN || posIndex==Tagger.POS_ADJECTIVE;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -