📄 approxdictionarychunker.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
        }        HashMap dpToChunk = new HashMap();        HashMap queue = new HashMap();        for (int i = 0; i < length; ++i) {            int startPlusI = start + i;            char c = cs[startPlusI];            if (startTokens[i]) {                add(queue,mDictionary.mRootNode,startPlusI,                    0.0,                    false,dpToChunk,cs,startPlusI);            }            HashMap nextQueue = new HashMap();            double deleteCost = -mEditDistance.deleteWeight(c);            Iterator it = queue.values().iterator();            while (it.hasNext()) {                SearchState state = (SearchState) it.next();                // delete                add(nextQueue,state.mNode,state.mStartIndex,                    state.mScore + deleteCost,                    endTokens[i+1],dpToChunk,cs,startPlusI);                // match or subst                char[] dtrChars = state.mNode.mDtrChars;                Node[] dtrNodes = state.mNode.mDtrNodes;                for (int j = 0; j < dtrChars.length; ++j) {                    add(nextQueue,dtrNodes[j],state.mStartIndex,                        state.mScore                        - (dtrChars[j] == c                           ?  mEditDistance.matchWeight(dtrChars[j])                           : mEditDistance.substituteWeight(dtrChars[j],c)),                        endTokens[i+1],dpToChunk,cs,startPlusI);                }            }            queue = nextQueue;        }        ChunkingImpl result = new ChunkingImpl(cs,start,end);        Iterator it = dpToChunk.values().iterator();        while (it.hasNext())            result.add((Chunk)it.next());        return result;    }    void add(HashMap nextQueue, Node node, int startIndex,             double chunkScore,             boolean isTokenEnd, HashMap chunking, char[] cs, int end) {        if (chunkScore > mDistanceThreshold)            return;            SearchState state2             = new SearchState(node,startIndex,chunkScore);            SearchState exState = (SearchState) nextQueue.get(state2);        if (exState != null && exState.mScore < chunkScore)            return;        nextQueue.put(state2,state2);            // finish match at token end by adding each cat (may be 0)        if (isTokenEnd) {            for (int i = 0; i < node.mEntries.length; ++i) {                Chunk newChunk                     = ChunkFactory                    .createChunk(startIndex,end+1,                                 node.mEntries[i].category().toString(),                                 chunkScore);                Object dpNewChunk = new Dp(newChunk);                Chunk oldChunk = (Chunk) chunking.get(dpNewChunk);                if (oldChunk != null && oldChunk.score() <= chunkScore)                    continue;                chunking.remove(dpNewChunk);                chunking.put(dpNewChunk,newChunk);            }        }            // insert        for (int i = 0; i < node.mDtrChars.length; ++i)            add(nextQueue,node.mDtrNodes[i],startIndex,                chunkScore - mEditDistance.insertWeight(node.mDtrChars[i]),                isTokenEnd,chunking,cs,end);    }    // chunk's data less score for efficient dynamic programming key    static final class Dp {        final int mStart;        final int mEnd;        final String mType;        int mHashCode;        Dp(Chunk chunk) {            mStart = chunk.start();            mEnd = chunk.end();            mType = chunk.type();            mHashCode = mStart + 31 * (mEnd + 31 * mType.hashCode());        }        public int hashCode() {            return mHashCode;        }        public boolean equals(Object that) {            Dp thatDp = (Dp) that;            return mStart == thatDp.mStart                && mEnd == thatDp.mEnd                && mType.equals(thatDp.mType);        }    }    static final class SearchState implements Scored {        private final double mScore;        private final Node mNode;        private final int mStartIndex; // absolute in cs        SearchState(Node node, int startIndex) {            this(node,startIndex,0.0);        }        SearchState(Node node, int startIndex, double score) {            mNode = node;            mStartIndex = startIndex;            mScore = score;        }        public double score() {            return mScore;        }        public boolean equals(Object that) {            SearchState thatState = (SearchState) that;            return mStartIndex == thatState.mStartIndex                && mNode == thatState.mNode;        }        public int hashCode() {            return mStartIndex; //  + 31 * mNode.hashCode();        }        public String toString() {            return "SearchState(" + mNode                + ", " + mStartIndex                + ", " + mScore + ")";        }    }    /**     * This is a weighted edit distance defined by Tsuruoka and Tsujii     * for matching protein names in biomedical texts.  Reproducing     * table 1 from their paper provides the weighting function     * (converting slightly to our terminology and scale):     *     * <blockquote>     * <table border='1' cellpadding='5'>     * <tr><td><b><i>Operation</i></b></td>     *     <td><b><i>Character</i></b></td>     *     <td><b><i>Cost</i></b></td></tr>     *     * <tr><td rowspan='2'><i>Insertion</i></td>     *     <td>space or hyphen</td>     *     <td>-10</td></tr>     * <tr><td>other characters</td>     *     <td>-100</td></tr>     *     * <tr><td rowspan='2'><i>Deletion</i></td>     *     <td>space or hyphen</td>     *     <td>-10</td></tr>     * <tr><td>other characters</td>     *     <td>-100</td></tr>     *     * <tr><td rowspan='4'><i>Substitution</i></td>     *     <td>space for hyphen</td>     *     <td>-10</td></tr>     * <tr><td>digit for other digit</td>     *     <td>-10</td></tr>     * <tr><td>capital for lowercase</td>     *     <td>-10</td></tr>     * <tr><td>other characters</td>     *     <td>-50</td></tr>     *     * <tr><td><i>Match</i></td>     *     <td>any character</td>     *     <td>0</td></tr>     *     * <tr><td><i>Transposition</i></td>     *     <td>any characters</td>     *     <td>Double.NEGATIVE_INFINITY</td></tr>     *     * <tr><td colspan='3'><b>Tsuruoka and Tsujii's Weighted Edit Distance</b></td></tr>     * </table>     * </table>     * </blockquote>     *     * Tsuruoka and Tsujii's paper is available online:     *     * <blockquote> Yoshimasa Tsuruoka and Jun'ichi Tsujii.  2003. <a     * href="http://www-tsujii.is.s.u-tokyo.ac.jp/~tsuruoka/papers/acl03bio.pdf"     * >Boosting precision and recall of dictionary-based protein name     * recognition</a> In <i>Proceedings of the 2003 ACL workshop on     * NLP in Biomedicine</i>.     */    public static final WeightedEditDistance TT_DISTANCE = new TTDistance();    static final class TTDistance extends WeightedEditDistance {        public double deleteWeight(char cDeleted) {            return (cDeleted == ' ' || cDeleted == '-')                ? -10.0                : -100.0;        }        public double insertWeight(char cInserted) {            return deleteWeight(cInserted);        }        public double matchWeight(char cMatched) {            return 0.0;        }        public double substituteWeight(char cDeleted, char cInserted) {            if (cDeleted == ' ' && cInserted == '-')                return -10.0;            if (cDeleted == '-' && cInserted == ' ')                return -10.0;            if (Character.isDigit(cDeleted) && Character.isDigit(cInserted))                return -10.0;            if (Character.toLowerCase(cDeleted)                 == Character.toLowerCase(cInserted))                return -10.0;            return -50.0;        }        public double transposeWeight(char c1, char c2) {            return Double.NEGATIVE_INFINITY;        }    }}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -