📄 approxdictionarychunker.java
字号:
} HashMap dpToChunk = new HashMap(); HashMap queue = new HashMap(); for (int i = 0; i < length; ++i) { int startPlusI = start + i; char c = cs[startPlusI]; if (startTokens[i]) { add(queue,mDictionary.mRootNode,startPlusI, 0.0, false,dpToChunk,cs,startPlusI); } HashMap nextQueue = new HashMap(); double deleteCost = -mEditDistance.deleteWeight(c); Iterator it = queue.values().iterator(); while (it.hasNext()) { SearchState state = (SearchState) it.next(); // delete add(nextQueue,state.mNode,state.mStartIndex, state.mScore + deleteCost, endTokens[i+1],dpToChunk,cs,startPlusI); // match or subst char[] dtrChars = state.mNode.mDtrChars; Node[] dtrNodes = state.mNode.mDtrNodes; for (int j = 0; j < dtrChars.length; ++j) { add(nextQueue,dtrNodes[j],state.mStartIndex, state.mScore - (dtrChars[j] == c ? mEditDistance.matchWeight(dtrChars[j]) : mEditDistance.substituteWeight(dtrChars[j],c)), endTokens[i+1],dpToChunk,cs,startPlusI); } } queue = nextQueue; } ChunkingImpl result = new ChunkingImpl(cs,start,end); Iterator it = dpToChunk.values().iterator(); while (it.hasNext()) result.add((Chunk)it.next()); return result; } void add(HashMap nextQueue, Node node, int startIndex, double chunkScore, boolean isTokenEnd, HashMap chunking, char[] cs, int end) { if (chunkScore > mDistanceThreshold) return; SearchState state2 = new SearchState(node,startIndex,chunkScore); SearchState exState = (SearchState) nextQueue.get(state2); if (exState != null && exState.mScore < chunkScore) return; nextQueue.put(state2,state2); // finish match at token end by adding each cat (may be 0) if (isTokenEnd) { for (int i = 0; i < node.mEntries.length; ++i) { Chunk newChunk = ChunkFactory .createChunk(startIndex,end+1, node.mEntries[i].category().toString(), chunkScore); Object dpNewChunk = new Dp(newChunk); Chunk oldChunk = (Chunk) chunking.get(dpNewChunk); if (oldChunk != null && oldChunk.score() <= chunkScore) continue; chunking.remove(dpNewChunk); chunking.put(dpNewChunk,newChunk); } } // insert for (int i = 0; i < node.mDtrChars.length; ++i) add(nextQueue,node.mDtrNodes[i],startIndex, chunkScore - mEditDistance.insertWeight(node.mDtrChars[i]), isTokenEnd,chunking,cs,end); } // chunk's data less score for efficient dynamic programming key static final class Dp { final int mStart; final int mEnd; final String mType; int mHashCode; Dp(Chunk chunk) { mStart = chunk.start(); mEnd = chunk.end(); mType = chunk.type(); mHashCode = mStart + 31 * (mEnd + 31 * mType.hashCode()); } public int hashCode() { return mHashCode; } public boolean equals(Object that) { Dp thatDp = (Dp) that; return mStart == thatDp.mStart && mEnd == thatDp.mEnd && mType.equals(thatDp.mType); } } static final class SearchState implements Scored { private final double mScore; private final Node mNode; private final int mStartIndex; // absolute in cs SearchState(Node node, int startIndex) { this(node,startIndex,0.0); } SearchState(Node node, int startIndex, double score) { mNode = node; mStartIndex = startIndex; mScore = score; } public double score() { return mScore; } public boolean equals(Object that) { SearchState thatState = (SearchState) that; return mStartIndex == thatState.mStartIndex && mNode == thatState.mNode; } public int hashCode() { return mStartIndex; // + 31 * mNode.hashCode(); } public String toString() { return "SearchState(" + mNode + ", " + mStartIndex + ", " + mScore + ")"; } } /** * This is a weighted edit distance defined by Tsuruoka and Tsujii * for matching protein names in biomedical texts. Reproducing * table 1 from their paper provides the weighting function * (converting slightly to our terminology and scale): * * <blockquote> * <table border='1' cellpadding='5'> * <tr><td><b><i>Operation</i></b></td> * <td><b><i>Character</i></b></td> * <td><b><i>Cost</i></b></td></tr> * * <tr><td rowspan='2'><i>Insertion</i></td> * <td>space or hyphen</td> * <td>-10</td></tr> * <tr><td>other characters</td> * <td>-100</td></tr> * * <tr><td rowspan='2'><i>Deletion</i></td> * <td>space or hyphen</td> * <td>-10</td></tr> * <tr><td>other characters</td> * <td>-100</td></tr> * * <tr><td rowspan='4'><i>Substitution</i></td> * <td>space for hyphen</td> * <td>-10</td></tr> * <tr><td>digit for other digit</td> * <td>-10</td></tr> * <tr><td>capital for lowercase</td> * <td>-10</td></tr> * <tr><td>other characters</td> * <td>-50</td></tr> * * <tr><td><i>Match</i></td> * <td>any character</td> * <td>0</td></tr> * * <tr><td><i>Transposition</i></td> * <td>any characters</td> * <td>Double.NEGATIVE_INFINITY</td></tr> * * <tr><td colspan='3'><b>Tsuruoka and Tsujii's Weighted Edit Distance</b></td></tr> * </table> * </table> * </blockquote> * * Tsuruoka and Tsujii's paper is available online: * * <blockquote> Yoshimasa Tsuruoka and Jun'ichi Tsujii. 2003. <a * href="http://www-tsujii.is.s.u-tokyo.ac.jp/~tsuruoka/papers/acl03bio.pdf" * >Boosting precision and recall of dictionary-based protein name * recognition</a> In <i>Proceedings of the 2003 ACL workshop on * NLP in Biomedicine</i>. */ public static final WeightedEditDistance TT_DISTANCE = new TTDistance(); static final class TTDistance extends WeightedEditDistance { public double deleteWeight(char cDeleted) { return (cDeleted == ' ' || cDeleted == '-') ? -10.0 : -100.0; } public double insertWeight(char cInserted) { return deleteWeight(cInserted); } public double matchWeight(char cMatched) { return 0.0; } public double substituteWeight(char cDeleted, char cInserted) { if (cDeleted == ' ' && cInserted == '-') return -10.0; if (cDeleted == '-' && cInserted == ' ') return -10.0; if (Character.isDigit(cDeleted) && Character.isDigit(cInserted)) return -10.0; if (Character.toLowerCase(cDeleted) == Character.toLowerCase(cInserted)) return -10.0; return -50.0; } public double transposeWeight(char c1, char c2) { return Double.NEGATIVE_INFINITY; } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -