ExactDictionaryChunker.java
     * @param end One past the index of the last character in the slice.
     * @return The chunking for the specified character slice.
     */
    public Chunking chunk(char[] cs, int start, int end) {
        ChunkingImpl chunking = new ChunkingImpl(cs,start,end);
        if (mMaxPhraseLength == 0) return chunking; // no dict entries
        CircularQueueInt queue = new CircularQueueInt(mMaxPhraseLength);
        Tokenizer tokenizer = tokenizer(cs,start,end);
        int tokenStartPos = tokenizer.nextWhitespace().length();
        TrieNode node = mTrieRootNode;
        String token;
        while ((token = tokenizer.nextToken()) != null) {
            queue.enqueue(tokenStartPos);
            int tokenEndPos = tokenStartPos + token.length();
            // fall back along suffix links until the match can be extended
            // by this token, or restart at the root
            while (true) {
                TrieNode daughterNode = node.getDaughter(token);
                if (daughterNode != null) {
                    node = daughterNode;
                    break;
                }
                if (node.mSuffixNode == null) {
                    node = mTrieRootNode.getDaughter(token);
                    if (node == null)
                        node = mTrieRootNode;
                    break;
                }
                node = node.mSuffixNode;
            }
            emit(node,queue,tokenEndPos,chunking);
            // also emit every shorter match ending here, via category suffix links
            for (TrieNode suffixNode = node.mSuffixNodeWithCategory;
                 suffixNode != null;
                 suffixNode = suffixNode.mSuffixNodeWithCategory) {
                emit(suffixNode,queue,tokenEndPos,chunking);
            }
            tokenStartPos = tokenEndPos + tokenizer.nextWhitespace().length();
        }
        return mReturnAllMatches
            ? chunking
            : restrictToLongest(chunking);
    }

    /**
     * Returns a string-based representation of this chunker.
     * The string includes the tokenizer factory's class name, whether
     * or not it returns all matches, whether or not it is case
     * sensitive, and also includes the entire trie underlying the
     * matcher, which is quite large for large dictionaries (multiple
     * lines per dictionary entry).
     *
     * @return String-based representation of this chunker.
     */
    public String toString() {
        StringBuffer sb = new StringBuffer();
        sb.append("ExactDictionaryChunker\n");
        sb.append("Tokenizer factory=" + mTokenizerFactory.getClass() + "\n");
        sb.append("Case sensitive=" + mCaseSensitive + "\n");
        sb.append("Return all matches=" + mReturnAllMatches + "\n\n");
        mTrieRootNode.toString(sb,0);
        return sb.toString();
    }

    void emit(TrieNode node, CircularQueueInt queue, int end,
              ChunkingImpl chunking) {
        ScoredCat[] scoredCats = node.mCategories;
        for (int i = 0; i < scoredCats.length; ++i) {
            // a match ending at a node of depth d started d tokens back
            int start = queue.get(node.depth());
            String type = scoredCats[i].mCat;
            double score = scoredCats[i].mScore;
            Chunk chunk = ChunkFactory.createChunk(start,end,type,score);
            chunking.add(chunk);
        }
    }

    final TrieNode compileTrie(Dictionary<String> dict) {
        TrieNode rootNode = new TrieNode(0);
        Iterator<DictionaryEntry<String>> it = dict.iterator();
        while (it.hasNext()) {
            DictionaryEntry<String> entry = it.next();
            String phrase = entry.phrase();
            char[] cs = phrase.toCharArray();
            Tokenizer tokenizer = tokenizer(cs,0,cs.length);
            int length = rootNode.add(tokenizer,entry);
            if (length > mMaxPhraseLength)
                mMaxPhraseLength = length;
        }
        computeSuffixes(rootNode,rootNode,new String[mMaxPhraseLength],0);
        return rootNode;
    }

    final void computeSuffixes(TrieNode node, TrieNode rootNode,
                               String[] tokens, int length) {
        // link to the longest proper suffix of this node's token path
        // that is also in the trie
        for (int i = 1; i < length; ++i) {
            TrieNode suffixNode = rootNode.getDaughter(tokens,i,length);
            if (suffixNode == null) continue;
            node.mSuffixNode = suffixNode;
            break;
        }
        // second loop could start where first left off
        for (int i = 1; i < length; ++i) {
            TrieNode suffixNode = rootNode.getDaughter(tokens,i,length);
            if (suffixNode == null) continue;
            if (suffixNode.mCategories.length == 0) continue;
            node.mSuffixNodeWithCategory = suffixNode;
            break;
        }
        if (node.mDaughterMap == null) return;
        Iterator edgeIt = node.mDaughterMap.entrySet().iterator();
        while (edgeIt.hasNext()) {
            Map.Entry entry = (Map.Entry) edgeIt.next();
            tokens[length] = entry.getKey().toString();
            TrieNode dtrNode = (TrieNode) entry.getValue();
            computeSuffixes(dtrNode,rootNode,tokens,length+1);
        }
    }

    static Chunking restrictToLongest(Chunking chunking) {
        ChunkingImpl result = new ChunkingImpl(chunking.charSequence());
        Set chunkSet = chunking.chunkSet();
        if (chunkSet.size() == 0) return chunking;
        Chunk[] chunks = new Chunk[chunkSet.size()];
        chunkSet.toArray(chunks);
        Arrays.sort(chunks,Chunk.LONGEST_MATCH_ORDER_COMPARATOR);
        int lastEnd = -1;
        for (int i = 0; i < chunks.length; ++i) {
            if (chunks[i].start() >= lastEnd) {
                result.add(chunks[i]);
                lastEnd = chunks[i].end();
            }
        }
        return result;
    }

    static class ScoredCat implements Scored {
        String mCat;
        double mScore;
        ScoredCat(String cat, double score) {
            mCat = cat;
            mScore = score;
        }
        public double score() {
            return mScore;
        }
        public String toString() {
            return mCat + ":" + mScore;
        }
    }

    static ScoredCat[] EMPTY_SCORED_CATS = new ScoredCat[0];

    static class TrieNode {
        int mDepth;
        HashMap mDaughterMap = null;
        ScoredCat[] mCategories = EMPTY_SCORED_CATS;
        TrieNode mSuffixNode;
        TrieNode mSuffixNodeWithCategory;
        TrieNode(int depth) {
            mDepth = depth;
        }
        public int depth() {
            return mDepth;
        }
        public void addEntry(DictionaryEntry entry) {
            // should just do this with a merge; tighter with parallel arrays
            ScoredCat[] newCats = new ScoredCat[mCategories.length+1];
            System.arraycopy(mCategories,0,newCats,0,mCategories.length);
            newCats[newCats.length-1]
                = new ScoredCat(entry.category().toString(),entry.score());
            Arrays.sort(newCats,ScoredObject.REVERSE_SCORE_COMPARATOR);
            mCategories = newCats;
        }
        public TrieNode getDaughter(String[] tokens, int start, int end) {
            TrieNode node = this;
            for (int i = start; i < end && node != null; ++i)
                node = node.getDaughter(tokens[i]);
            return node;
        }
        public TrieNode getDaughter(String token) {
            return mDaughterMap == null
                ? null
                : (TrieNode) mDaughterMap.get(token);
        }
        public TrieNode getOrCreateDaughter(String token) {
            TrieNode existingDaughter = getDaughter(token);
            if (existingDaughter != null) return existingDaughter;
            TrieNode newDaughter = new TrieNode(depth()+1);
            if (mDaughterMap == null)
                mDaughterMap = new HashMap(2);
            mDaughterMap.put(token,newDaughter);
            return newDaughter;
        }
        public int add(Tokenizer tokenizer, DictionaryEntry entry) {
            TrieNode node = this;
            String token;
            while ((token = tokenizer.nextToken()) != null)
                node = node.getOrCreateDaughter(token);
            node.addEntry(entry);
            return node.depth();
        }
        public String toString() {
            StringBuffer sb = new StringBuffer();
            toString(sb,0);
            return sb.toString();
        }
        String id() {
            return mDepth + ":" + Integer.toHexString(hashCode());
        }
        void toString(StringBuffer sb, int depth) {
            indent(sb,depth);
            sb.append(id());
            for (int i = 0; i < mCategories.length; ++i) {
                indent(sb,depth);
                sb.append("cat " + i + "=" + mCategories[i]);
            }
            if (mSuffixNode != null) {
                indent(sb,depth);
                sb.append("suffixNode=");
                sb.append(mSuffixNode.id());
            }
            if (mSuffixNodeWithCategory != null) {
                indent(sb,depth);
                sb.append("suffixNodeWithCat=");
                sb.append(mSuffixNodeWithCategory.id());
            }
            if (mDaughterMap == null) return;
            Iterator it = mDaughterMap.keySet().iterator();
            while (it.hasNext()) {
                String token = it.next().toString();
                indent(sb,depth);
                sb.append(token);
                getDaughter(token).toString(sb,depth+1);
            }
        }
        static void indent(StringBuffer sb, int depth) {
            sb.append("\n");
            for (int i = 0; i < depth; ++i)
                sb.append(" ");
        }
    }

    static class CircularQueueInt {
        final int[] mQueue;
        int mNextPos = 0;
        public CircularQueueInt(int size) {
            mQueue = new int[size];
            Arrays.fill(mQueue,0);
        }
        public void enqueue(int val) {
            mQueue[mNextPos] = val;
            if (++mNextPos == mQueue.length)
                mNextPos = 0;
        }
        public int get(int offset) {
            // should have: 0 < offset <= mQueue.length
            int pos = mNextPos - offset;
            if (pos < 0)
                pos += mQueue.length;
            return mQueue[pos];
        }
    }
}
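
Usage, for context (not part of the file above): a minimal sketch of driving this chunker through LingPipe's public API. It assumes the standard com.aliasi.dict and com.aliasi.tokenizer classes and the four-argument ExactDictionaryChunker constructor (dictionary, tokenizer factory, return-all-matches flag, case-sensitivity flag); verify the signatures against the LingPipe version you are using.

    import com.aliasi.chunk.Chunk;
    import com.aliasi.chunk.Chunking;
    import com.aliasi.chunk.ExactDictionaryChunker;
    import com.aliasi.dict.DictionaryEntry;
    import com.aliasi.dict.MapDictionary;
    import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;

    public class ExactDictionaryChunkerDemo {
        public static void main(String[] args) {
            // Dictionary of phrases mapped to chunk types with scores.
            MapDictionary<String> dictionary = new MapDictionary<String>();
            dictionary.addEntry(new DictionaryEntry<String>("Aho Corasick","ALGORITHM",1.0));
            dictionary.addEntry(new DictionaryEntry<String>("Corasick","PERSON",1.0));

            ExactDictionaryChunker chunker
                = new ExactDictionaryChunker(dictionary,
                                             IndoEuropeanTokenizerFactory.INSTANCE,
                                             true,    // return all matches
                                             false);  // not case sensitive

            Chunking chunking = chunker.chunk("The Aho Corasick algorithm scans once.");
            for (Chunk chunk : chunking.chunkSet())
                System.out.println(chunk.start() + "-" + chunk.end()
                                   + " " + chunk.type() + " " + chunk.score());
        }
    }

With the return-all-matches flag set to true, this prints both the overlapping ALGORITHM and PERSON chunks; setting it to false routes the result through restrictToLongest, which keeps only the longer ALGORITHM match.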
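The CircularQueueInt at the end is what lets chunk() recover start offsets in constant space: it retains the start positions of only the last mMaxPhraseLength tokens, so when a match ends at a node of depth d, get(d) looks back d tokens to where that match began. A self-contained sketch of the indexing (the driver class and sample offsets are hypothetical; the queue logic is copied from above):

    public class CircularQueueIntDemo {
        static class CircularQueueInt {
            final int[] mQueue;
            int mNextPos = 0;
            CircularQueueInt(int size) { mQueue = new int[size]; }
            void enqueue(int val) {
                mQueue[mNextPos] = val;
                if (++mNextPos == mQueue.length) mNextPos = 0;
            }
            int get(int offset) {  // requires 0 < offset <= mQueue.length
                int pos = mNextPos - offset;
                if (pos < 0) pos += mQueue.length;
                return mQueue[pos];
            }
        }

        public static void main(String[] args) {
            CircularQueueInt queue = new CircularQueueInt(3); // max phrase length 3
            int[] tokenStarts = { 0, 4, 10, 17 };             // hypothetical token offsets
            for (int start : tokenStarts)
                queue.enqueue(start);
            // After the fourth token, get(d) gives the start of the
            // d-token phrase ending at the current token.
            System.out.println(queue.get(1)); // 17: one-token phrase
            System.out.println(queue.get(2)); // 10: two-token phrase
            System.out.println(queue.get(3)); // 4: three-token phrase; 0 was overwritten
        }
    }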