📄 approxdictionarychunker.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.dict;import com.aliasi.chunk.Chunk;import com.aliasi.chunk.ChunkFactory;import com.aliasi.chunk.Chunking;import com.aliasi.chunk.ChunkingImpl;import com.aliasi.chunk.Chunker;import com.aliasi.tokenizer.Tokenizer;import com.aliasi.tokenizer.TokenizerFactory;import com.aliasi.spell.WeightedEditDistance;import com.aliasi.util.Scored;import com.aliasi.util.Strings;import java.util.Arrays;import java.util.HashMap;import java.util.Iterator;/** * An <code>ApproxDictionaryChunker</code> implements a chunker that * produces chunks based on weighted edit distance of strings from * dictionary entries.  This is an approximate or &quot;fuzzy&quot; * dictionary matching strategy. * * <P>The underlying dictionary is required to be an instance of * {@link TrieDictionary} in order to support efficient search for * matches.  Other dictionaries can be easily converted to * trie dictionaries by adding their entries to a fresh trie * dictionary. * * <P>Entries are matched by weighted edit distance, as supplied by an * implementation of {@link WeightedEditDistance}.  All substrings * within the maximum distance specified at construction time are * returned as part of the chunking.  Keep in mind that weights for * weighted edit distance are specified as proximities, that is, as * negative distances. 
* * <h4>No Transposition</h4> * * <p>Transposition is not implemented in the approximate dictionary * chunker, so no matches are possible through * transposition. Specifically, the transpose weight method is never * called on the underlying weighted edit distance. * * <h4>Token Sensitivity</h4> *  * <P>The tokenizer factory supplied at construction time is * only used to constrain search by enforcing boundary conditions. * Chunks are only returned if they start on the first character * of a token and end on the last character of a token.   *  * <p>Using an instance of {@link * com.aliasi.tokenizer.CharacterTokenizerFactory} effectively removes * token sensitivity by treating every non-whitespace character as a * token and thus rendering every non-whitespace position a possible * chunk boundary. * * <h4>References</h4> * * <P>The approach implemented here is very similar to that described * in the following paper: * * <ul>  * * <li> Yoshimasa Tsuruoka and Jun'ichi Tsujii.  2003. <a * href="http://www-tsujii.is.s.u-tokyo.ac.jp/~tsuruoka/papers/acl03bio.pdf" * >Boosting precision and recall of dictionary-based protein name * recognition</a> In <i>Proceedings of the 2003 ACL workshop on NLP * in Biomedicine</i>. * </li> * </ul> * * The best general reference for approximate string matching * is: * * <ul> * <li> * Gusfield, Dan.  1997.  <i>Algorithms on Strings, Trees and Sequences</i>. * Cambridge University Press. * </li> * </ul> *  * @author Bob Carpenter * @version 3.3.1 * @since   LingPipe2.1 */public class ApproxDictionaryChunker implements Chunker {    private final TrieDictionary<String> mDictionary;    private final TokenizerFactory mTokenizerFactory;    private final WeightedEditDistance mEditDistance;    private double mDistanceThreshold;        /**     * Construct an approximate dictionary chunker from the specified     * dictionary, tokenizer factory, weighted edit distance and     * distance bound.  The dictionary is used for the candidate     * matches.  
The tokenizer factory is used for determining     * possible boundaries of matches, which must start on the first     * character of a token and end on the last character of a token.     * The edit distance is used for measuring substrings against     * dictionary entries.  The distance threshold specifies the     * maximum distance at which matches are returned.     *     * @param dictionary Dictionary to use for matching.     * @param tokenizerFactory Tokenizer factory for boundary     * determination.     * @param editDistance Matching distance measure.     * @param distanceThreshold Distance threshold for matching.     */    public ApproxDictionaryChunker(TrieDictionary<String> dictionary,                                   TokenizerFactory tokenizerFactory,                                   WeightedEditDistance editDistance,                                   double distanceThreshold) {        mDictionary = dictionary;        mTokenizerFactory = tokenizerFactory;        mEditDistance = editDistance;        mDistanceThreshold = distanceThreshold;    }    /**     * Returns the trie dictionary underlying this chunker.     * This is the actual dictionary used by the chunker, so changes     * to it will affect this chunker.     *      * @return The trie dictionary underlying this chunker.     */    public TrieDictionary<String> dictionary() {        return mDictionary;    }    /**     * Returns the weighted edit distance for matching with     * this chunker.  This is the actual edit distance used by     * the chunker, so changes to it will affect this chunker.     *     * @return The weighted edit distance for this chunker.     */    public WeightedEditDistance editDistance() {        return mEditDistance;    }    /**     * Returns the tokenizer factory for matching with this     * chunker.  This is the actual tokenizer factory used     * by this chunker, so changes to it will affect the     * behavior of this class.     
*
     * @return The tokenizer factory for this chunker.
     */
    public TokenizerFactory tokenizerFactory() {
        return this.mTokenizerFactory;
    }

    /**
     * Returns the maximum edit distance a string can be from a
     * dictionary entry in order to be returned by this chunker.
     * The value is initialized by the constructor and can be
     * replaced using {@link #setMaxDistance(double)}.
     *
     * @return The maximum edit distance for this chunker.
     */
    public double distanceThreshold() {
        return this.mDistanceThreshold;
    }

    /**
     * Set the max distance a string can be from a dictionary entry
     * in order to be returned as a chunk by this chunker.  The
     * value is stored as given, without validation, and is the value
     * subsequently reported by {@link #distanceThreshold()}.
     *
     * @param distanceThreshold New distance threshold for matching.
     */
    public void setMaxDistance(double distanceThreshold) {
        this.mDistanceThreshold = distanceThreshold;
    }

    /**
     * Return the approximate dictionary-based chunking for
     * the specified character sequence.  The sequence is converted
     * with {@code Strings.toCharArray} and the whole resulting array
     * is passed to {@link #chunk(char[],int,int)}.
     *
     * @param cSeq Character sequence to chunk.
     * @return Chunking of the specified character sequence.
     */
    public Chunking chunk(CharSequence cSeq) {
        final char[] chars = Strings.toCharArray(cSeq);
        final int length = chars.length;
        return chunk(chars, 0, length);
    }

    /**
     * Return the approximate dictionary-based chunking for the
     * specified character sequence.
     *
     * @param cs Underlying characters.
     * @param start Index of first character in the array.
     * @param end Index of one past the last character in the array.
     * @return Chunking of the specified character sequence.
     * @throws IllegalArgumentException If the indices are out of
     * bounds in the character sequence.
*/    public Chunking chunk(char[] cs, int start, int end) {        int length = end-start;        // token start/ends setup; throws exception if args wrong        Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,start,length);        boolean[] startTokens = new boolean[length];        boolean[] endTokens = new boolean[length+1];        Arrays.fill(startTokens,false);        Arrays.fill(endTokens,false);        String token;        while ((token = tokenizer.nextToken()) != null) {            int lastStart = tokenizer.lastTokenStartPosition();            startTokens[lastStart] = true;            endTokens[lastStart + token.length()] = true;
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -