📄 chunkingevaluation.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
字号:
/* * LingPipe v. 3.5 * Copyright (C) 2003-2008 Alias-i * * This program is licensed under the Alias-i Royalty Free License * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i * Royalty Free License Version 1 for more details. * * You should have received a copy of the Alias-i Royalty Free License * Version 1 along with this program; if not, visit * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211, * +1 (718) 290-9170. */package com.aliasi.chunk;import com.aliasi.classify.PrecisionRecallEvaluation;import com.aliasi.util.Strings;import java.util.Collections;import java.util.HashSet;import java.util.Iterator;import java.util.Set;/** * A <code>ChunkingEvaluation</code> stores and reports the results of * evaluating response chunkings against reference chunkings.  Cases * to evaluate are supplied in the form of a reference and response * chunking through the method {@link #addCase(Chunking,Chunking)}. * * <P>The sets of true positive, * false positive and false negative chunks are available through the * methods {@link #truePositiveSet()}, {@link #falsePositiveSet()}, * and {@link #falseNegativeSet()}.  True positives are chunks that * are in both the reference and response, false positives are chunks * in the response but not the reference, and false negatives are in * the reference, but not the response.  There is no notion of true * negative in this task, a fact that is reflected in the results of * the precision-recall evaluation. * * <P>The main method of reporting is through an instance of {@link * com.aliasi.classify.ScoredPrecisionRecallEvaluation} returned by * the method {@link #precisionRecallEvaluation()}.  The return result * provides an object capable of extensive reporting for scored * classification tasks such as chunking.  The instances of true and * false positive and negatives are described above; their scores are * derived from response scores. *  * <P>This evaluator works solely on the basis of chunk offset and * exact match. There is no notion of alignment or mapping, as found, * for example, in the <a * href="http://www.itl.nist.gov/iaui/894.02/related_projects/muc/muc_sw/muc_sw_manual.html">MUC * Scoring Software User's Manual</a>, and its descendants such as the * <a * href="http://www.itl.nist.gov/iad/894.01/tests/ace/ace05/doc/ace05-evalplan.v2a.pdf">2005 * ACE Evaluation Plan</a>.  In this regard, we follow the model of * <a href="http://acl.ldc.upenn.edu/W/W00/W00-0726.pdf">CoNLL 2000 * Chunking Task</a>. * * <P>This evaluation is able to handle overlapping chunks with * results being reported in the same manner.  In particular, the * labeled precision and recall components of the approach that later * became known as <a * href="http://acl.ldc.upenn.edu/H/H91/H91-1060.pdf">PARSEVAL</a> can * be generated by using the <code>ChunkingEvaluation</code> class. *  * @author  Bob Carpenter * @version 3.0 * @since   LingPipe2.1 */public class ChunkingEvaluation {    private final Set<Chunking[]> mCases = new HashSet<Chunking[]>();    private final Set<ChunkAndCharSeq> mTruePositiveSet         = new HashSet<ChunkAndCharSeq>();    private final Set<ChunkAndCharSeq> mFalsePositiveSet        = new HashSet<ChunkAndCharSeq>();    private final Set<ChunkAndCharSeq> mFalseNegativeSet        = new HashSet<ChunkAndCharSeq>();    String mLastCase = null;    /**     * Construct a chunking evaluation.     */    public ChunkingEvaluation() {         /* do nothing */    }    /**     * Return the set of cases consisting of pairs of reference and     * response chunkings.  The elements of the set returned are of     * type <code>Chunking[]</code>, with the first element being the     * reference chunk and the second element being the response     * chunk.     *     * <P>The set returned is an unmodifiable view of the underlying     * set of cases and will change as cases are added to this     * evaluation.     *     * @return The set of cases.     */    public Set<Chunking[]> cases() {        return Collections.<Chunking[]>unmodifiableSet(mCases);    }    /**     * Returns a chunking evaluation which consists of the current     * chunking evaluation restricted to the specified type.  A     * new evaluation is constructed and populated with the same     * cases as this evaluation, but with the reference and response     * chunkings both restricted to only include answers of the     * specified type.     *      * @param chunkType Type of chunk to be evaluated.     * @return ChunkingEvaluation Evaluation for this type.     */    public ChunkingEvaluation perTypeEvaluation(String chunkType) {        ChunkingEvaluation evaluation = new ChunkingEvaluation();        Set cases = cases();        Iterator it = cases.iterator();        while (it.hasNext()) {            Chunking[] testCase = (Chunking[]) it.next();            Chunking referenceChunking = testCase[0];            Chunking responseChunking = testCase[1];            Chunking referenceChunkingRestricted                = restrictTo(referenceChunking,chunkType);            Chunking responseChunkingRestricted                = restrictTo(responseChunking,chunkType);            evaluation.addCase(referenceChunkingRestricted,                               responseChunkingRestricted);        }        return evaluation;    }    static Chunking restrictTo(Chunking chunking, String type) {        CharSequence cs = chunking.charSequence();        ChunkingImpl chunkingOut = new ChunkingImpl(cs);        for (Chunk chunk : chunking.chunkSet())            if (chunk.type().equals(type))                chunkingOut.add(chunk);        return chunkingOut;    }        static String formatChunks(Chunking chunking) {        StringBuffer sb = new StringBuffer();        Set chunkSet = chunking.chunkSet();        Iterator it = chunkSet.iterator();        int pos = 0;        while (it.hasNext()) {            Chunk chunk = (Chunk) it.next();            int start = chunk.start();            int padLength = start-pos;            for (int j = 0; j < padLength; ++j)                sb.append(" ");            int end = chunk.end();            int chunkLength = end-start;            char marker = chunk.type().length() > 0                ? chunk.type().charAt(0)                : '!';            if (chunkLength > 0) sb.append(marker);            for (int j = 1; j < chunkLength; ++j)                sb.append(".");            pos = end;        }        sb.append("\n");        return sb.toString();    }    static String formatHeader(int indent, Chunking chunking) {        String cs = chunking.charSequence().toString();        StringBuffer sb = new StringBuffer();        for (int i = 0; i < indent; ++i) sb.append(" ");        sb.append("CHUNKS= ");        Iterator it = chunking.chunkSet().iterator();        while (it.hasNext()) {            Chunk chunk = (Chunk) it.next();            sb.append("(" + chunk.start() + "," + chunk.end() + "):" + chunk.type() + "   ");        }        if (sb.charAt(sb.length()-1) != '\n') sb.append("\n");        for (int i = 0; i < indent; ++i) sb.append(" ");        sb.append(cs);        sb.append("\n");        int length = cs.length();        printMods(1,length, sb,indent);        printMods(10,length, sb,indent);        printMods(100,length, sb,indent);        if (sb.charAt(sb.length()-1) != '\n') sb.append("\n");        return sb.toString();    }    static void printMods(int base, int length, StringBuffer sb, int indent) {        if (length <= base) return;        for (int i = 0; i < indent; ++i) sb.append(" ");        for (int i = 0; i < length; ++i) {            if (base == 1 || (i >= base && i % 10 == 0))                sb.append(Integer.toString((i/base)%10));            else                sb.append(" ");        }        sb.append("\n");    }    /**     * Add an evaluation case consisting of a reference chunk     * set and a response chunk set.     *     * @param referenceChunking Chunking of reference chunks.     * @param responseChunking Chunking of response chunks.     * @throws IllegalArgumentException If the chunkings are not     * over the same character sequence.     */    public void addCase(Chunking referenceChunking,                         Chunking responseChunking) {        StringBuffer sb = new StringBuffer();            CharSequence cSeq = referenceChunking.charSequence();        if (!Strings.equalCharSequence(cSeq,                                       responseChunking.charSequence())) {            String msg = "Char sequences must be same."                + " Reference char seq=" + cSeq                + " Response char seq=" + responseChunking.charSequence();            throw new IllegalArgumentException(msg);        }        sb.append("\n");        sb.append(formatHeader(5,referenceChunking)); // 5 is indent for " REF " and "RESP "        sb.append("\n REF ");        sb.append(formatChunks(referenceChunking));        sb.append("RESP ");        sb.append(formatChunks(responseChunking));        sb.append("\n");        mLastCase = sb.toString();            mCases.add(new Chunking[] { referenceChunking, responseChunking });        // need mutable sets, so wrap        Set refSet = unscoredChunkSet(referenceChunking);        Set respSet = unscoredChunkSet(responseChunking);        Iterator it = respSet.iterator();        while (it.hasNext()) {            Chunk respChunk = (Chunk) it.next();            boolean inRef = refSet.remove(respChunk);            ChunkAndCharSeq ccs = new ChunkAndCharSeq(respChunk,cSeq);            if (inRef) {                mTruePositiveSet.add(ccs);            } else {                mFalsePositiveSet.add(ccs);            }        }        Iterator it2 = refSet.iterator();        while (it2.hasNext()) {            Chunk refChunk = (Chunk) it2.next();            mFalseNegativeSet.add(new ChunkAndCharSeq(refChunk,cSeq));        }    }    static Set unscoredChunkSet(Chunking chunking) {        HashSet result = new HashSet();        Iterator it = chunking.chunkSet().iterator();        while (it.hasNext()) {            Chunk chunk = (Chunk) it.next();            result.add(ChunkFactory.createChunk(chunk.start(),                                                chunk.end(),                                                chunk.type()));        }        return result;    }    /**     * Returns the set of true positives.  True positives are chunks     * that were in both a reference and response chunking case. The     * set returned contains instances of {@link ChunkAndCharSeq},     * which combine a chunk and a character sequence.     *     * <P> The set is unmodifiable, but tracks the changes in this     * evaluator.     *     * @return The set of true positives.     */    public Set<ChunkAndCharSeq> truePositiveSet() {        return Collections.<ChunkAndCharSeq>unmodifiableSet(mTruePositiveSet);    }    /**     * Returns the set of false positives.  False positives are     * response chunks that are not reference chunks. The set returned     * contains instances of {@link ChunkAndCharSeq}, which combine a     * chunk and a character sequence.     *     * <P> The set is unmodifiable, but tracks the changes in this     * evaluator.     *     * @return The set of false positives.     */    public Set<ChunkAndCharSeq> falsePositiveSet() {        return Collections.<ChunkAndCharSeq>unmodifiableSet(mFalsePositiveSet);    }    /**     * Returns the set of false negatives.  False negatives are     * reference chunks which are not response chunks.  The set     * returned contains instances of {@link ChunkAndCharSeq}, which     * combine a chunk and a character sequence.     *     * <P> The set is unmodifiable, but tracks the changes in this     * evaluator.     *     * @return The set of false negatives.     */    public Set<ChunkAndCharSeq> falseNegativeSet() {        return Collections.<ChunkAndCharSeq>unmodifiableSet(mFalseNegativeSet);    }    /**     * Return the scored precision-recall evaluation for this chunker.     * This is a copy of the precision-recall evaluation and changes to     * it will not affect the results returned by this class.     *     * @return The precision-recall evaluation.     */    public PrecisionRecallEvaluation precisionRecallEvaluation() {        int tp = truePositiveSet().size();        int fn = falseNegativeSet().size();        int fp = falsePositiveSet().size();        return new PrecisionRecallEvaluation(tp,fn,fp,0);    }    /**     * Returns the precision-recall evaluation for this chunking     * as a string.     *     * @return This evaluation as a string.     */    public String toString() {        return precisionRecallEvaluation().toString();    }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -