📄 sentenceevaluation.java

📁 一个自然语言处理的Java开源工具包。LingPipe目前已有很丰富的功能
💻 JAVA
字号:
/*
 * LingPipe v. 3.5
 * Copyright (C) 2003-2008 Alias-i
 *
 * This program is licensed under the Alias-i Royalty Free License
 * Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
 * Royalty Free License Version 1 for more details.
 *
 * You should have received a copy of the Alias-i Royalty Free License
 * Version 1 along with this program; if not, visit
 * http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
 * Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
 * +1 (718) 290-9170.
 */

package com.aliasi.sentences;

import com.aliasi.classify.PrecisionRecallEvaluation;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.ChunkAndCharSeq;
import com.aliasi.chunk.Chunking;
import com.aliasi.chunk.ChunkingEvaluation;

import com.aliasi.util.Strings;

import java.util.Collections;
import java.util.HashSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * A <code>SentenceEvaluation</code> stores and reports the results of
 * evaluating a set of reference sentence chunkings and response
 * sentence chunkings. Evaluation results are available through the
 * method {@link #chunkingEvaluation()}, which returns the evaluation
 * of the sentences as chunkings, and the method {@link
 * #endBoundaryEvaluation()}, which returns the evaluation of the
 * sentence chunkings solely on the basis of the end boundary index
 * of each sentence.
 *
 * <p>Instances accumulate state across calls to {@link
 * #addCase(Chunking,Chunking)}; no synchronization is performed by
 * this class.
 *
 * @author  Bob Carpenter
 * @version 3.0
 * @since   LingPipe2.1
 */
public class SentenceEvaluation {

    // Line prefixes used by sentenceCaseToString(); all three have the
    // same width so the annotation rows line up under the text row.
    private static final String REF_PREFIX = "ref:  ";
    private static final String TEXT_PREFIX = "text: ";
    private static final String RESP_PREFIX = "resp: ";

    private final ChunkingEvaluation mChunkingEvaluation;
    private final PrecisionRecallEvaluation mEndBoundaryEvaluation;

    private final Set<ChunkAndCharSeq> mTPBoundaries
        = new HashSet<ChunkAndCharSeq>();
    private final Set<ChunkAndCharSeq> mFPBoundaries
        = new HashSet<ChunkAndCharSeq>();
    private final Set<ChunkAndCharSeq> mFNBoundaries
        = new HashSet<ChunkAndCharSeq>();

    // [0] = reference, [1] = response of the most recent addCase() call;
    // both entries stay null until the first case has been added.
    private final Chunking[] mLastCase = new Chunking[2];

    /**
     * Construct a sentence evaluation.
     */
    public SentenceEvaluation() {
        mChunkingEvaluation = new ChunkingEvaluation();
        mEndBoundaryEvaluation = new PrecisionRecallEvaluation();
    }

    /**
     * Add the case corresponding to the specified reference and
     * response chunkings.  The chunkings should only contain sentence
     * chunks with chunk type {@link
     * SentenceChunker#SENTENCE_CHUNK_TYPE}.
     *
     * <p>Besides being forwarded to the chunking evaluation, the case
     * is scored on end boundaries alone: a response chunk whose end
     * index equals that of a not-yet-matched reference chunk is a true
     * positive, any other response chunk is a false positive, and
     * every reference chunk left unmatched is a false negative.
     *
     * @param referenceChunking The reference chunking.
     * @param responseChunking The response chunking.
     * @throws IllegalArgumentException If the reference chunking and
     * response chunking are not over the same characters or if either
     * contains chunks of type other than
     * <code>SentenceChunker.SENTENCE_CHUNK_TYPE</code>.
     */
    public void addCase(Chunking referenceChunking,
                        Chunking responseChunking) {
        verifySameCharSequence(referenceChunking, responseChunking);
        verifySentenceTypes("reference", referenceChunking);
        verifySentenceTypes("response", responseChunking);

        mChunkingEvaluation.addCase(referenceChunking, responseChunking);
        mLastCase[0] = referenceChunking;
        mLastCase[1] = responseChunking;

        // Evaluate the chunkings solely on the basis of end indices.
        CharSequence cSeq = referenceChunking.charSequence();
        Map<Integer,Chunk> unmatchedRefByEnd = new HashMap<Integer,Chunk>();
        for (Chunk refChunk : referenceChunking.chunkSet()) {
            unmatchedRefByEnd.put(Integer.valueOf(refChunk.end()), refChunk);
        }

        for (Chunk respChunk : responseChunking.chunkSet()) {
            ChunkAndCharSeq ccs = new ChunkAndCharSeq(respChunk, cSeq);
            Integer end = Integer.valueOf(respChunk.end());
            // Removing on a hit means each reference boundary can be
            // matched by at most one response chunk.
            if (unmatchedRefByEnd.remove(end) != null) {
                mTPBoundaries.add(ccs);
                mEndBoundaryEvaluation.addCase(true, true);
            } else {
                mFPBoundaries.add(ccs);
                mEndBoundaryEvaluation.addCase(false, true);
            }
        }

        // Whatever is still in the map was never matched by the response.
        for (Chunk refChunk : unmatchedRefByEnd.values()) {
            mFNBoundaries.add(new ChunkAndCharSeq(refChunk, cSeq));
            mEndBoundaryEvaluation.addCase(true, false);
        }
    }

    /**
     * Return the chunking evaluation for this sentence evaluation.
     * This is the evaluation based purely on the chunks and their
     * matching, not on any sentence-specific evaluation.
     *
     * @return The chunking evaluation for this sentence evaluation.
     */
    public ChunkingEvaluation chunkingEvaluation() {
        return mChunkingEvaluation;
    }

    /**
     * Returns a precision/recall evaluation based on the end
     * boundaries in the reference and response cases.  End boundaries
     * in the response and the reference are true positives, those in
     * the response but not the reference are false positives, and those
     * in the reference but not the response are false negatives.
     *
     * @return The end-boundary precision/recall evaluation.
     */
    public PrecisionRecallEvaluation endBoundaryEvaluation() {
        return mEndBoundaryEvaluation;
    }

    /**
     * Return the set of chunks and character sequences whose end
     * boundaries are in both the reference and response.
     *
     * @return An unmodifiable view of the true-positive end-boundary
     * chunks.
     */
    public Set<ChunkAndCharSeq> truePositiveEndBoundaries() {
        return Collections.<ChunkAndCharSeq>unmodifiableSet(mTPBoundaries);
    }

    /**
     * Return the set of chunks and character sequences whose end
     * boundaries are in response but not the reference chunking.
     *
     * @return An unmodifiable view of the false-positive end-boundary
     * chunks.
     */
    public Set<ChunkAndCharSeq> falsePositiveEndBoundaries() {
        return Collections.<ChunkAndCharSeq>unmodifiableSet(mFPBoundaries);
    }

    /**
     * Return the set of chunks and character sequences whose end
     * boundaries are in reference but not the response chunking.
     *
     * @return An unmodifiable view of the false-negative end-boundary
     * chunks.
     */
    public Set<ChunkAndCharSeq> falseNegativeEndBoundaries() {
        return Collections.<ChunkAndCharSeq>unmodifiableSet(mFNBoundaries);
    }

    /**
     * Checks that every chunk of the specified chunking has type
     * {@link SentenceChunker#SENTENCE_CHUNK_TYPE}.
     *
     * @param input Label naming the chunking being checked (the
     * callers in this class pass "reference" or "response"); it is
     * reported in the exception message.
     * @param chunking Chunking whose chunk types are verified.
     * @throws IllegalArgumentException If any chunk has a type other
     * than the sentence chunk type.
     */
    static void verifySentenceTypes(String input, Chunking chunking) {
        for (Chunk chunk : chunking.chunkSet()) {
            // Constant on the left so a null chunk type is reported as
            // an illegal argument rather than a NullPointerException.
            if (!SentenceChunker.SENTENCE_CHUNK_TYPE.equals(chunk.type())) {
                String msg = "Chunk must have sentence type."
                    + " Found type=" + chunk.type()
                    + " in " + input + " chunking";
                throw new IllegalArgumentException(msg);
            }
        }
    }

    /**
     * Returns a formatted listing for the last case that was
     * evaluated, showing the underlying character sequence for the
     * chunkings, annotated with the sentence boundaries from the
     * reference and response chunking, with linebreaks inserted every
     * lineLength characters.  The lineLength must be a positive
     * integer.
     *
     * @param lineLength The line length of the formatted string.
     * @return The formatted listing, as produced by {@link
     * #sentenceCaseToString(Chunking,Chunking,int)} for the last case.
     * @throws IllegalArgumentException If the lineLength is &lt; 1.
     * @throws IllegalStateException If no cases have been evaluated.
     */
    public String lastCaseToString(int lineLength) {
        if (lineLength < 1) {
            String msg = "Line length must be greater than 0.";
            throw new IllegalArgumentException(msg);
        }
        if (mLastCase[0] == null || mLastCase[1] == null) {
            String msg = "No cases have been evaluated.";
            throw new IllegalStateException(msg);
        }
        return sentenceCaseToString(mLastCase[0], mLastCase[1], lineLength);
    }

    /**
     * Given a pair of reference and response chunkings, returns a
     * string showing the underlying character sequence of the
     * chunking pair, annotated with the sentence boundaries from each
     * chunking, with linebreaks inserted every lineLength characters.
     * The chunkings should only contain sentence chunks with chunk
     * type {@link SentenceChunker#SENTENCE_CHUNK_TYPE}. The lineLength
     * must be a positive integer.
     *
     * <p>The output is a sequence of three-row groups
     * (<code>ref:</code>, <code>text:</code>, <code>resp:</code>), each
     * group followed by a blank line, with one additional blank line
     * at the very end.  Under each character, the annotation rows
     * carry <code>+</code> in both rows if the character is the last
     * character (index <code>end()-1</code>) of a chunk in both
     * chunkings, <code>X</code> in the row of the one chunking that
     * ends a chunk there (and <code>-</code> in the other row), and
     * <code>-</code> in both rows otherwise.  An empty character
     * sequence yields a single group containing only the row labels.
     *
     * @param referenceChunking The reference chunking.
     * @param responseChunking The response chunking.
     * @param lineLength The line length of the formatted string.
     * @return The annotated, line-wrapped listing.
     * @throws IllegalArgumentException If the reference chunking and
     * response chunking are not over the same characters or if either
     * contains chunks of type other than
     * <code>SentenceChunker.SENTENCE_CHUNK_TYPE</code>, or if the
     * lineLength is &lt; 1.
     */
    public static String sentenceCaseToString(Chunking referenceChunking,
                                              Chunking responseChunking,
                                              int lineLength) {
        if (lineLength < 1) {
            String msg = "Line length must be greater than 0.";
            throw new IllegalArgumentException(msg);
        }
        verifySameCharSequence(referenceChunking, responseChunking);
        verifySentenceTypes("reference", referenceChunking);
        verifySentenceTypes("response", responseChunking);

        CharSequence cSeq = referenceChunking.charSequence();
        // Set lookups (rather than a forward-only cursor over an array)
        // keep the markers correct even if chunkSet() does not iterate
        // in ascending end order or contains duplicate/zero end indices.
        Set<Integer> refEnds = lastCharIndices(referenceChunking);
        Set<Integer> respEnds = lastCharIndices(responseChunking);

        StringBuilder sbOut = new StringBuilder();
        StringBuilder refLine = new StringBuilder(REF_PREFIX);
        StringBuilder textLine = new StringBuilder(TEXT_PREFIX);
        StringBuilder respLine = new StringBuilder(RESP_PREFIX);

        int cLen = cSeq.length();
        for (int i = 0; i < cLen; i++) {
            textLine.append(cSeq.charAt(i));

            Integer pos = Integer.valueOf(i);
            boolean refBoundary = refEnds.contains(pos);
            boolean respBoundary = respEnds.contains(pos);
            if (refBoundary && respBoundary) {
                refLine.append('+');
                respLine.append('+');
            } else if (refBoundary) {
                refLine.append('X');
                respLine.append('-');
            } else if (respBoundary) {
                refLine.append('-');
                respLine.append('X');
            } else {
                refLine.append('-');
                respLine.append('-');
            }

            // Break after every lineLength-th character; i is 0-based,
            // so the count of characters emitted so far is i + 1.
            if ((i + 1) % lineLength == 0) {
                appendGroup(sbOut, refLine, textLine, respLine);
                // Truncate back to the row labels for the next group.
                refLine.setLength(REF_PREFIX.length());
                textLine.setLength(TEXT_PREFIX.length());
                respLine.setLength(RESP_PREFIX.length());
            }
        }

        // Flush the partial last group.  An empty group is only written
        // when there is no text at all, so output never ends with a
        // spurious label-only group after a full final line.
        if (cLen == 0 || textLine.length() > TEXT_PREFIX.length()) {
            appendGroup(sbOut, refLine, textLine, respLine);
        }
        sbOut.append('\n');
        return sbOut.toString();
    }

    /**
     * Throws an exception if the two chunkings are not over equal
     * character sequences.
     */
    private static void verifySameCharSequence(Chunking referenceChunking,
                                               Chunking responseChunking) {
        if (!Strings.equalCharSequence(referenceChunking.charSequence(),
                                       responseChunking.charSequence())) {
            String msg = "Underlying char sequences must have same characters."
                + " Found referenceChunking.charSequence()="
                + referenceChunking.charSequence()
                + " responseChunking.charSequence()="
                + responseChunking.charSequence();
            throw new IllegalArgumentException(msg);
        }
    }

    /**
     * Returns the set of indices <code>end()-1</code> over the chunks
     * of the specified chunking.
     */
    private static Set<Integer> lastCharIndices(Chunking chunking) {
        Set<Integer> indices = new HashSet<Integer>();
        for (Chunk chunk : chunking.chunkSet()) {
            indices.add(Integer.valueOf(chunk.end() - 1));
        }
        return indices;
    }

    /**
     * Appends one ref/text/resp group followed by a blank line to the
     * specified output buffer.
     */
    private static void appendGroup(StringBuilder out,
                                    CharSequence refLine,
                                    CharSequence textLine,
                                    CharSequence respLine) {
        out.append(refLine).append('\n');
        out.append(textLine).append('\n');
        out.append(respLine).append('\n');
        out.append('\n');
    }
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -