documentviewer.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 239 行

JAVA
239
字号
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */
package edu.umass.cs.mallet.base.extract;

import edu.umass.cs.mallet.base.types.Label;
import edu.umass.cs.mallet.base.types.LabelAlphabet;
import edu.umass.cs.mallet.base.util.ColorUtils;

import java.io.File;
import java.io.PrintWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Diagnosis class that outputs HTML pages that allow you to view errors on a more
 *  global per-instance basis.
 * <p>
 * For one {@link Extraction} it writes <tt>index.html</tt>, three CSS files, and one
 *  <tt>extraction<i>N</i>.html</tt> page per document.  Each document page links all
 *  three stylesheets under the titles "Agreement", "Pred" and "True".
 *
 * Created: Mar 30, 2005
 *
 * @author <A HREF="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</A>
 * @version $Id: DocumentViewer.java,v 1.2 2005/04/01 01:05:04 casutton Exp $
 */
public class DocumentViewer {

  private static final String DOC_ERRS_CSS_FNAME = "docerrs.css";
  private static final String DOC_ERRS_PRED_CSS_FNAME = "docerrs-by-pred.css";
  private static final String DOC_ERRS_TRUE_CSS_FNAME = "docerrs-by-true.css";

  // Saturation passed to ColorUtils.rainbow when generating the per-field colors.
  private static final double SATURATION = 0.4;

  // Rule shared by the per-field stylesheets: makes the class legend visible as a floating box.
  private static final String CLASS_LEGEND_BOX_RULE =
    ".class_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }";

  /**
   * Two span lists held side by side.  In this class, list 0 holds the predicted
   *  spans and list 1 the true spans (see {@link #intersectSpans}).
   */
  private static class DualLabeledSpans {

    private LabeledSpans[] ls;

    DualLabeledSpans (LabeledSpans ls1, LabeledSpans ls2) {
      ls = new LabeledSpans[] { ls1, ls2 };
    }

    /** Number of positions; taken from the first list only. */
    int size () { return ls[0].size(); }

    /**
     * @param t position within the list
     * @param i which list: 0 for the first, 1 for the second
     */
    LabeledSpan get (int t, int i) { return ls[i].getLabeledSpan (t); }
  }

  /**
   * Writes several HTML files describing a given extraction.  Each HTML file shows an entire
   *  document, with the extracted fields color-coded.
   * @param directory Directory to write files to
   * @param extraction Extraction to describe
   * @throws IOException If one of the output files cannot be opened for writing
   */
  public static void writeExtraction (File directory, Extraction extraction) throws IOException
  {
    outputIndex (directory, extraction);
    outputStylesheets (directory, extraction);
    outputDocuments (directory, extraction);
  }

  /** Opens a writer on the file <tt>fname</tt> inside <tt>directory</tt>. */
  private static PrintWriter openWriter (File directory, String fname) throws IOException
  {
    return new PrintWriter (new FileWriter (new File (directory, fname)));
  }

  /**
   * Escapes the characters that are significant in HTML element content.
   *  The ampersand is handled first so that the entities produced for
   *  &lt; and &gt; are not escaped a second time.
   * @return the escaped string, or null if <tt>s</tt> is null
   */
  private static String escapeHtml (String s)
  {
    if (s == null) return null;
    s = s.replaceAll ("&", "&amp;");
    s = s.replaceAll ("<", "&lt;");
    s = s.replaceAll (">", "&gt;");
    return s;
  }

  /**
   * Writes the three stylesheets: the agreement sheet (correct / wrong / false negative /
   *  false positive), and one sheet each that colors spans by predicted and by true field.
   */
  private static void outputStylesheets (File directory, Extraction extraction) throws IOException
  {
    // ERRS css
    PrintWriter out = openWriter (directory, DOC_ERRS_CSS_FNAME);
    try {
      out.println (".tf_legend { border-style: dashed; border-width: 2px; padding: 10px; padding-top: 0ex; float: right; margin:2em; }");
      out.println (".class_legend { visibility: hidden; }");
      out.println (".correct { background-color:#33FF33; }");
      out.println (".wrong { background-color:pink }");
      out.println (".true { background-color:#99CCFF; }");
      out.println (".pred { background-color:#FFFF66 }");
    } finally {
      out.close ();
    }

    LabelAlphabet dict = extraction.getLabelAlphabet ();
    String[] fields = determineFieldNames (dict);
    String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);

    // PRED css
    outputFieldStylesheet (directory, DOC_ERRS_PRED_CSS_FNAME, "pred_", fields, colors);
    // TRUE css
    outputFieldStylesheet (directory, DOC_ERRS_TRUE_CSS_FNAME, "true_", fields, colors);
  }

  /**
   * Writes a stylesheet that shows the class legend, hides the right/wrong legend, and
   *  assigns <tt>colors[i]</tt> as background to the CSS class <tt>classPrefix+fields[i]</tt>.
   */
  private static void outputFieldStylesheet (File directory, String fname, String classPrefix,
                                             String[] fields, String[] colors) throws IOException
  {
    PrintWriter out = openWriter (directory, fname);
    try {
      out.println (CLASS_LEGEND_BOX_RULE);
      out.println (".tf_legend { visibility: hidden; }");
      for (int i = 0; i < fields.length; i++) {
        out.println ("."+classPrefix+fields[i]+" { background-color:"+colors[i]+"; }");
      }
    } finally {
      out.close ();
    }
  }

  /** Writes <tt>extraction<i>i</i>.html</tt> for every document in the extraction. */
  private static void outputDocuments (File directory, Extraction extraction) throws IOException
  {
    for (int i = 0; i < extraction.getNumDocuments (); i++) {
      PrintWriter out = openWriter (directory, "extraction"+i+".html");
      try {
        outputOneDocument (out, extraction.getDocumentExtraction (i));
      } finally {
        // Release the file even if rendering this document fails.
        out.close ();
      }
    }
  }

  /**
   * Writes one complete HTML page for a document: header, both legends, and then the
   *  document text piece by piece, each non-background piece wrapped in three nested
   *  SPANs (predicted field, true field, agreement class).
   */
  private static void outputOneDocument (PrintWriter out, DocumentExtraction docExtr)
  {
    String name = docExtr.getName ();
    out.println ("<HTML><HEAD><TITLE>"+escapeHtml (name)+": Extraction from Document</TITLE>");
    out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_CSS_FNAME+"\" title=\"Agreement\" />");
    out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_PRED_CSS_FNAME+"\" title=\"Pred\" />");
    out.println ("<LINK REL=\"stylesheet\" TYPE=\"text/css\" HREF=\""+DOC_ERRS_TRUE_CSS_FNAME+"\" title=\"True\" />");
    out.println ("</HEAD><BODY>");

    // The label alphabet is reached through the first extracted span, so the class
    // legend can only be produced when the document has at least one span.
    LabeledSpans extracted = docExtr.getExtractedSpans ();
    if (extracted.size () > 0) {
      outputClassLegend (out, extracted.getLabeledSpan (0).getLabel ().getLabelAlphabet ());
    }
    outputRightWrongLegend (out);

    DualLabeledSpans spans = intersectSpans (docExtr);
    for (int i = 0; i < spans.size(); i++) {
      LabeledSpan predSpan = spans.get (i, 0);
      LabeledSpan trueSpan = spans.get (i, 1);
      Label predLabel = predSpan.getLabel ();
      Label trueLabel = trueSpan.getLabel ();
      boolean predNonBgrnd = !predSpan.isBackground ();
      boolean trueNonBgrnd = !trueSpan.isBackground ();

      // spanClass stays null exactly when both sides are background.
      String spanClass = null;
      if (predNonBgrnd && trueNonBgrnd) {
        spanClass = (predLabel == trueLabel) ? "correct" : "wrong";
      } else if (predNonBgrnd) {
        spanClass = "pred";
      } else if (trueNonBgrnd) {
        spanClass = "true";
      }

      if (spanClass != null) {
        out.print ("<SPAN CLASS=\"pred_"+predLabel+"\">");
        out.print ("<SPAN CLASS=\"true_"+trueLabel+"\">");
        out.print ("<SPAN CLASS=\""+spanClass+"\">");
      }

      String text = escapeHtml (predSpan.getSpan ().getText ());
      text = text.replaceAll ("\n", "\n<P>");
      out.print (text);

      if (spanClass != null) {
        out.print ("</SPAN>");
        out.print ("</SPAN></SPAN>");
      }
      out.println ();
    }

    out.println ("</BODY></HTML>");
  }

  /** Writes the legend explaining the four agreement classes. */
  private static void outputRightWrongLegend (PrintWriter out)
  {
    out.println ("<DIV CLASS=\"tf_legend\"><B>LEGEND</B><BR>");
    out.println ("<SPAN CLASS='correct'>Correct</SPAN><BR />");
    out.println ("<SPAN CLASS='wrong'>Wrong</SPAN><BR />");
    out.println ("<SPAN CLASS='true'>False Negative</SPAN> (True field but predicted background)<BR />");
    out.println ("<SPAN CLASS='pred'>False Positive</SPAN> (True background but predicted field)<BR />");
    out.println ("</DIV>");
  }

  /**
   * Writes the legend listing every field name on its color.  Colors are generated the
   *  same way as in {@link #outputStylesheets}, from the field names of <tt>dict</tt>.
   */
  private static void outputClassLegend (PrintWriter out, LabelAlphabet dict)
  {
    out.println ("<DIV CLASS=\"class_legend\">");
    out.println ("<H4>LEGEND</H4>");
    String[] fields = determineFieldNames (dict);
    String[] colors = ColorUtils.rainbow (fields.length, (float) SATURATION, 1);
    for (int i = 0; i < fields.length; i++) {
      out.println ("<SPAN STYLE=\"background-color:"+colors[i]+"\">"+fields[i]+"</SPAN><BR />");
    }
    out.println ("</DIV>");
  }

  /**
   * Returns the names of all labels in <tt>dict</tt>, in alphabet order, except those
   *  whose name starts with "B-" or "I-".
   */
  private static String[] determineFieldNames (LabelAlphabet dict)
  {
    List l = new ArrayList ();
    for (int i = 0; i < dict.size (); i++) {
      String lname = dict.lookupLabel (i).toString ();
      if (!lname.startsWith ("B-") && !lname.startsWith ("I-")) {
        l.add (lname);
      }
    }
    return (String[]) l.toArray (new String [l.size ()]);
  }

  /**
   * Walks the predicted and the target spans of a document in parallel and produces two
   *  equally long lists: at each step the current predicted span and the current true
   *  span are intersected with each other, and whichever of the two ends first is
   *  advanced (both, when they end at the same index).  The walk stops as soon as either
   *  input list is exhausted.
   * <p>
   * NOTE(review): this presumably relies on both span lists covering the same document
   *  range in order, so that the current pair always overlaps -- confirm against
   *  LabeledSpans / Span.intersection.
   *
   * @return the pair (predicted pieces, true pieces)
   */
  private static DualLabeledSpans intersectSpans (DocumentExtraction docExtr)
  {
    int predIdx = 0;
    int trueIdx = 0;
    LabeledSpans trueSpans = docExtr.getTargetSpans ();
    LabeledSpans predSpans = docExtr.getExtractedSpans ();
    LabeledSpans retPredSpans = new LabeledSpans (predSpans.getDocument ());
    LabeledSpans retTrueSpans = new LabeledSpans (predSpans.getDocument ());

    while ((predIdx < predSpans.size()) && (trueIdx < trueSpans.size ())) {
      LabeledSpan predSpan = predSpans.getLabeledSpan (predIdx);
      LabeledSpan trueSpan = trueSpans.getLabeledSpan (trueIdx);

      // Each side is intersected from its own span, so each piece comes from its own side.
      LabeledSpan newPredSpan = (LabeledSpan) predSpan.intersection (trueSpan);
      LabeledSpan newTrueSpan = (LabeledSpan) trueSpan.intersection (predSpan);
      retPredSpans.add (newPredSpan);
      retTrueSpans.add (newTrueSpan);

      if (predSpan.getEndIdx () <= trueSpan.getEndIdx ()) {
        predIdx++;
      }
      if (trueSpan.getEndIdx () <= predSpan.getEndIdx ()) {
        trueIdx++;
      }
    }

    assert (retPredSpans.size() == retTrueSpans.size());
    return new DualLabeledSpans (retPredSpans, retTrueSpans);
  }

  /** Writes <tt>index.html</tt>, an ordered list linking to every document page by name. */
  private static void outputIndex (File directory, Extraction extraction) throws IOException
  {
    PrintWriter out = openWriter (directory, "index.html");
    try {
      out.println ("<HTML><HEAD><TITLE>Extraction Results</TITLE></HEAD><BODY><OL>");
      for (int i = 0; i < extraction.getNumDocuments(); i++) {
        String name = extraction.getDocumentExtraction (i).getName ();
        out.println ("  <LI><A HREF=\"extraction"+i+".html\">"+escapeHtml (name)+"</A></LI>");
      }
      out.println ("</OL></BODY></HTML>");
    } finally {
      out.close ();
    }
  }
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?