📄 ieinterface3.java
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. *//** @author Fuchun Peng <a href="mailto:fuchun@cs.umass.edu">fuchun@cs.umass.edu</a> July 2003 This class provides information extraction interface to other applications */package edu.umass.cs.mallet.projects.seg_plus_coref.ie;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.pipe.tsf.*;import edu.umass.cs.mallet.base.util.*;import junit.framework.*;import java.util.Iterator;import java.util.Random;import java.util.regex.*;import java.io.*;import java.util.logging.*;import java.util.ArrayList;public class IEInterface3{ String seperator = ""; private static Logger logger = Logger.getLogger(IEInterface3.class.getName()); private File crfFile; private CRF3 crf = null; private SerialPipes pipe; private TokenSequence tokenSequence; private Sequence viterbiSequence; private double confidence; private Transducer.ViterbiPath viterbiP; private Transducer.ViterbiPath_NBest viterbiP_NBest; private int instance_error_num = 0; private int instance_size = 0; private double instance_accuracy; boolean printFont = true; public IEInterface3() { this.crfFile = null; } public IEInterface3(File crfFile) { assert(crfFile != null); this.crfFile = crfFile; } public void setPipe(SerialPipes pipe) { this.pipe = pipe; } // load in CRF3 and its pipe from a trained crfFile public boolean loadCRF() { return loadCRF(crfFile); } public boolean loadCRF(File crfFile) { assert(crfFile != null); CRF3 crf = null; try { ObjectInputStream ois = new ObjectInputStream(new FileInputStream( crfFile )); crf = (CRF3) ois.readObject(); ois.close(); } catch (IOException e) { System.err.println("Exception reading crf file: " + e); crf= null; } catch (ClassNotFoundException cnfe) { System.err.println("Cound not find class reading in object: " + cnfe); crf= null; }// crf = CRFIO.readCRF(crfFile.toString()); if(crf==null) { System.err.println("Read a null crf from file: " + crfFile); System.exit(1); } this.crf = crf; this.pipe = (SerialPipes) crf.getInputPipe(); if (this.pipe == null) { System.err.println("Get a null pipe from CRF"); System.exit(1); } //xxx print out the read-in pipes, just for debugging purpose/* ArrayList pipes1 = (this.pipe).getPipes(); System.out.println("pipes1"); for (int i = 0; i < pipes1.size(); i++) { System.out.print("Pipe: " + i + ": "); Pipe tempP = (Pipe) pipes1.get (i); if (tempP == null) { System.out.println("Pipe is null"); } else { String pipeName = tempP.getClass().getName(); System.out.println(pipeName); if(tempP instanceof SerialPipes){ ArrayList pipes2 = ((SerialPipes)tempP).getPipes(); for(int j=0; j<pipes2.size(); j++){ System.out.print(" Pipe: " + j + ": "); Pipe tempP2 = (Pipe) pipes2.get(j); if(tempP2 == null){ System.out.println(" Pipe is null"); } else{ String pipeName2 = tempP2.getClass().getName(); System.out.println(pipeName2); } } } } }*/// System.out.println("================= start of CRF ============");// crf.print();// System.out.println("==================end of crf =============="); //xxx logger.log(Level.INFO, "Load CRF successfully\n"); return true; } public boolean loadCRF(CRF3 crf) { this.crf = crf; this.pipe = (SerialPipes) crf.getInputPipe(); if (this.pipe == null) { System.err.println("Get a null pipe from CRF"); return false; } return true; } public String printResultInFormat(boolean sgml) { String viterbiStr = ""; assert(tokenSequence != null); assert(viterbiSequence != null); assert(tokenSequence.size() == viterbiSequence.size()); String font = ""; String current_font = ""; if(sgml){ String old_tag = null; String startTag, endTag; for(int i=0; i<tokenSequence.size(); i++){ Token token = (Token)tokenSequence.getToken(i); String word = token.getText(); String tag = viterbiSequence.get(i).toString(); if(tag != old_tag){ if(old_tag != null){ endTag = "</"+old_tag+">"; viterbiStr += endTag; } startTag = "<"+tag+">"; viterbiStr += startTag; old_tag = tag; } if(token.hasProperty("FONT")){ current_font = (String)token.getProperty("FONT"); } if(!current_font.equals(font) && printFont){ viterbiStr += "<font value=\""+current_font+"\" />"; font = current_font; } viterbiStr += word; viterbiStr += " "; if(i == tokenSequence.size() - 1){ endTag = "</"+tag+">"; viterbiStr += endTag; } if(token.hasProperty("LINE_END")){ viterbiStr += "\n"; } } } else{ for(int i=0; i<tokenSequence.size(); i++){ viterbiStr += ((Token)tokenSequence.getToken(i)).getText(); viterbiStr += ": "; viterbiStr += viterbiSequence.get(i).toString(); viterbiStr += "\n"; } } return viterbiStr; } //given an input string, label it, and output in the format of inline SGML public String viterbiCRFString(String line, boolean sgml) { Instance lineCarrier = new Instance(line, null, null, null, pipe); assert(pipe != null); Instance featureCarrier = pipe.pipe(lineCarrier, 0); assert(crf != null); viterbiP = crf.viterbiPath((Sequence)featureCarrier.getData()); viterbiSequence = viterbiP.output(); //confidence = Math.exp(-viterbiP.getCost()/viterbiSequence.size()); confidence = viterbiP.getCost(); tokenSequence = (TokenSequence)featureCarrier.getSource(); assert(viterbiSequence.size() == tokenSequence.size()); return printResultInFormat(sgml); } // to use this method successfully, tokenization should use "\\w+-\\w+|\\w+|'s|``|''|\\S" pattern // or change the wordPattern in WSJPOSSentence2TokenSequence to match your tokenization pattern. // public Sequence viterbiCRFTokenSequence(TokenSequence ts) { assert(crf != null); String line = ""; for(int i=0; i<ts.size(); i++){ line += ts.getToken(i).getText()+" ";// System.out.println(i+": "+ts.getToken(i).getText()); } assert(pipe != null); Instance lineCarrier = new Instance(line, null, null, null, pipe); viterbiP = crf.viterbiPath((Sequence)lineCarrier.getData()); viterbiSequence = viterbiP.output(); confidence = Math.exp(-viterbiP.getCost()/viterbiSequence.size());// viterbiSequence = crf.viterbiPath((Sequence)lineCarrier.getData()).output();// Sequence tempTS = (Sequence)lineCarrier.getData();// for(int i=0; i<tempTS.size(); i++){// System.out.println(i+": "+tempTS.get(i).toString() + "/" + viterbiSequence.get(i).toString());// } assert(viterbiSequence.size() == ts.size()): "ts.size=" + ts.size() + " " + "viterSequence.size=" + viterbiSequence.size(); return viterbiSequence; } private double InstanceAccuracy(Sequence viterbiSequence, Sequence targetSequence) { assert(viterbiSequence.size() == targetSequence.size()); instance_size = viterbiSequence.size(); instance_error_num = 0; for(int i=0; i<instance_size; i++){ String predO = viterbiSequence.get(i).toString(); String trueO = targetSequence.get(i).toString(); if(!predO.equals(trueO)){ instance_error_num ++; } } double accuracy = (double)instance_error_num/instance_size; return accuracy; } //viterbi for a piped instance public String viterbiCRFInstance(Instance instance, boolean sgml ) { assert(crf != null); viterbiP = crf.viterbiPath((Sequence)instance.getData());// regular viterbi viterbiSequence = viterbiP.output();// confidence = Math.exp(viterbiP.getCost()/viterbiSequence.size());// confidence = viterbiP.getCost()/viterbiSequence.size(); // viterbiSequence = crf.viterbiPath((Sequence)instance.getData()).output(); instance_accuracy= InstanceAccuracy(viterbiSequence, (Sequence)instance.getTarget()); tokenSequence = (TokenSequence)instance.getSource(); assert(viterbiSequence.size() == tokenSequence.size()); return printResultInFormat(sgml); } public String viterbiCRFInstance_NBest(Instance instance, boolean sgml ) { String str = ""; assert(crf != null); tokenSequence = (TokenSequence)instance.getSource(); assert(viterbiSequence.size() == tokenSequence.size()); int N = 1; viterbiP_NBest = crf.viterbiPath_NBest((Sequence)instance.getData(), N);//n-best list Sequence[] nbestlist = viterbiP_NBest.outputNBest();/* for(int i=0; i<nbestlist.length; i++) { viterbiSequence = nbestlist[i]; // viterbiSequence = viterbiP_NBest.output(); str += "\n" + i + ": " + (viterbiP_NBest.costNBest())[i] + " : " + viterbiP_NBest.getCost() + "\n"; str += printResultInFormat(sgml); }*/ viterbiSequence = nbestlist[N-1]; str += printResultInFormat(sgml); return str; } //given an input file, label it, and output in the format of inline SGML public void viterbiCRF(File inputFile, boolean sgml, String seperator) { assert(pipe!= null); InstanceList instancelist = new InstanceList (pipe); Reader reader; try {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -