📄 ieinterface.java
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. *//** @author Fuchun Peng <a href="mailto:fuchun@cs.umass.edu">fuchun@cs.umass.edu</a> July 2003 This class provides information extraction interface to other applications */package edu.umass.cs.mallet.projects.seg_plus_coref.ie;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.pipe.tsf.*;import edu.umass.cs.mallet.base.util.*;import junit.framework.*;import java.util.Iterator;import java.util.Random;import java.util.regex.*;import java.io.*;import java.util.logging.*;import java.util.ArrayList;public class IEInterface{ String seperator = ""; private static Logger logger = Logger.getLogger(IEInterface.class.getName()); private File crfFile; public CRF crf = null; public SerialPipes pipe; private TokenSequence tokenSequence; private Sequence viterbiSequence; private double confidence; private Transducer.ViterbiPath viterbiP; private Transducer.ViterbiPath_NBest viterbiP_NBest; private int instance_error_num = 0; private int instance_size = 0; private double instance_accuracy; private double[] instance_accuracy_nbest; static boolean printFont = true; String PUNT = "[,\\.;:?!()*]"; Pattern puntPattern = Pattern.compile(PUNT); boolean ignorePunct = true; public IEInterface() { this.crfFile = null; } public IEInterface(File crfFile) { assert(crfFile != null); this.crfFile = crfFile; } public void setPipe(SerialPipes pipe) { this.pipe = pipe; } public void replacePipe(int index, Pipe p) { assert(index < pipe.size()); pipe.replacePipe(index,p); } public void printPipe() { ArrayList pipes1 = (this.pipe).getPipes(); System.out.println("pipes1"); for (int i = 0; i < pipes1.size(); i++) { System.out.print("Pipe: " + i + ": "); Pipe tempP = (Pipe) pipes1.get (i); if (tempP == null) { System.out.println("Pipe is null"); } else { String pipeName = tempP.getClass().getName(); System.out.println(pipeName); if(tempP instanceof SerialPipes){ ArrayList pipes2 = ((SerialPipes)tempP).getPipes(); for(int j=0; j<pipes2.size(); j++){ System.out.print(" Pipe: " + j + ": "); Pipe tempP2 = (Pipe) pipes2.get(j); if(tempP2 == null){ System.out.println(" Pipe is null"); } else{ String pipeName2 = tempP2.getClass().getName(); System.out.println(pipeName2); } } } } } } // load in CRF and its pipe from a trained crfFile public boolean loadCRF() { return loadCRF(crfFile); } public boolean loadCRF(File crfFile) { assert(crfFile != null); CRF crf = null; try { ObjectInputStream ois = new ObjectInputStream(new FileInputStream( crfFile )); crf = (CRF) ois.readObject(); ois.close(); } catch (IOException e) { System.err.println("Exception reading crf file: " + e); crf= null; } catch (ClassNotFoundException cnfe) { System.err.println("Cound not find class reading in object: " + cnfe); crf= null; }// crf = CRFIO.readCRF(crfFile.toString()); if(crf==null) { System.err.println("Read a null crf from file: " + crfFile); System.exit(1); } this.crf = crf; this.pipe = (SerialPipes) crf.getInputPipe(); if (this.pipe == null) { System.err.println("Get a null pipe from CRF"); System.exit(1); } //xxx print out the read-in pipes, just for debugging purpose// printPipe();// System.out.println("================= start of CRF ============");// crf.print();// System.out.println("==================end of crf =============="); //xxx logger.log(Level.INFO, "Load CRF successfully\n"); return true; } public boolean loadCRF(CRF crf) { this.crf = crf; this.pipe = (SerialPipes) crf.getInputPipe(); if (this.pipe == null) { System.err.println("Get a null pipe from CRF"); return false; } return true; } public String printResultInFormat(boolean sgml) { return printResultInFormat(sgml, viterbiSequence, tokenSequence); } public static String printResultInFormat(boolean sgml, Sequence viterbiSequence, TokenSequence tokenSequence) { String viterbiStr = ""; assert(tokenSequence != null); assert(viterbiSequence != null); assert(tokenSequence.size() == viterbiSequence.size()); String font = ""; String current_font = ""; if(sgml){ String old_tag = null; String startTag, endTag; for(int i=0; i<tokenSequence.size(); i++){ Token token = (Token)tokenSequence.getToken(i); String word = token.getText(); String tag = viterbiSequence.get(i).toString(); if(tag != old_tag){ if(old_tag != null){ endTag = "</"+old_tag+">"; viterbiStr += endTag; } startTag = "<"+tag+">"; viterbiStr += startTag; old_tag = tag; } if(token.hasProperty("FONT")){ current_font = (String)token.getProperty("FONT"); } if(!current_font.equals(font) && printFont){ viterbiStr += "<font value=\""+current_font+"\" />"; font = current_font; } viterbiStr += word; viterbiStr += " "; if(i == tokenSequence.size() - 1){ endTag = "</"+tag+">"; viterbiStr += endTag; } if(token.hasProperty("LINE_END")){ viterbiStr += "\n"; } } } else{ for(int i=0; i<tokenSequence.size(); i++){ viterbiStr += ((Token)tokenSequence.getToken(i)).getText(); viterbiStr += ": "; viterbiStr += viterbiSequence.get(i).toString(); viterbiStr += "\n"; } } return viterbiStr; } //given an input string, label it, and output in the format of inline SGML public String viterbiCRFString(String line, boolean sgml) { Instance lineCarrier = new Instance(line, null, null, null, pipe); assert(pipe != null); Instance featureCarrier = pipe.pipe(lineCarrier, 0); assert(crf != null); viterbiP = crf.viterbiPath((Sequence)featureCarrier.getData()); viterbiSequence = viterbiP.output(); //confidence = Math.exp(-viterbiP.getCost()/viterbiSequence.size()); confidence = viterbiP.getCost(); tokenSequence = (TokenSequence)featureCarrier.getSource(); assert(viterbiSequence.size() == tokenSequence.size()); return printResultInFormat(sgml); } // to use this method successfully, tokenization should use "\\w+-\\w+|\\w+|'s|``|''|\\S" pattern // or change the wordPattern in WSJPOSSentence2TokenSequence to match your tokenization pattern. // public Sequence viterbiCRFTokenSequence(TokenSequence ts) { assert(crf != null); String line = ""; for(int i=0; i<ts.size(); i++){ line += ts.getToken(i).getText()+" ";// System.out.println(i+": "+ts.getToken(i).getText()); } assert(pipe != null); Instance lineCarrier = new Instance(line, null, null, null, pipe); viterbiP = crf.viterbiPath((Sequence)lineCarrier.getData()); viterbiSequence = viterbiP.output(); confidence = Math.exp(-viterbiP.getCost()/viterbiSequence.size());// viterbiSequence = crf.viterbiPath((Sequence)lineCarrier.getData()).output();// Sequence tempTS = (Sequence)lineCarrier.getData();// for(int i=0; i<tempTS.size(); i++){// System.out.println(i+": "+tempTS.get(i).toString() + "/" + viterbiSequence.get(i).toString());// } assert(viterbiSequence.size() == ts.size()): "ts.size=" + ts.size() + " " + "viterSequence.size=" + viterbiSequence.size(); return viterbiSequence; } public double InstanceAccuracy(Sequence viterbiSequence, Instance instance) { return InstanceAccuracy(viterbiSequence, (Sequence)instance.getTarget(), instance); } public double InstanceAccuracy(Sequence viterbiSequence, Sequence targetSequence) { return InstanceAccuracy(viterbiSequence, targetSequence, null); } public double InstanceAccuracy(Sequence viterbiSequence, Sequence targetSequence, Instance instance) { assert(viterbiSequence.size() == targetSequence.size()); instance_size = viterbiSequence.size(); instance_error_num = 0; // String PUNT = "[,\\.;:?!()*]";// Pattern puntPattern = Pattern.compile(PUNT); if(instance != null) tokenSequence = (TokenSequence)instance.getSource(); int totalNum = 0; for(int i=0; i<instance_size; i++){ if(instance != null){ String tokenStr = tokenSequence.getToken(i).getText(); if(puntPattern.matcher(tokenStr).matches() && ignorePunct ){//ignore punct; continue; } } totalNum ++; String predO = viterbiSequence.get(i).toString(); String trueO = targetSequence.get(i).toString();// System.out.println(i + " " + predO + " " + trueO + "\n"); if(!predO.equals(trueO)){ instance_error_num ++; } } double accuracy = 1- (double)instance_error_num/totalNum; return accuracy; } //viterbi for a piped instance public String viterbiCRFInstance(Instance instance, boolean sgml ) { assert(crf != null); viterbiP = crf.viterbiPath((Sequence)instance.getData());// regular viterbi viterbiSequence = viterbiP.output();// confidence = Math.exp(viterbiP.getCost()/viterbiSequence.size());// confidence = viterbiP.getCost()/viterbiSequence.size(); instance_accuracy= InstanceAccuracy(viterbiSequence, (Sequence)instance.getTarget(), instance); tokenSequence = (TokenSequence)instance.getSource(); assert(viterbiSequence.size() == tokenSequence.size()); return printResultInFormat(sgml); } public String viterbiCRFInstance_NBest(Instance instance, boolean sgml, int N ) { String str = ""; assert(crf != null); tokenSequence = (TokenSequence)instance.getSource(); assert(viterbiSequence.size() == tokenSequence.size());
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -