📄 computeupperbound1.java
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).http://www.cs.umass.edu/~mccallum/malletThis software is provided under the terms of the Common Public License,version 1.0, as published by http://www.opensource.org. For furtherinformation, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;import com.wcohen.secondstring.AbstractStringDistance;import com.wcohen.secondstring.Jaccard;import edu.umass.cs.mallet.base.fst.CRF;import edu.umass.cs.mallet.base.fst.Transducer;import edu.umass.cs.mallet.base.pipe.SerialPipes;import edu.umass.cs.mallet.base.pipe.iterator.FileIterator;import edu.umass.cs.mallet.base.pipe.iterator.LineGroupIterator;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;import java.io.*;import java.util.ArrayList;import java.util.logging.Logger;import java.util.regex.Pattern;public class ComputeUpperBound1 { String seperator = ""; private static Logger logger = Logger.getLogger(ComputeUpperBound1.class.getName()); private File crfFile; private CRF crf = null; private SerialPipes pipe; private TokenSequence tokenSequence; private Sequence viterbiSequence; private double confidence; private Transducer.ViterbiPath viterbiP; private Transducer.ViterbiPath_NBest viterbiP_NBest; private int instance_error_num = 0; private int instance_size = 0; private double instance_accuracy; private double[] instance_accuracy_nbest; boolean printFont = true; IEInterface ieInterface; InstanceList instancelist; ArrayList optimalViterbi; AbstractStringDistance nw; double default_Max_Dist = 0; double default_Ignore_Dist = 0; String[] startTags = new String[] {"<author>", "<title>", "<booktitle>", "<publisher>", "<journal>","<date>", "<location>", "<pages>", "<note>", "<institution>", "<editor>", "<volume>", "<tech>"}; String[] endTags = new String[] 
{"</author>", "</title>", "</booktitle>", "</publisher>", "</journal>", "</date>", "</location>",
            "</pages>", "</note>", "</institution>", "</editor>", "</volume>", "</tech>"};

    // One weight per tag (13 entries, same length as startTags/endTags); the second entry is 10.0.
    // NOTE(review): presumably index-aligned with startTags/endTags, i.e. the title field is
    // weighted 10x -- confirm against computeSGMLObjDistance.
    double[] tagWeight = new double[]{1.0, 10.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};

    /** Creates an instance without a CRF file ({@code crfFile} is left {@code null}). */
    public ComputeUpperBound1() {
        this.crfFile = null;
    }

    /**
     * Creates an instance bound to a trained CRF file.
     *
     * @param crfFile the serialized CRF to load later via {@link #loadCRF()}; must not be null
     *                (checked with an {@code assert}, so only enforced when assertions are enabled)
     */
    public ComputeUpperBound1(File crfFile) {
        assert (crfFile != null);
        this.crfFile = crfFile;
    }

    /**
     * Loads the CRF and its pipe from the trained {@code crfFile} through a fresh
     * {@link IEInterface}, and selects the string distance used by this class.
     *
     * @return the flag returned by {@code IEInterface.loadCRF(File)}
     */
    public boolean loadCRF() {
        ieInterface = new IEInterface(this.crfFile);
        boolean flag = ieInterface.loadCRF(crfFile);
        this.crf = ieInterface.crf;
        this.pipe = ieInterface.pipe;

        // Jaccard is the distance currently in use. Alternatives that were tried and left
        // disabled in the original source: NeedlemanWunsch (edit distance), JaroWinkler,
        // CharJaccard, JelinekMercerJS, TFIDF, Mixture, DirichletJS.
        nw = new Jaccard();

        return flag;
    }

    /**
     * Reads the citations in {@code inputFile}, computes the N-best labelings of each one with
     * the loaded CRF, and runs the exhaustive search over all combinations of those labelings.
     *
     * <p>The instances are stored in the {@code instancelist} field and the last N-best result
     * in {@code viterbiP_NBest}. An output file named {@code inputFile + "_tagged"} is created
     * (or truncated); nothing is written to it by this method.
     *
     * @param inputFile file holding the citations to label
     * @param sgml      not used by this method
     * @param seperator regular expression marking the boundary between line groups
     * @param N         number of best labelings to request per instance
     * @throws IllegalArgumentException if {@code inputFile} cannot be opened for reading
     */
    public void viterbiCRF(File inputFile, boolean sgml, String seperator, int N) {
        instancelist = new InstanceList(pipe);

        Reader reader;
        try {
            reader = new FileReader(inputFile);
        } catch (Exception e) {
            // Keep the original failure attached so the reason (missing file, permissions, ...)
            // is not lost. initCause is used because the file predates the (String, Throwable)
            // constructor.
            IllegalArgumentException failure =
                    new IllegalArgumentException("Can't read file " + inputFile);
            failure.initCause(e);
            throw failure;
        }
        try {
            // NOTE(review): the instance list is indexed immediately below, so add() is expected
            // to have consumed the iterator (and therefore the reader) by the time it returns.
            instancelist.add(new LineGroupIterator(reader, Pattern.compile(seperator), true));
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                logger.warning("Couldn't close input file '" + inputFile + "'");
            }
        }

        ArrayList nbestlists = new ArrayList(instancelist.size());
        for (int i = 0; i < instancelist.size(); i++) {
            Instance instance = instancelist.getInstance(i);
            // N-best tagging
            viterbiP_NBest = crf.viterbiPath_NBest((Sequence) instance.getData(), N);
            nbestlists.add(i, (Sequence[]) viterbiP_NBest.outputNBest());
        }

        String outputFileStr = inputFile.toString() + "_tagged";
        System.out.println(inputFile.toString() + " ---> " + outputFileStr);
        PrintStream taggedOut = null;
        try {
            FileOutputStream fos = new FileOutputStream(outputFileStr);
            taggedOut = new PrintStream(fos);
        } catch (IOException e) {
            logger.warning("Couldn't open output file '" + outputFileStr + "'");
        }
        if (taggedOut == null) {
            taggedOut = System.out;
        }

        try {
            System.out.print(nbestlists.size() + ": ");

            // Exhaustive search is used; indexListSearch_approximate(instancelist, nbestlists)
            // is the cheaper alternative. The result is not consumed here yet.
            int[] indexList = indexListSearch_exaustive(instancelist, nbestlists, N);
        } finally {
            // Release the file handle, but never close the shared System.out fallback.
            if (taggedOut != System.out) {
                taggedOut.close();
            }
        }
    }

    /**
     * Tries every combination of one labeling per instance (N choices each) and returns the
     * combination with the highest {@link #weightOfConfig} score.
     *
     * @param instancelist the instances being labeled
     * @param nbestlists   for each instance, its {@code Sequence[]} of N-best labelings
     * @param N            number of labelings considered per instance
     * @return for each instance, the index of the chosen labeling; the all-zero configuration
     *         is kept unless a strictly higher-scoring one is found
     */
    protected int[] indexListSearch_exaustive(InstanceList instancelist, ArrayList nbestlists, int N) {
        // A new int[] is already all zeros, which is the first configuration to score.
        int[] indexList = new int[instancelist.size()];

        int[] optimalIndexList = (int[]) indexList.clone();
        double highestWeight = weightOfConfig(indexList, instancelist, nbestlists);

        while (hasNextIndexList(indexList, N)) {
            indexList = nextIndexList(indexList, N);

            double weight = weightOfConfig(indexList, instancelist, nbestlists);
            if (weight > highestWeight) {
                highestWeight = weight;
                // nextIndexList mutates in place, so the best configuration must be copied.
                optimalIndexList = (int[]) indexList.clone();
            }
        }

        return optimalIndexList;
    }

    /**
     * Scores a configuration as the sum of {@code PairSimilarity} over every unordered pair of
     * instances, using the labeling selected for each instance by {@code indexList}.
     *
     * @param indexList    per-instance index into that instance's N-best array
     * @param instancelist the instances being labeled
     * @param nbestlists   for each instance, its {@code Sequence[]} of N-best labelings
     * @return the summed pairwise similarity (0 when there are fewer than two instances)
     */
    protected double weightOfConfig(int[] indexList, InstanceList instancelist, ArrayList nbestlists) {
        double weight = 0;

        for (int i = 0; i < indexList.length; i++) {
            Sequence[] lists1 = (Sequence[]) nbestlists.get(i);
            for (int j = i + 1; j < indexList.length; j++) {
                Sequence[] lists2 = (Sequence[]) nbestlists.get(j);
                double sim = PairSimilarity(lists1[indexList[i]], lists2[indexList[j]],
                        instancelist.getInstance(i), instancelist.getInstance(j));
                weight += sim;
            }
        }

        return weight;
    }

    /**
     * Tells whether {@code indexList} can still be advanced, i.e. whether at least one position
     * is below its maximum value {@code N-1}.
     */
    protected boolean hasNextIndexList(int[] indexList, int N) {
        for (int i = 0; i < indexList.length; i++) {
            if (indexList[i] < N - 1) {
                return true;
            }
        }
        return false;
    }

    /**
     * Advances {@code indexList} in place to the next configuration, odometer style: the
     * rightmost position that is not yet {@code N-1} is incremented and every position to its
     * right is reset to 0. If no position can be incremented the array is left unchanged.
     *
     * @return the same array instance that was passed in
     */
    protected int[] nextIndexList(int[] indexList, int N) {
        for (int i = indexList.length - 1; i >= 0; i--) {
            if (indexList[i] <= N - 2) {
                indexList[i]++;
                for (int j = i + 1; j <= indexList.length - 1; j++) {
                    indexList[j] = 0;
                }
                break;
            }
        }
        return indexList;
    }

    // Return type of indexListSearch_approximate; its name, parameters and body follow.
    protected int[]
indexListSearch_approximate(InstanceList instancelist, ArrayList nbestlists) { int[] indexList = new int[instancelist.size()]; System.out.println(instancelist.size()); if(instancelist.size() == 1){ indexList[0] = 0; } else if(instancelist.size() == 2){ Sequence[] lists1 = (Sequence[]) nbestlists.get(0); Sequence[] lists2 = (Sequence[]) nbestlists.get(1); double highestSimilarity = Double.NEGATIVE_INFINITY; indexList[0] = indexList[1] = 0; for(int i=0; i<lists1.length; i++){ for(int j=0; j<lists2.length; j++){ double sim = PairSimilarity(lists1[i], lists2[j], instancelist.getInstance(0), instancelist.getInstance(1)); if(sim > highestSimilarity){ highestSimilarity = sim; indexList[0] = i; indexList[1] = j; } } } } else { //process the first two citations Sequence[] lists1 = (Sequence[]) nbestlists.get(0); Sequence[] lists2 = (Sequence[]) nbestlists.get(1); double highestSimilarity = Double.NEGATIVE_INFINITY; indexList[0] = indexList[1] = 0; for(int i=0; i<lists1.length; i++){ for(int j=0; j<lists2.length; j++){ double sim = PairSimilarity(lists1[i], lists2[j], instancelist.getInstance(0), instancelist.getInstance(1) ); if(sim > highestSimilarity){ highestSimilarity = sim; indexList[0] = i; indexList[1] = j; } } } //dynamically process the rest citations for(int i=2; i<instancelist.size(); i++){ indexList[i] = 0; Sequence[] sequence_prev = (Sequence[]) nbestlists.get(i-1); Sequence[] sequence_current = (Sequence[]) nbestlists.get(i); highestSimilarity = PairSimilarity(sequence_prev[indexList[i-1]], sequence_current[0], instancelist.getInstance(i-1), instancelist.getInstance(i) ); for(int j=1; j<sequence_current.length; j++){ double sim = PairSimilarity(sequence_prev[indexList[i-1]], sequence_current[j], instancelist.getInstance(i-1), instancelist.getInstance(i) ); if(sim > highestSimilarity){ indexList[i] = j; } } } } return indexList; } protected double computeSGMLObjDistance(String string1, String string2) { double dist = 0.0; double distTemp; int usedNumFields = 0; 
int NumFields = startTags.length; double totalWeight = 0;// System.out.println(string1 + "\n" + string2); for(int i=0; i<NumFields; i++){ String[] strs1 = locateFields(startTags[i], endTags[i], string1); String[] strs2 = locateFields(startTags[i], endTags[i], string2);/* if( startTags[i].equals("<author>") ){//only use last names if(strs1 != null) for(int k=0; k<strs1.length; k++){ ArrayList namelist1 = LastName(strs1[k]); String tempStr = ""; for(int j=0; j<namelist1.size(); j++){ tempStr += (String)namelist1.get(j); if( j<namelist1.size()-1){ tempStr += " "; } } strs1[k] = tempStr;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -