
📄 citationutils.java

📁 Java source from the MALLET toolkit: utilities for citation coreference. There is a lot of content here; take your time working through it.
💻 JAVA
package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;

import edu.umass.cs.mallet.base.types.*;
import edu.umass.cs.mallet.base.classify.*;
import edu.umass.cs.mallet.base.fst.*;
import edu.umass.cs.mallet.base.util.*;
import edu.umass.cs.mallet.base.pipe.*;
import edu.umass.cs.mallet.base.pipe.iterator.*;
import edu.umass.cs.mallet.base.pipe.tsf.*;

import java.io.*;
import java.util.*;
import java.util.logging.*;
import java.util.regex.*;
import java.lang.reflect.Array;

import com.wcohen.secondstring.*;
import com.wcohen.secondstring.tokens.NGramTokenizer;
import com.wcohen.secondstring.tokens.SimpleTokenizer;

import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.ExactFieldMatchPipe;
import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.PageMatchPipe;
import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.YearsWithinFivePipe;
import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.FieldStringDistancePipe;
import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.*;
import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;
import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEEvaluator;
import edu.umass.cs.mallet.projects.seg_plus_coref.ie.CRFIO;

public class CitationUtils {

	public final static String PAPER = "PAPER";
	public final static String VENUE = "VENUE";
	public final static String AUTHOR = "AUTHOR";

	public static String[] SEPERATOR = new String[] {"<NEW_HEADER>", "<NEWREFERENCE>"};

	public static ArrayList computeNodes(ArrayList trainFileArray, IEInterface ieInterface,
	                                     boolean useCRFLocal,
	                                     int numNBest, int nthViterbi) {
		return computeNodes(trainFileArray, ieInterface, useCRFLocal, numNBest, nthViterbi, PAPER);
	}

	public static void addPaperFromLine (String str, ArrayList nodes,
	                                     IEInterface ieInterface,
	                                     boolean useCRFLocal,
	                                     int numNBest, int nthViterbi) {
		// xxx why is label made fileID only to be overwritten?
		// does code require that files correspond to clusters?!?
		// <meta reference_no="10" cluster_no="2"></meta>
		HashMap clusterAttributes = SGMLStringOperation.locateAttributes ("meta", str);
		if (clusterAttributes.size() != 2)
			throw new IllegalArgumentException ("Reference has no paper label tag: " + str);
		String label = (String)clusterAttributes.get ("cluster_no");
		Integer id = new Integer ((String)clusterAttributes.get ("reference_no"));
		String start_tag = "<meta";
		String end_tag   = "</meta>";
		str = str.substring(str.indexOf(end_tag)+end_tag.length(), str.length());
		str = str.intern();
		if (useCRFLocal) {
			nodes.add(new Citation(str, label, id.intValue(), ieInterface,
			                       numNBest, nthViterbi));
		} else {
			nodes.add(new Citation(str, label, id.intValue()));
		}
	}

	public static void addVenuesFromLine (String str, ArrayList nodes,
	                                      IEInterface ieInterface,
	                                      boolean useCRFLocal,
	                                      int numNBest, int nthViterbi) {
		HashMap clusterAttributes = SGMLStringOperation.locateAttributes ("booktitle", str);
		if (clusterAttributes.size() != 3)
			clusterAttributes = SGMLStringOperation.locateAttributes ("journal", str);
		if (clusterAttributes.size() != 3)
			return;
		String label = (String)clusterAttributes.get ("venue_cluster");
		if (label == null)
			throw new IllegalArgumentException ("bad venue line: " + str);
		Integer id = new Integer ((String)clusterAttributes.get ("venue_no"));
		String start_tag = "<meta";
		String end_tag   = "</meta>";
		str = str.substring(str.indexOf(end_tag)+end_tag.length(), str.length());
		str = str.intern();
		if (useCRFLocal) {
			nodes.add(new Citation(str, label, id.intValue(), ieInterface,
			                       numNBest, nthViterbi));
		} else {
			nodes.add(new Citation(str, label, id.intValue()));
		}
	}

	public static void addAuthorsFromLine (String str, ArrayList nodes,
	                                       IEInterface ieInterface,
	                                       boolean useCRFLocal,
	                                       int numNBest, int nthViterbi) {
	}

	/** Build nodes with labels corresponding to cluster ids. Passing
			different strings for nodeType will make node correspond to the
			paper, author, or venue, etc. Default is paper.
			@param trainFileArray list of citation files
			@param ieInterface the interface to the extraction
			@param useCRFLocal use a CRF for segmentation
			@param numNBest number of viterbi paths to use
			@param nodeType type of node to create (paper, author, venue...)
	*/
	public static ArrayList computeNodes(ArrayList trainFileArray,
	                                     IEInterface ieInterface,
	                                     boolean useCRFLocal,
	                                     int numNBest,
	                                     int nthViterbi,
	                                     String nodeType)
	{
		System.out.println("Computing nodes...");
		long timeStart = System.currentTimeMillis();
		Reader reader;
		ArrayList nodes = new ArrayList();
		HashMap hMap = new HashMap();  // keys are cluster IDs, values are publications
		int index = 0;
		for (int i = 0; i < trainFileArray.size(); i++) {
			File file = (File)trainFileArray.get(i);
			String fileID = file.toString();
			System.out.println(i + ": " + fileID);
			try {
				reader = new FileReader (file);
			} catch (Exception e) {
				throw new IllegalArgumentException ("Can't read file " + file);
			}
			LineGroupIterator lineI = new LineGroupIterator (reader, Pattern.compile(SEPERATOR[1]), true);
			while (lineI.hasNext()) {
				String str = lineI.getLineGroup();
				if (nodeType.equals (PAPER))
					addPaperFromLine (str, nodes, ieInterface, useCRFLocal, numNBest, nthViterbi);
				else if (nodeType.equals (VENUE))
					addVenuesFromLine (str, nodes, ieInterface, useCRFLocal, numNBest, nthViterbi);
				else if (nodeType.equals (AUTHOR))
					addAuthorsFromLine (str, nodes, ieInterface, useCRFLocal, numNBest, nthViterbi);
				lineI.nextLineGroup();
			}
		}
		long timeEnd = System.currentTimeMillis();
		double timeElapse = (timeEnd - timeStart) / (1000.000);
		System.out.println("Time elapses " + timeElapse + " seconds for computing " + nodes.size() + " nodes.");
		return nodes;
	}

	public static ArrayList computeNodesWPubs(ArrayList trainFileArray,
	                                          ArrayList publications,
	                                          IEInterface ieInterface,
	                                          int numNBest, int nthViterbi) {
		return computeNodesWPubs(trainFileArray, publications, ieInterface, false, numNBest, nthViterbi);
	}

	// xxx generalize this to make venues/authors objects as in
	// computeNodes above.
	protected static ArrayList computeNodesWPubs(ArrayList trainFileArray,
	                                             ArrayList publications,
	                                             IEInterface ieInterface,
	                                             boolean useCRFLocal, int numNBest,
	                                             int nthViterbi) {
		System.out.println("Computing nodes...");
		long timeStart = System.currentTimeMillis();
		Reader reader;
		ArrayList nodes = new ArrayList();
		HashMap hMap = new HashMap();  // keys are cluster IDs, values are publications
		int index = 0;
		for (int i = 0; i < trainFileArray.size(); i++) {
			File file = (File)trainFileArray.get(i);
			String fileID = file.toString();
			System.out.println(i + ": " + fileID);
			try {
				reader = new FileReader (file);
			} catch (Exception e) {
				throw new IllegalArgumentException ("Can't read file " + file);
			}
			LineGroupIterator lineI = new LineGroupIterator (reader, Pattern.compile(SEPERATOR[1]), true);
			while (lineI.hasNext()) {
				String str = lineI.getLineGroup();
				Integer id = new Integer(index++);
				String label = fileID;
				// <meta reference_no="10" cluster_no="2"></meta>
				HashMap clusterAttributes = SGMLStringOperation.locateAttributes ("meta", str);
				if (clusterAttributes.size() != 2)
					throw new IllegalArgumentException ("Reference has no cluster or reference id: " + str);
				label = (String)clusterAttributes.get ("cluster_no");
				id = new Integer ((String)clusterAttributes.get ("reference_no"));

				/*
				String s = SGMLStringOperation.locateField(start_tag, end_tag, str);
				String[] ss = s.split("\"");
				if (ss != null && ss.length == 5) {
					label = ss[3];
					label.intern();
					id = new Integer(ss[1]);
				}
				*/
				String start_tag = "<meta";
				String end_tag   = "</meta>";
				str = str.substring(str.indexOf(end_tag)+end_tag.length(), str.length());
				str = str.intern();
				//str = str.toLowerCase();
				Citation cit = null;
				if (useCRFLocal) {
					cit = new Citation(str, label, id.intValue(), ieInterface,
					                   numNBest, nthViterbi);
				} else {
					cit = new Citation(str, label, id.intValue());
				}
				nodes.add(cit);
				Publication p = (Publication)hMap.get(label); // look up publication that this belongs to
				if (p != null) {
					p.addNewCitation (cit);    // add citation to publication
				} else {
					p = new Publication (cit); // create new publication with this citation
					hMap.put(label, p);        // add publication to hash map
					publications.add(p);
				}

				lineI.nextLineGroup();
			}
		}
		long timeEnd = System.currentTimeMillis();
		double timeElapse = (timeEnd - timeStart) / (1000.000);
		System.out.println("Time elapses " + timeElapse + " seconds for computing nodes.");
		return nodes;
	}

	public static StringDistance computeDistanceMetric (ArrayList nodes) {
		ArrayList allStrings = new ArrayList();
		StringDistance tfidf = new TFIDF ();
		for (int i = 0; i < nodes.size(); i++) {
			//Citation c = (Citation)((Node)nodes.get(i)).getObject();
			Citation c = (Citation)nodes.get(i);
			allStrings.addAll(c.getAllStringsWrapped());
		}
		tfidf.accumulateStatistics(allStrings.iterator());
		return tfidf;
		//return new SoftTFIDF(tfidf);
	}

	public static void makeDistMetric(List list, StringDistance tfidf,
	                                  StringDistance triGramDistanceMetric) {
		List nodes = new ArrayList();
		Iterator iter = list.iterator();
		while (iter.hasNext()) {
			nodes.add((Citation)iter.next());
		}
		NGramTokenizer nGramTokenizer =
			new NGramTokenizer(3, 3, false, new SimpleTokenizer(true, true));
		ArrayList allStrings = new ArrayList();
		tfidf = new TFIDF ();
		//softtfidf = new SoftTFIDF(new JaroWinkler(), 0.8);
		triGramDistanceMetric = new TFIDF(nGramTokenizer);
		for (int i = 0; i < nodes.size(); i++) {
			Citation c = (Citation)nodes.get(i);
			allStrings.addAll(c.getAllStringsWrapped());
		}
		tfidf.accumulateStatistics(allStrings.iterator());
		triGramDistanceMetric.accumulateStatistics(allStrings.iterator());
		//softtfidf.accumulateStatistics(allStrings.iterator());
	}

	public static InstanceList makePairs(Pipe instancePipe, ArrayList nodes) {
		System.out.println("PairIterator...");
		long timeStart = System.currentTimeMillis();
		InstanceList ilist = new InstanceList (instancePipe);
		ilist.add (new NodePairIterator (nodes));
		System.out.println("====");
		long timeEnd = System.currentTimeMillis();
		double timeElapse = (timeEnd - timeStart) / (1000.000);
		System.out.println("Time elapses " + timeElapse + " seconds for computing pair iterator.");
		return ilist;
	}

	public static InstanceList makePairs(Pipe instancePipe, ArrayList nodes, List pairs) {
		System.out.println("PairIterator...");
		long timeStart = System.currentTimeMillis();
		InstanceList ilist = new InstanceList (instancePipe);
		ilist.add (new NodePairIterator (nodes, pairs));
		System.out.println("====");
		long timeEnd = System.currentTimeMillis();
		double timeElapse = (timeEnd - timeStart) / (1000.000);
		System.out.println("Time elapses " + timeElapse + " seconds for computing pair iterator.");
		return ilist;
	}

	// this version assumes nodes are actually citations
	public static Collection makeCollections (ArrayList nodes) {
		HashMap map = new HashMap(); // keep an index of node label values to collections
		Collection collection = new LinkedHashSet();
		for (int i = 0; i < nodes.size(); i++) {
			Citation n = (Citation)nodes.get(i);
			Object o1 = n.getLabel();
			Collection c = (Collection)map.get(o1);
			if (c != null) {
				c.add(n);
				//System.out.println("adding new node " + n + " to existing collection with " + o1);
			} else {
				Collection newC = new LinkedHashSet();
				//System.out.println("Creating new collection -> id: " + o1);
				newC.add(n);
				map.put(o1, newC);
			}
		}
		Iterator i1 = map.values().iterator();
		while (i1.hasNext()) {
			collection.add((Collection)i1.next());
		}
		return collection;
	}
}
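
For orientation, here is a minimal usage sketch of the static helpers above: load tagged reference files into Citation nodes, accumulate a TF-IDF string distance over them, and group the nodes by their gold cluster label. The demo class name, the input file path, the null ieInterface, and the N-best settings are hypothetical placeholders (ieInterface is only consulted when useCRFLocal is true); the sketch assumes the MALLET seg_plus_coref classes and the secondstring library are on the classpath.

import java.io.File;
import java.util.ArrayList;
import java.util.Collection;
import com.wcohen.secondstring.StringDistance;
import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.CitationUtils;
import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;

// Usage sketch only: the input path, class name, and parameter values below are
// placeholders, not values taken from CitationUtils itself.
public class CitationUtilsDemo {
	public static void main(String[] args) {
		ArrayList files = new ArrayList();
		files.add(new File("citations-tagged.txt"));   // hypothetical input file

		// Each reference in the input is expected to begin with a line such as
		//   <meta reference_no="10" cluster_no="2"></meta>
		// and references are separated by the <NEWREFERENCE> marker (SEPERATOR[1]).

		IEInterface ieInterface = null;                // unused when useCRFLocal == false
		ArrayList nodes = CitationUtils.computeNodes(files, ieInterface,
		                                             false /* useCRFLocal */,
		                                             1 /* numNBest */, 1 /* nthViterbi */);

		// TF-IDF statistics accumulated over every field string of every citation.
		StringDistance tfidf = CitationUtils.computeDistanceMetric(nodes);

		// Gold clustering: one Collection of Citation objects per cluster_no label.
		Collection clusters = CitationUtils.makeCollections(nodes);
		System.out.println(nodes.size() + " citations in " + clusters.size()
		                   + " clusters (tfidf metric ready: " + (tfidf != null) + ")");
	}
}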
