⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 venuecoreference.java

📁 这是 MALLET(MAchine Learning for LanguagE Toolkit)工具包的 Java 实现。里面有许多内容。请大家慢慢琢磨。
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).http://www.cs.umass.edu/~mccallum/malletThis software is provided under the terms of the Common Public License,version 1.0, as published by http://www.opensource.org.  For furtherinformation, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.classify.*;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.util.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.pipe.tsf.*;import java.io.*;import java.util.*;import java.util.logging.*;import java.util.regex.*;import java.lang.reflect.Array;import com.wcohen.secondstring.*;import com.wcohen.secondstring.tokens.NGramTokenizer;import com.wcohen.secondstring.tokens.SimpleTokenizer;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.ExactFieldMatchPipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.PageMatchPipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.YearsWithinFivePipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.FieldStringDistancePipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.*;import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.*;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEEvaluator;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.CRFIO;//import edu.umass.cs.mallet.users.hay.canopy.Util;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructor;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructorSimple;//import edu.umass.cs.mallet.users.hay.canopy.IndexFiles;//import edu.umass.cs.mallet.users.hay.canopy.CanopyMaker;//import 
edu.umass.cs.mallet.users.hay.canopy.QueryConstructorAuthDateTitle;//import salvo.jesus.graph.WeightedGraph;//import org.apache.lucene.analysis.Analyzer;//import org.apache.lucene.analysis.SimpleAnalyzer;public class VenueCoreference{	private static String[] SEPERATOR = new String[] {"<NEW_HEADER>", "<NEWREFERENCE>"};	private static CRF crf = null;	private static Pipe pipe;	private static IEInterface ieInterface;	private static IEInterface ieInterface1;	private static IEInterface ieInterface2;	private static IEInterface ieInterface3;	private static IEInterface ieInterface4;	private static StringDistance softtfidf;	private static StringDistance tfidf;	private static Jaccard distanceMetricEditDist;	private static StringDistance triGramDistanceMetric;	static CommandOption.Boolean fullPartition = new CommandOption.Boolean	(TUI.class, "full-partition", "FILENAME", false, false,	 "Use full partitioninig", null);	static CommandOption.Boolean useWeightedAvg = new CommandOption.Boolean	(TUI.class, "use-weighted-avg", "FILENAME", false, false,	 "Use weighted average", null);	static CommandOption.String loadMEFile = new CommandOption.String	(TUI.class, "load-me-file", "FILENAME", true, null,	 "The name of the MaxEnt model file.", null);	static CommandOption.String outputFile = new CommandOption.String	(TUI.class, "output-file", "FILENAME", true, null,	 "The name of the file where output clusters will be printed to.", null);	static CommandOption.String crfInputFile = new CommandOption.String	(TUI.class, "crf-input-file", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.String crfInputFile1 = new CommandOption.String	(TUI.class, "crf-input-file-1", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.String crfInputFile2 = new CommandOption.String	(TUI.class, "crf-input-file-2", "FILENAME", true, null,	 "The name of the file to read the trained CRF for 
testing.", null);	static CommandOption.String crfInputFile3 = new CommandOption.String	(TUI.class, "crf-input-file-3", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.String crfInputFile4 = new CommandOption.String	(TUI.class, "crf-input-file-4", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.Boolean useCRF = new CommandOption.Boolean	(TUI.class, "use-crf", "BOOL", false, false,	 "Use CRF or not.", null);	static CommandOption.Boolean useMultipleCRFs = new CommandOption.Boolean	(TUI.class, "use-multiple-crfs", "BOOL", false, false,	 "Use a separate crf for each data segment or not.", null);	static CommandOption.Boolean useTreeModel = new CommandOption.Boolean	(TUI.class, "use-tree-model", "BOOL", false, false,	 "Use and train tree model.", null);	static CommandOption.Boolean useCorrelational = new CommandOption.Boolean	(TUI.class, "use-correlational", "BOOL", false, false,	 "Use Correlational Clustering or not, if not uses Greedy.", null);	static CommandOption.Boolean useFeatureInduction = new CommandOption.Boolean	(TUI.class, "use-feature-induction", "BOOL", false, false,	 "Use Feature Induction or Not.", null);	static CommandOption.Boolean useNBest = new CommandOption.Boolean	(TUI.class, "use-n-best", "BOOL", false, false,	 "Use NBest or not.", null);	static CommandOption.Boolean useTrueNumClusters = new CommandOption.Boolean	(TUI.class, "use-true-num-clusters", "BOOL", false, false,	 "Use NBest or not.", null);	static CommandOption.Boolean useOptimal = new CommandOption.Boolean	(TUI.class, "use-optimal", "BOOL", false, false,	 "Use NBest or not.", null);	static CommandOption.Integer optimalNBest = new CommandOption.Integer	(TUI.class, "optimal-n-best", "INTEGER", true, -1,	 "Size of n, for searching for optimal n-best configuration.", null);	static CommandOption.Integer rBeamSize = new CommandOption.Integer	(TUI.class, 
"r-beam-size", "INTEGER", true, 10,	 "Size of n, for searching for optimal n-best configuration.", null);	static CommandOption.String trainingDir1 = new CommandOption.String	(TUI.class, "training-dir-1", "FILENAME", true, null,	 "Directory containing training files.", null);	static CommandOption.String trainingDir2 = new CommandOption.String	(TUI.class, "training-dir-2", "FILENAME", true, null,	 "Directory containing training files.", null);	static CommandOption.String trainingDir3 = new CommandOption.String	(TUI.class, "training-dir-3", "FILENAME", true, null,	 "Directory containing training files.", null);		static CommandOption.String testingDir = new CommandOption.String	(TUI.class, "testing-dir", "FILENAME", true, null,	 "Directory containing testing files.", null);	static CommandOption.Integer searchIters = new CommandOption.Integer	(TUI.class, "search-iters", "INTEGER", true, 3,	 "Number of search iterations.", null);	static CommandOption.Integer searchReductions = new CommandOption.Integer	(TUI.class, "search-reductions", "INTEGER", true, 5,	 "Number of search reductions.", null);	static CommandOption.Integer numNBest = new CommandOption.Integer	(TUI.class, "num-n-best", "INTEGER", true, 3,	 "Number of n-best candidates to store.", null);	static CommandOption.Integer nthViterbi = new CommandOption.Integer	(TUI.class, "nth-viterbi", "INTEGER", true, 0,	 "Number of n-best candidates to use .", null);	static CommandOption.Boolean trainUsingLabeled = new CommandOption.Boolean	(TUI.class, "train-using-labeled", "BOOL", true, false,	 "Train just using the labeled data, but test on CRF output", null);	static final CommandOption.List commandOptions = 	new CommandOption.List (		"Training, testing and running information extraction on paper header or reference.",		new CommandOption[] {			useWeightedAvg,			trainUsingLabeled,			rBeamSize,			loadMEFile,			useTreeModel,			fullPartition,			outputFile,			useOptimal,			crfInputFile,			crfInputFile1,			crfInputFile2,			
crfInputFile3,			crfInputFile4,			useCRF,			useMultipleCRFs,			useFeatureInduction,			useCorrelational,						useNBest,			optimalNBest,			useTrueNumClusters,			trainingDir1,			trainingDir2,			trainingDir3,						testingDir,			searchIters,			searchReductions,			numNBest,			nthViterbi		});	private static Logger logger = MalletLogger.getLogger (VenueCoreference.class.getName());		/*	// this method simply places each node (citation) in a publication object	// this needs to be reworked when we consider how Publications and Citations	// actually interact - i.e. are Publications nodes in the graph - or just ciations	protected static ArrayList computePublications (ArrayList nodes) {	ArrayList pubs = new ArrayList();	for (int i=0; i<nodes.size(); i++) {	pubs.add(new Publication ((Node)nodes.get(i)));	}	return pubs;	}*/	public static void main (String[] args) throws FileNotFoundException	{		commandOptions.process (args);		commandOptions.logOptions (logger);		boolean oldCluster = false;		boolean newCluster = true;		if (useCRF.value() == true) {	    if (useMultipleCRFs.value() == true) {				System.out.println("Initializing CRF");				File crfFile1 = new File(crfInputFile1.value());				ieInterface1 = new IEInterface(crfFile1);				ieInterface1.loadCRF(crfFile1);				File crfFile2 = new File(crfInputFile2.value());				ieInterface2 = new IEInterface(crfFile2);				ieInterface2.loadCRF(crfFile2);				File crfFile3 = new File(crfInputFile3.value());				ieInterface3 = new IEInterface(crfFile3);				ieInterface3.loadCRF(crfFile3);				File crfFile4 = new File(crfInputFile4.value());				ieInterface4 = new IEInterface(crfFile4);				ieInterface4.loadCRF(crfFile4);	    } else {				File crfFile = new File(crfInputFile.value());				ieInterface = new IEInterface(crfFile);				ieInterface.loadCRF(crfFile);	    }		}		if (useNBest.value() == true) {	    System.out.println("Using n-best CRF");		}		FileIterator trainFI_1 = null;		FileIterator trainFI_2 = null;		FileIterator trainFI_3 = null;		if 
(useCRF.value() == true) {	    trainFI_1 = new FileIterator (trainingDir1.value(), new RegexFileFilter(Pattern.compile(".*")));	    if (trainingDir2.value() != null)				trainFI_2 = new FileIterator (trainingDir2.value(), new RegexFileFilter(Pattern.compile(".*")));	    if (trainingDir3.value() != null)				trainFI_3 = new FileIterator (trainingDir3.value(), new RegexFileFilter(Pattern.compile(".*")));					}		else {	    trainFI_1 = new FileIterator (trainingDir1.value(), new RegexFileFilter(Pattern.compile(".*")));	    if (trainingDir2.value() != null)				trainFI_2 = new FileIterator (trainingDir2.value(), new RegexFileFilter(Pattern.compile(".*")));	    if (trainingDir3.value() != null)				trainFI_3 = new FileIterator (trainingDir3.value(), new RegexFileFilter(Pattern.compile(".*")));					}				ArrayList trainFileArray1 = trainFI_1.getFileArray();		ArrayList pubs1 = new ArrayList();		System.out.println("Number of files 1: " + trainFileArray1.size());		//ArrayList nodes1 = computeNodesWPubs(trainFileArray1, pubs1,		//ieInterface1);		ArrayList nodes1;		if (useMultipleCRFs.value() == true) {	    if (useTreeModel.value())				nodes1 = CitationUtils.computeNodesWPubs(trainFileArray1, pubs1, ieInterface1, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value());	    else				nodes1 = CitationUtils.computeNodes(trainFileArray1, ieInterface1, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), CitationUtils.VENUE);		}																																else {	    if (useTreeModel.value())				nodes1 = CitationUtils.computeNodesWPubs(trainFileArray1, pubs1, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value());	    else				nodes1 = CitationUtils.computeNodes(trainFileArray1, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), CitationUtils.VENUE);		}		ArrayList nodes2 = null;		ArrayList nodes3 = null;		ArrayList pubs2 = null;		ArrayList pubs3 = null;		if (trainFI_2 != null) {	    ArrayList trainFileArray2 = 
trainFI_2.getFileArray();	    pubs2 = new ArrayList ();	    System.out.println("Number of files 2: " + trainFileArray2.size());	    if (useMultipleCRFs.value() == true) {				if (useTreeModel.value()) 					nodes2 = CitationUtils.computeNodesWPubs(trainFileArray2, pubs2, ieInterface2, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value());				else					nodes2 = CitationUtils.computeNodes(trainFileArray2, ieInterface2, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), CitationUtils.VENUE);	    }	    else {				if (useTreeModel.value())					nodes2 = CitationUtils.computeNodesWPubs(trainFileArray2, pubs2, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value());				else					nodes2 = CitationUtils.computeNodes(trainFileArray2, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), CitationUtils.VENUE);	    }		}		if (trainFI_3 != null) {	    ArrayList trainFileArray3 = trainFI_3.getFileArray();	    pubs3 = new ArrayList();	    System.out.println("Number of files 3: " + trainFileArray3.size());	    //nodes3 = computeNodesWPubs(trainFileArray3, pubs3, ieInterface3);	    if (useMultipleCRFs.value() == true) {				if (useTreeModel.value())					nodes3 = CitationUtils.computeNodesWPubs(trainFileArray3, pubs3, ieInterface3, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value());				else					nodes3 = CitationUtils.computeNodes(trainFileArray3, ieInterface3, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), CitationUtils.VENUE);	    }	    else {				if (useTreeModel.value())					nodes3 = CitationUtils.computeNodesWPubs(trainFileArray3, pubs3, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value());				else					nodes3 = CitationUtils.computeNodes(trainFileArray3, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), CitationUtils.VENUE);	    }	    System.out.println(" There are " + nodes3.size() + " training nodes");		}		FileIterator testFI 
= null;		if (useCRF.value() == true)	    testFI = new FileIterator (testingDir.value(), new RegexFileFilter(Pattern.compile(".*")));		else		    testFI = new FileIterator (testingDir.value(), new RegexFileFilter(Pattern.compile(".*")));				ArrayList testFileArray = testFI.getFileArray();		ArrayList testPubList = new ArrayList();		ArrayList test_nodes;		if (useMultipleCRFs.value() == true) {	    test_nodes = CitationUtils.computeNodes(testFileArray,ieInterface4, false, numNBest.value(), nthViterbi.value(), CitationUtils.VENUE);		}		else {	    if (useTreeModel.value())				test_nodes = CitationUtils.computeNodesWPubs(testFileArray, testPubList, ieInterface, useCRF.value(), numNBest.value(), nthViterbi.value());	    else				test_nodes = CitationUtils.computeNodes(testFileArray, ieInterface, useCRF.value(), numNBest.value(), nthViterbi.value(), CitationUtils.VENUE);		}		//double testingCRFscore = scoreCitations (test_nodes);		//ArrayList test_nodes = computeNodesWPubs(testFileArray, testPubList, ieInterface4);		ArrayList allnodes = new ArrayList();  // all nodes, both training and		// test				allnodes.addAll(nodes1);		if (nodes2 != null)	    allnodes.addAll(nodes2);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -