clusterpapersandvenues.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 716 行 · 第 1/2 页
JAVA
716 行
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).http://www.cs.umass.edu/~mccallum/malletThis software is provided under the terms of the Common Public License,version 1.0, as published by http://www.opensource.org.  For furtherinformation, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.classify.*;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.util.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.pipe.tsf.*;import java.io.*;import java.util.*;import java.util.logging.*;import java.util.regex.*;import java.lang.reflect.Array;import com.wcohen.secondstring.*;import com.wcohen.secondstring.tokens.NGramTokenizer;import com.wcohen.secondstring.tokens.SimpleTokenizer;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.ExactFieldMatchPipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.PageMatchPipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.YearsWithinFivePipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.FieldStringDistancePipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.*;import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.*;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEEvaluator;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.CRFIO;//import edu.umass.cs.mallet.users.hay.canopy.Util;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructor;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructorSimple;//import edu.umass.cs.mallet.users.hay.canopy.IndexFiles;//import edu.umass.cs.mallet.users.hay.canopy.CanopyMaker;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructorAuthDateTitle;//import salvo.jesus.graph.WeightedGraph;//import org.apache.lucene.analysis.Analyzer;//import org.apache.lucene.analysis.SimpleAnalyzer;/** Clusters papers and venues jointly using MultipleCorefCluterer */public class ClusterPapersAndVenues{	private static String[] SEPERATOR = new String[] {"<NEW_HEADER>", "<NEWREFERENCE>"};	private static CRF crf = null;	private static Pipe pipe;	private static IEInterface ieInterface;	private static IEInterface ieInterface1;	private static IEInterface ieInterface2;	private static IEInterface ieInterface3;	private static IEInterface ieInterface4;	private static StringDistance softtfidfPaper;	private static StringDistance tfidfPaper;	private static StringDistance tfidfVenue;	private static Jaccard distanceMetricEditDistPaper;	private static StringDistance triGramDistanceMetricPaper;	private static StringDistance triGramDistanceMetricVenue;	static CommandOption.Boolean fullPartition = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "full-partition", "FILENAME", false, false,	 "Use full partitioninig", null);	static CommandOption.Boolean useWeightedAvg = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-weighted-avg", "FILENAME", false, false,	 "Use weighted average", null);	static CommandOption.String loadMEFile = new CommandOption.String	(ClusterPapersAndVenues.class, "load-me-file", "FILENAME", true, null,	 "The name of the MaxEnt model file.", null);	static CommandOption.String outputFile = new CommandOption.String	(ClusterPapersAndVenues.class, "output-file", "FILENAME", true, null,	 "The name of the file where output clusters will be printed to.", null);	static CommandOption.String crfInputFile = new CommandOption.String	(ClusterPapersAndVenues.class, "crf-input-file", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.String crfInputFile1 = new CommandOption.String	(ClusterPapersAndVenues.class, "crf-input-file-1", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.String crfInputFile2 = new CommandOption.String	(ClusterPapersAndVenues.class, "crf-input-file-2", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.String crfInputFile3 = new CommandOption.String	(ClusterPapersAndVenues.class, "crf-input-file-3", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.String crfInputFile4 = new CommandOption.String	(ClusterPapersAndVenues.class, "crf-input-file-4", "FILENAME", true, null,	 "The name of the file to read the trained CRF for testing.", null);	static CommandOption.Boolean useCRF = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-crf", "BOOL", false, false,	 "Use CRF or not.", null);	static CommandOption.Boolean useMultipleCRFs = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-multiple-crfs", "BOOL", false, false,	 "Use a separate crf for each data segment or not.", null);	static CommandOption.Boolean useTreeModel = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-tree-model", "BOOL", false, false,	 "Use and train tree model.", null);	static CommandOption.Boolean useCorrelational = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-correlational", "BOOL", false, false,	 "Use Correlational Clustering or not, if not uses Greedy.", null);	static CommandOption.Boolean useFeatureInduction = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-feature-induction", "BOOL", false, false,	 "Use Feature Induction or Not.", null);	static CommandOption.Boolean useNBest = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-n-best", "BOOL", false, false,	 "Use NBest or not.", null);	static CommandOption.Boolean useTrueNumClusters = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-true-num-clusters", "BOOL", false, false,	 "Use NBest or not.", null);	static CommandOption.Boolean useOptimal = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "use-optimal", "BOOL", false, false,	 "Use NBest or not.", null);	static CommandOption.Integer optimalNBest = new CommandOption.Integer	(ClusterPapersAndVenues.class, "optimal-n-best", "INTEGER", true, -1,	 "Size of n, for searching for optimal n-best configuration.", null);	static CommandOption.Integer rBeamSize = new CommandOption.Integer	(ClusterPapersAndVenues.class, "r-beam-size", "INTEGER", true, 10,	 "Size of n, for searching for optimal n-best configuration.", null);	static CommandOption.String trainingDir1 = new CommandOption.String	(ClusterPapersAndVenues.class, "training-dir-1", "FILENAME", true, null,	 "Directory containing training files.", null);	static CommandOption.String trainingDir2 = new CommandOption.String	(ClusterPapersAndVenues.class, "training-dir-2", "FILENAME", true, null,	 "Directory containing training files.", null);	static CommandOption.String trainingDir3 = new CommandOption.String	(ClusterPapersAndVenues.class, "training-dir-3", "FILENAME", true, null,	 "Directory containing training files.", null);		static CommandOption.String testingDir = new CommandOption.String	(ClusterPapersAndVenues.class, "testing-dir", "FILENAME", true, null,	 "Directory containing testing files.", null);	static CommandOption.Integer searchIters = new CommandOption.Integer	(ClusterPapersAndVenues.class, "search-iters", "INTEGER", true, 3,	 "Number of search iterations.", null);	static CommandOption.Integer searchReductions = new CommandOption.Integer	(ClusterPapersAndVenues.class, "search-reductions", "INTEGER", true, 5,	 "Number of search reductions.", null);	static CommandOption.Integer numNBest = new CommandOption.Integer	(ClusterPapersAndVenues.class, "num-n-best", "INTEGER", true, 3,	 "Number of n-best candidates to store.", null);	static CommandOption.Integer nthViterbi = new CommandOption.Integer	(ClusterPapersAndVenues.class, "nth-viterbi", "INTEGER", true, 0,	 "Number of n-best candidates to use .", null);	static CommandOption.Boolean trainUsingLabeled = new CommandOption.Boolean	(ClusterPapersAndVenues.class, "train-using-labeled", "BOOL", true, false,	 "Train just using the labeled data, but test on CRF output", null);	static final CommandOption.List commandOptions = 	new CommandOption.List (		"Training, testing and running information extraction on paper header or reference.",		new CommandOption[] {			useWeightedAvg,			trainUsingLabeled,			rBeamSize,			loadMEFile,			useTreeModel,			fullPartition,			outputFile,			useOptimal,			crfInputFile,			crfInputFile1,			crfInputFile2,			crfInputFile3,			crfInputFile4,			useCRF,			useMultipleCRFs,			useFeatureInduction,			useCorrelational,						useNBest,			optimalNBest,			useTrueNumClusters,			trainingDir1,			trainingDir2,			trainingDir3,						testingDir,			searchIters,			searchReductions,			numNBest,			nthViterbi		});	private static Logger logger = MalletLogger.getLogger (ClusterPapersAndVenues.class.getName());		public static void main (String[] args) throws FileNotFoundException	{		commandOptions.process (args);		commandOptions.logOptions (logger);		boolean newCluster = true;		loadCRFs();		// create paper nodes		logger.info ("Creating Paper Nodes");		ArrayList[] paperTrainingNodes = createNodesFromTraining (CitationUtils.PAPER);		ArrayList paperTestingNodes = createNodesFromTesting (CitationUtils.PAPER);		ArrayList allPaperTrainingNodes = new ArrayList ();		for (int i=0; i < paperTrainingNodes.length; i++)			allPaperTrainingNodes.addAll (paperTrainingNodes[i]);				System.out.println("finished computing nodes for PAPER, about to compute distanceMetric params ");		triGramDistanceMetricPaper = getDistanceMetric (allPaperTrainingNodes);		AbstractStatisticalTokenDistance distanceMetricPaper =	    (AbstractStatisticalTokenDistance)CitationUtils.computeDistanceMetric (allPaperTrainingNodes);		Pipe paperPipe = getPaperPipe(distanceMetricPaper, triGramDistanceMetricPaper);		InstanceList paperTraining = getTrainingList (paperTrainingNodes, paperPipe);				InstanceList paperTesting = CitationUtils.makePairs(paperPipe, paperTestingNodes);		//Collection paperKey = CitationUtils.makeCollections(allPaperTrainingNodes); // make key collections		Collection paperTestKey = CitationUtils.makeCollections(paperTestingNodes);		// create venue nodes		logger.info ("Creating Venue Nodes");		ArrayList[] venueTrainingNodes = createNodesFromTraining (CitationUtils.VENUE);		ArrayList venueTestingNodes = createNodesFromTesting (CitationUtils.VENUE);		ArrayList allVenueTrainingNodes = new ArrayList ();		for (int i=0; i < venueTrainingNodes.length; i++)			allVenueTrainingNodes.addAll (venueTrainingNodes[i]);				System.out.println("finished computing nodes for VENUE, about to compute distanceMetric params ");		triGramDistanceMetricVenue = getDistanceMetric (allVenueTrainingNodes);		AbstractStatisticalTokenDistance distanceMetricVenue =	    (AbstractStatisticalTokenDistance)CitationUtils.computeDistanceMetric (allVenueTrainingNodes);				Pipe venuePipe = getVenuePipe(distanceMetricVenue, triGramDistanceMetricVenue);		InstanceList venueTraining = getTrainingList (venueTrainingNodes, venuePipe);				InstanceList venueTesting = CitationUtils.makePairs(venuePipe, venueTestingNodes);		//Collection venueKey = CitationUtils.makeCollections(allVenueTrainingNodes); // make key collections		Collection venueTestKey = CitationUtils.makeCollections(venueTestingNodes);								FeatureInducer fi = null;		/*		// try doing some feature induction now		if (useFeatureInduction.value()) {	    RankedFeatureVector.Factory gainFactory = null;	    gainFactory = new InfoGain.Factory();	    fi = new FeatureInducer (gainFactory,															 paperTraining, 10);	    fi.induceFeaturesFor(paperTraining, false, false);		}		*/		TreeModel tmodel = null;		if (useTreeModel.value()) {			throw new UnsupportedOperationException ("Tree model not supported yet.");	    /*if (pubs2 != null && pubs3 != null) {				tmodel = new TreeModel(paperInstancePipe, paperTrainingNodes[0], paperTrainingNodes[1], paperTrainingNodes[2], pubs1, pubs3, pubs3);	    }	    else {				tmodel = new TreeModel(instancePipe, nodes1, pubs1);				}*/	    //tmodel.setMultiTree (true);		}		/*		if (useFeatureInduction.value()) {	    System.out.println("\n\nINDUCING FEATURES FOR TEST INSTANCES");	    fi.induceFeaturesFor(paperTesting, false, false);		}		*/		MultipleCorefClusterer cl = null;		MultipleCorefClusterer paperCl = null;		MultipleCorefClusterer venueCl = null;				if (newCluster) {			cl = new MultipleCorefClusterer(new Pipe[] {paperPipe, venuePipe});			paperCl = new MultipleCorefClusterer(new Pipe[] {paperPipe});			venueCl = new MultipleCorefClusterer(new Pipe[] {venuePipe});			initializeClusterer (cl);			initializeClusterer (paperCl);			initializeClusterer (venueCl);	    if (loadMEFile.value() != null) {				throw new UnsupportedOperationException ("Loading MaxEnt not implemented yet");				//cl.loadME(loadMEFile.value());			}	    else {				cl.train(new InstanceList[] {paperTraining, venueTraining});				paperCl.train(new InstanceList[] {paperTraining});				venueCl.train(new InstanceList[] {venueTraining});			}	    cl.testClassifiers(new InstanceList[] {paperTesting, venueTesting});	    paperCl.testClassifiers(new InstanceList[] {paperTesting});	    venueCl.testClassifiers(new InstanceList[] {venueTesting});		}		Collection[] testKeys = new Collection[] {paperTestKey, venueTestKey};		Collection[] paperTestKeys = new Collection[] {paperTestKey};		Collection[] venueTestKeys = new Collection[] {venueTestKey};		// xxx keyPartitioning not implemented correctly in MultipleCorefClusterer		cl.setKeyPartitioning (paperTestKey);		if (newCluster) {	    Collection[] testS = cl.clusterMentions(new InstanceList[] {paperTesting, venueTesting},																							new List[] {paperTestingNodes, venueTestingNodes},																							-1, useCorrelational.value());			logger.info ("Evaluating " + testS.length + " type(s) of clusterings");			for (int ti=0; ti < testS.length; ti++) {				CitationUtils.evaluateClustering (testKeys[ti], testS[ti],  String.valueOf(ti) +																					" JOINT COREFERENCE RESULTS");				if (outputFile.value() != null)					printClustersToFile (testS[ti], outputFile.value() + "_" + String.valueOf(ti));			}	    Collection[] paperTestS = paperCl.clusterMentions(new InstanceList[] {paperTesting},																							new List[] {paperTestingNodes},																							-1, useCorrelational.value());				    Collection[] venueTestS = venueCl.clusterMentions(new InstanceList[] {venueTesting},																							new List[] {venueTestingNodes},																							-1, useCorrelational.value());			CitationUtils.evaluateClustering (paperTestKeys[0], paperTestS[0], "SOLO PAPER COREFERENCE RESULTS");			CitationUtils.evaluateClustering (venueTestKeys[0], venueTestS[0], "SOLO VENUE COREFERENCE RESULTS");								}				}	private static void initializeClusterer (MultipleCorefClusterer cl) {		cl.setTrueNumStop (useTrueNumClusters.value());		cl.setConfWeightedScores(useWeightedAvg.value());		cl.setOptimality (useOptimal.value());		cl.setRBeamSize (rBeamSize.value());		cl.setNBestInference (useNBest.value()); // actually use n-best list in		//coref		cl.setFullPartition(fullPartition.value());		int si = searchIters.value();		int	sd = searchReductions.value();		cl.setSearchParams (si, sd);	}	private static void loadCRFs () {		if (useCRF.value() == true) {	    if (useMultipleCRFs.value() == true) {				System.out.println("Initializing CRF");				File crfFile1 = new File(crfInputFile1.value());
clusterpapersandvenues.java - 源码说明

本页面展示了「mallet是自然语言处理、机器学习领域的一个开源项目。」中的 clusterpapersandvenues.java 源码文件，采用 Java 编程语言编写，共 716 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与mallet相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?