📄 clusterpapersandvenues.java
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).http://www.cs.umass.edu/~mccallum/malletThis software is provided under the terms of the Common Public License,version 1.0, as published by http://www.opensource.org. For furtherinformation, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.classify.*;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.util.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.pipe.tsf.*;import java.io.*;import java.util.*;import java.util.logging.*;import java.util.regex.*;import java.lang.reflect.Array;import com.wcohen.secondstring.*;import com.wcohen.secondstring.tokens.NGramTokenizer;import com.wcohen.secondstring.tokens.SimpleTokenizer;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.ExactFieldMatchPipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.PageMatchPipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.YearsWithinFivePipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.FieldStringDistancePipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.*;import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.*;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEEvaluator;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.CRFIO;//import edu.umass.cs.mallet.users.hay.canopy.Util;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructor;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructorSimple;//import edu.umass.cs.mallet.users.hay.canopy.IndexFiles;//import edu.umass.cs.mallet.users.hay.canopy.CanopyMaker;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructorAuthDateTitle;//import salvo.jesus.graph.WeightedGraph;//import org.apache.lucene.analysis.Analyzer;//import org.apache.lucene.analysis.SimpleAnalyzer;/** Clusters papers and venues jointly using MultipleCorefCluterer */public class ClusterPapersAndVenues{ private static String[] SEPERATOR = new String[] {"<NEW_HEADER>", "<NEWREFERENCE>"}; private static CRF crf = null; private static Pipe pipe; private static IEInterface ieInterface; private static IEInterface ieInterface1; private static IEInterface ieInterface2; private static IEInterface ieInterface3; private static IEInterface ieInterface4; private static StringDistance softtfidfPaper; private static StringDistance tfidfPaper; private static StringDistance tfidfVenue; private static Jaccard distanceMetricEditDistPaper; private static StringDistance triGramDistanceMetricPaper; private static StringDistance triGramDistanceMetricVenue; static CommandOption.Boolean fullPartition = new CommandOption.Boolean (ClusterPapersAndVenues.class, "full-partition", "FILENAME", false, false, "Use full partitioninig", null); static CommandOption.Boolean useWeightedAvg = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-weighted-avg", "FILENAME", false, false, "Use weighted average", null); static CommandOption.String loadMEFile = new CommandOption.String (ClusterPapersAndVenues.class, "load-me-file", "FILENAME", true, null, "The name of the MaxEnt model file.", null); static CommandOption.String outputFile = new CommandOption.String (ClusterPapersAndVenues.class, "output-file", "FILENAME", true, null, "The name of the file where output clusters will be printed to.", null); static CommandOption.String crfInputFile = new CommandOption.String (ClusterPapersAndVenues.class, "crf-input-file", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.String crfInputFile1 = new CommandOption.String (ClusterPapersAndVenues.class, "crf-input-file-1", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.String crfInputFile2 = new CommandOption.String (ClusterPapersAndVenues.class, "crf-input-file-2", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.String crfInputFile3 = new CommandOption.String (ClusterPapersAndVenues.class, "crf-input-file-3", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.String crfInputFile4 = new CommandOption.String (ClusterPapersAndVenues.class, "crf-input-file-4", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.Boolean useCRF = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-crf", "BOOL", false, false, "Use CRF or not.", null); static CommandOption.Boolean useMultipleCRFs = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-multiple-crfs", "BOOL", false, false, "Use a separate crf for each data segment or not.", null); static CommandOption.Boolean useTreeModel = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-tree-model", "BOOL", false, false, "Use and train tree model.", null); static CommandOption.Boolean useCorrelational = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-correlational", "BOOL", false, false, "Use Correlational Clustering or not, if not uses Greedy.", null); static CommandOption.Boolean useFeatureInduction = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-feature-induction", "BOOL", false, false, "Use Feature Induction or Not.", null); static CommandOption.Boolean useNBest = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-n-best", "BOOL", false, false, "Use NBest or not.", null); static CommandOption.Boolean useTrueNumClusters = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-true-num-clusters", "BOOL", false, false, "Use NBest or not.", null); static CommandOption.Boolean useOptimal = new CommandOption.Boolean (ClusterPapersAndVenues.class, "use-optimal", "BOOL", false, false, "Use NBest or not.", null); static CommandOption.Integer optimalNBest = new CommandOption.Integer (ClusterPapersAndVenues.class, "optimal-n-best", "INTEGER", true, -1, "Size of n, for searching for optimal n-best configuration.", null); static CommandOption.Integer rBeamSize = new CommandOption.Integer (ClusterPapersAndVenues.class, "r-beam-size", "INTEGER", true, 10, "Size of n, for searching for optimal n-best configuration.", null); static CommandOption.String trainingDir1 = new CommandOption.String (ClusterPapersAndVenues.class, "training-dir-1", "FILENAME", true, null, "Directory containing training files.", null); static CommandOption.String trainingDir2 = new CommandOption.String (ClusterPapersAndVenues.class, "training-dir-2", "FILENAME", true, null, "Directory containing training files.", null); static CommandOption.String trainingDir3 = new CommandOption.String (ClusterPapersAndVenues.class, "training-dir-3", "FILENAME", true, null, "Directory containing training files.", null); static CommandOption.String testingDir = new CommandOption.String (ClusterPapersAndVenues.class, "testing-dir", "FILENAME", true, null, "Directory containing testing files.", null); static CommandOption.Integer searchIters = new CommandOption.Integer (ClusterPapersAndVenues.class, "search-iters", "INTEGER", true, 3, "Number of search iterations.", null); static CommandOption.Integer searchReductions = new CommandOption.Integer (ClusterPapersAndVenues.class, "search-reductions", "INTEGER", true, 5, "Number of search reductions.", null); static CommandOption.Integer numNBest = new CommandOption.Integer (ClusterPapersAndVenues.class, "num-n-best", "INTEGER", true, 3, "Number of n-best candidates to store.", null); static CommandOption.Integer nthViterbi = new CommandOption.Integer (ClusterPapersAndVenues.class, "nth-viterbi", "INTEGER", true, 0, "Number of n-best candidates to use .", null); static CommandOption.Boolean trainUsingLabeled = new CommandOption.Boolean (ClusterPapersAndVenues.class, "train-using-labeled", "BOOL", true, false, "Train just using the labeled data, but test on CRF output", null); static final CommandOption.List commandOptions = new CommandOption.List ( "Training, testing and running information extraction on paper header or reference.", new CommandOption[] { useWeightedAvg, trainUsingLabeled, rBeamSize, loadMEFile, useTreeModel, fullPartition, outputFile, useOptimal, crfInputFile, crfInputFile1, crfInputFile2, crfInputFile3, crfInputFile4, useCRF, useMultipleCRFs, useFeatureInduction, useCorrelational, useNBest, optimalNBest, useTrueNumClusters, trainingDir1, trainingDir2, trainingDir3, testingDir, searchIters, searchReductions, numNBest, nthViterbi }); private static Logger logger = MalletLogger.getLogger (ClusterPapersAndVenues.class.getName()); public static void main (String[] args) throws FileNotFoundException { commandOptions.process (args); commandOptions.logOptions (logger); boolean newCluster = true; loadCRFs(); // create paper nodes logger.info ("Creating Paper Nodes"); ArrayList[] paperTrainingNodes = createNodesFromTraining (CitationUtils.PAPER); ArrayList paperTestingNodes = createNodesFromTesting (CitationUtils.PAPER); ArrayList allPaperTrainingNodes = new ArrayList (); for (int i=0; i < paperTrainingNodes.length; i++) allPaperTrainingNodes.addAll (paperTrainingNodes[i]); System.out.println("finished computing nodes for PAPER, about to compute distanceMetric params "); triGramDistanceMetricPaper = getDistanceMetric (allPaperTrainingNodes); AbstractStatisticalTokenDistance distanceMetricPaper = (AbstractStatisticalTokenDistance)CitationUtils.computeDistanceMetric (allPaperTrainingNodes); Pipe paperPipe = getPaperPipe(distanceMetricPaper, triGramDistanceMetricPaper); InstanceList paperTraining = getTrainingList (paperTrainingNodes, paperPipe); InstanceList paperTesting = CitationUtils.makePairs(paperPipe, paperTestingNodes); //Collection paperKey = CitationUtils.makeCollections(allPaperTrainingNodes); // make key collections Collection paperTestKey = CitationUtils.makeCollections(paperTestingNodes); // create venue nodes logger.info ("Creating Venue Nodes"); ArrayList[] venueTrainingNodes = createNodesFromTraining (CitationUtils.VENUE); ArrayList venueTestingNodes = createNodesFromTesting (CitationUtils.VENUE); ArrayList allVenueTrainingNodes = new ArrayList (); for (int i=0; i < venueTrainingNodes.length; i++) allVenueTrainingNodes.addAll (venueTrainingNodes[i]); System.out.println("finished computing nodes for VENUE, about to compute distanceMetric params "); triGramDistanceMetricVenue = getDistanceMetric (allVenueTrainingNodes); AbstractStatisticalTokenDistance distanceMetricVenue = (AbstractStatisticalTokenDistance)CitationUtils.computeDistanceMetric (allVenueTrainingNodes); Pipe venuePipe = getVenuePipe(distanceMetricVenue, triGramDistanceMetricVenue); InstanceList venueTraining = getTrainingList (venueTrainingNodes, venuePipe); InstanceList venueTesting = CitationUtils.makePairs(venuePipe, venueTestingNodes); //Collection venueKey = CitationUtils.makeCollections(allVenueTrainingNodes); // make key collections Collection venueTestKey = CitationUtils.makeCollections(venueTestingNodes); FeatureInducer fi = null; /* // try doing some feature induction now if (useFeatureInduction.value()) { RankedFeatureVector.Factory gainFactory = null; gainFactory = new InfoGain.Factory(); fi = new FeatureInducer (gainFactory, paperTraining, 10); fi.induceFeaturesFor(paperTraining, false, false); } */ TreeModel tmodel = null; if (useTreeModel.value()) { throw new UnsupportedOperationException ("Tree model not supported yet."); /*if (pubs2 != null && pubs3 != null) { tmodel = new TreeModel(paperInstancePipe, paperTrainingNodes[0], paperTrainingNodes[1], paperTrainingNodes[2], pubs1, pubs3, pubs3); } else { tmodel = new TreeModel(instancePipe, nodes1, pubs1); }*/ //tmodel.setMultiTree (true); } /* if (useFeatureInduction.value()) { System.out.println("\n\nINDUCING FEATURES FOR TEST INSTANCES"); fi.induceFeaturesFor(paperTesting, false, false); } */ MultipleCorefClusterer cl = null; MultipleCorefClusterer paperCl = null; MultipleCorefClusterer venueCl = null; if (newCluster) { cl = new MultipleCorefClusterer(new Pipe[] {paperPipe, venuePipe}); paperCl = new MultipleCorefClusterer(new Pipe[] {paperPipe}); venueCl = new MultipleCorefClusterer(new Pipe[] {venuePipe}); initializeClusterer (cl); initializeClusterer (paperCl); initializeClusterer (venueCl); if (loadMEFile.value() != null) { throw new UnsupportedOperationException ("Loading MaxEnt not implemented yet"); //cl.loadME(loadMEFile.value()); } else { cl.train(new InstanceList[] {paperTraining, venueTraining}); paperCl.train(new InstanceList[] {paperTraining}); venueCl.train(new InstanceList[] {venueTraining}); } cl.testClassifiers(new InstanceList[] {paperTesting, venueTesting}); paperCl.testClassifiers(new InstanceList[] {paperTesting}); venueCl.testClassifiers(new InstanceList[] {venueTesting}); } Collection[] testKeys = new Collection[] {paperTestKey, venueTestKey}; Collection[] paperTestKeys = new Collection[] {paperTestKey}; Collection[] venueTestKeys = new Collection[] {venueTestKey}; // xxx keyPartitioning not implemented correctly in MultipleCorefClusterer cl.setKeyPartitioning (paperTestKey); if (newCluster) { Collection[] testS = cl.clusterMentions(new InstanceList[] {paperTesting, venueTesting}, new List[] {paperTestingNodes, venueTestingNodes}, -1, useCorrelational.value()); logger.info ("Evaluating " + testS.length + " type(s) of clusterings"); for (int ti=0; ti < testS.length; ti++) { CitationUtils.evaluateClustering (testKeys[ti], testS[ti], String.valueOf(ti) + " JOINT COREFERENCE RESULTS"); if (outputFile.value() != null) printClustersToFile (testS[ti], outputFile.value() + "_" + String.valueOf(ti)); } Collection[] paperTestS = paperCl.clusterMentions(new InstanceList[] {paperTesting}, new List[] {paperTestingNodes}, -1, useCorrelational.value()); Collection[] venueTestS = venueCl.clusterMentions(new InstanceList[] {venueTesting}, new List[] {venueTestingNodes}, -1, useCorrelational.value()); CitationUtils.evaluateClustering (paperTestKeys[0], paperTestS[0], "SOLO PAPER COREFERENCE RESULTS"); CitationUtils.evaluateClustering (venueTestKeys[0], venueTestS[0], "SOLO VENUE COREFERENCE RESULTS"); } } private static void initializeClusterer (MultipleCorefClusterer cl) { cl.setTrueNumStop (useTrueNumClusters.value()); cl.setConfWeightedScores(useWeightedAvg.value()); cl.setOptimality (useOptimal.value()); cl.setRBeamSize (rBeamSize.value()); cl.setNBestInference (useNBest.value()); // actually use n-best list in //coref cl.setFullPartition(fullPartition.value()); int si = searchIters.value(); int sd = searchReductions.value(); cl.setSearchParams (si, sd); } private static void loadCRFs () { if (useCRF.value() == true) { if (useMultipleCRFs.value() == true) { System.out.println("Initializing CRF"); File crfFile1 = new File(crfInputFile1.value());
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -