📄 jointconditionalclusterertui.java
字号:
/* Copyright (C) 2002 Dept. of Computer Science, Univ. of Massachusetts, Amherst This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This program toolkit free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For more details see the GNU General Public License and the file README-LEGAL. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *//** @author Aron Culotta */package edu.umass.cs.mallet.projects.seg_plus_coref.condclust.tui;import edu.umass.cs.mallet.projects.seg_plus_coref.condclust.cluster.*;import edu.umass.cs.mallet.projects.seg_plus_coref.condclust.pipe.*;import edu.umass.cs.mallet.projects.seg_plus_coref.condclust.pipe.iterator.*;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.*;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.*;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.classify.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.util.*;import com.wcohen.secondstring.*;import com.wcohen.secondstring.tokens.NGramTokenizer;import com.wcohen.secondstring.tokens.SimpleTokenizer;import java.util.logging.*;import java.util.*;import java.util.regex.*;import java.io.*;/** Interface to train and test a ConditionalClusterer to cluster * Papers and Venues simultaneously. Uses Citeseer data. 
*/
public class JointConditionalClustererTUI {

  // Class-wide logger, obtained from MalletLogger under this class's name.
  private static Logger logger = MalletLogger.getLogger(JointConditionalClustererTUI.class.getName());

  // ---------------------------------------------------------------------------
  // Command-line options.
  //
  // Every option below is constructed with the same argument layout:
  //   (owning class, option name, argument placeholder, boolean, value, help text, null)
  // NOTE(review): the fourth argument is presumably "argument required" and the
  // fifth presumably the default value -- confirm against CommandOption. The
  // comments below only restate what is passed in.
  // All twenty options are registered in `commandOptions` further down.
  // ---------------------------------------------------------------------------

  /** Option "training-dirs": directories of citations to cluster at training time, one file per cluster. Value argument: null. */
  static CommandOption.SpacedStrings trainingDirs = new CommandOption.SpacedStrings
    (JointConditionalClustererTUI.class, "training-dirs", "DIR...", true, null,
     "The directories containing the citations to be clustered at training time. One file per cluster.", null);

  /** Option "testing-dirs": directories of citations to cluster at test time, one file per cluster. Value argument: null. */
  static CommandOption.SpacedStrings testingDirs = new CommandOption.SpacedStrings
    (JointConditionalClustererTUI.class, "testing-dirs", "DIR...", true, null,
     "The directories containing the citations to be clustered at test time. One file per cluster.", null);

  /** Option "random-order-clustering": at test time, pick the nodes to consider at random. Value argument: false. */
  static CommandOption.Boolean randomOrderClustering = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "random-order-clustering", "BOOL", false, false,
     "At test time, choose the nodes to consider at random", null);

  /** Option "sample-training-instances": generate training instances by sampling from the true clusters. Value argument: true. */
  static CommandOption.Boolean sampleTrainingInstances = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "sample-training-instances", "BOOL", false, true,
     "Generate instances by sampling from true clusters", null);

  /** Option "number-training-instances": how many training instances to sample. Value argument: 5000. */
  static CommandOption.Integer numberTrainingInstances = new CommandOption.Integer
    (JointConditionalClustererTUI.class, "number-training-instances", "INTEGER", true, 5000,
     "The number of training instances to sample", null);

  /** Option "random-seed": seed for the random number generator. Value argument: 1. */
  static CommandOption.Integer randomSeed = new CommandOption.Integer
    (JointConditionalClustererTUI.class, "random-seed", "INTEGER", true, 1,
     "Seed for random number in random order clustering", null);

  /** Option "num-random-trials": number of random trials to run. Value argument: 5. */
  static CommandOption.Integer numRandomTrials = new CommandOption.Integer
    (JointConditionalClustererTUI.class, "num-random-trials", "INTEGER", true, 5,
     "number of random trials to run", null);

  /** Option "error-analysis": print errors (false positives). Value argument: false. */
  static CommandOption.Boolean errorAnalysis = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "error-analysis", "BOOL", false, false,
     "Print errors (False positives)", null);

  /** Option "use-crf": whether to use a CRF. Value argument: false. */
  static CommandOption.Boolean useCRF = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "use-crf", "BOOL", false, false,
     "Use CRF or not.", null);

  /** Option "use-feature-induction": whether to use feature induction. Value argument: false. */
  static CommandOption.Boolean useFeatureInduction = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "use-feature-induction", "BOOL", false, false,
     "Use Feature Induction or Not.", null);

  // NOTE(review): the next four Boolean options pass `true` as the fourth
  // argument, whereas the other Boolean options in this class pass `false`.
  // Confirm that the difference is intentional.

  /** Option "use-cluster-size": add a feature for the cluster's size. Value argument: true. */
  static CommandOption.Boolean useClusterSize = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "use-cluster-size", "BOOL", true, true,
     "add feature that is cluster's size", null);

  /** Option "use-there-exists": use the thereExists pipe. Value argument: true. */
  static CommandOption.Boolean useThereExists = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "use-there-exists", "BOOL", true, true,
     "Use thereExists pipe.", null);

  /** Option "use-pairwise-classifier": use a pairwise classifier to weight edges. Value argument: true. */
  static CommandOption.Boolean usePairwiseClassifier = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "use-pairwise-classifier", "BOOL", true, true,
     "Use pairwise classifier to weight edges.", null);

  /** Option "use-cluster-homogeneity": add a within-cluster similarity feature. Value argument: true. */
  static CommandOption.Boolean useClusterHomogeneity = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "use-cluster-homogeneity", "BOOL", true, true,
     "add feature that is within-cluster similarity.", null);

  /** Option "print-input-and-target": print features and target. Value argument: false. */
  static CommandOption.Boolean printInputAndTarget = new CommandOption.Boolean
    (JointConditionalClustererTUI.class, "print-input-and-target", "BOOL", false, false,
     "Print features and target.", null);

  /** Option "crf-input-file": file from which the trained CRF is read for testing. Value argument: null. */
  static CommandOption.String crfInputFile = new CommandOption.String
    (JointConditionalClustererTUI.class, "crf-input-file", "FILENAME", true, null,
     "The name of the file to read the trained CRF for testing.", null);

  /** Option "num-n-best": number of n-best candidates to store. Value argument: 3. */
  static CommandOption.Integer numNBest = new CommandOption.Integer
    (JointConditionalClustererTUI.class, "num-n-best", "INTEGER", true, 3,
     "Number of n-best candidates to store.", null);

  /** Option "nth-viterbi": number of n-best candidates to use. Value argument: 0. */
  static CommandOption.Integer nthViterbi = new CommandOption.Integer
    (JointConditionalClustererTUI.class, "nth-viterbi", "INTEGER", true, 0,
     "Number of n-best candidates to use .", null);

  /**
   * Option "negative-cluster-threshold": decision threshold for placing a node in a cluster.
   * Per the help text, the supplied number is the opposite (negation) of the threshold,
   * because negative command-line values were reported to be troublesome. Value argument: 0.0.
   */
  // NOTE(review): the help text below misspells "threshold" as "threhold"; it is a
  // runtime string and is therefore left unchanged here.
  static CommandOption.Double negativeClusterThreshold = new CommandOption.Double
    (JointConditionalClustererTUI.class, "negative-cluster-threshold", "DECIMAL", true, 0.0,
     "Decision threhold to place a node in a cluster. Takes opposite of input because CommandOptions seem to have trouble with negative inputs", null);

  /** Option "positive-instance-ratio": ratio of positive to negative training instances. Value argument: 0.1. */
  static CommandOption.Double positiveInstanceRatio = new CommandOption.Double
    (JointConditionalClustererTUI.class, "positive-instance-ratio", "DECIMAL", true, 0.1,
     "Ratio of positive to negative training instances", null);

  /** The full set of options for this tool; {@code main} calls process(args) and logOptions(logger) on it. */
  static final CommandOption.List commandOptions = new CommandOption.List (
    "Training and testing a conditional clusterer.",
    new CommandOption[] {
      trainingDirs,
      testingDirs,
      sampleTrainingInstances,
      numberTrainingInstances,
      errorAnalysis,
      useCRF,
      useFeatureInduction,
      crfInputFile,
      numNBest,
      nthViterbi,
      negativeClusterThreshold,
      randomOrderClustering,
      randomSeed,
      numRandomTrials,
      usePairwiseClassifier,
      useThereExists,
      useClusterSize,
      useClusterHomogeneity,
      printInputAndTarget,
      positiveInstanceRatio
    });

  // Entry point. The visible statements process and log the command options and
  // obtain an IEInterface from loadIEInterface().
  // NOTE(review): this source has lost its original line breaks, so the end-of-line
  // comment "// load papers" on the line below extends to the end of that physical
  // line and hides every statement written after it there. The method also continues
  // beyond this line. The line is kept verbatim; restore the original line breaks
  // before attempting to compile.
  public static void main (String[] args) { commandOptions.process (args); commandOptions.logOptions (logger); IEInterface ieInterface = loadIEInterface (); // load papers ArrayList[] paperTrainingNodes = createNodesFromFiles (trainingDirs.value(), ieInterface, CitationUtils.PAPER); ArrayList[] paperTestingNodes = createNodesFromFiles (testingDirs.value(), ieInterface, CitationUtils.PAPER); ArrayList allPaperTrainingNodes = new ArrayList(); for (int i=0; i < paperTrainingNodes.length; i++) allPaperTrainingNodes.addAll (paperTrainingNodes[i]); ArrayList allPaperTestingNodes = new ArrayList(); for (int i=0; i < paperTestingNodes.length; i++) allPaperTestingNodes.addAll (paperTestingNodes[i]); Collection paperTrainingTruth = CitationUtils.makeCollections (allPaperTrainingNodes); Collection paperTestingTruth = CitationUtils.makeCollections (allPaperTestingNodes); // load venues ArrayList[] venueTrainingNodes = createNodesFromFiles (trainingDirs.value(), ieInterface, CitationUtils.VENUE); ArrayList[] venueTestingNodes = 
createNodesFromFiles (testingDirs.value(), ieInterface, CitationUtils.VENUE); ArrayList allVenueTrainingNodes = new ArrayList(); for (int i=0; i < venueTrainingNodes.length; i++) allVenueTrainingNodes.addAll (venueTrainingNodes[i]); ArrayList allVenueTestingNodes = new ArrayList(); for (int i=0; i < venueTestingNodes.length; i++) allVenueTestingNodes.addAll (venueTestingNodes[i]); Collection venueTrainingTruth = CitationUtils.makeCollections (allVenueTrainingNodes); Collection venueTestingTruth = CitationUtils.makeCollections (allVenueTestingNodes); // train pairwise classifiers Classifier paperPairwiseClassifier = null; Classifier venuePairwiseClassifier = null; if (usePairwiseClassifier.value()) { System.err.println ("TRAINING PAIRWISE CLASSIFIERS"); AbstractStatisticalTokenDistance distanceMetric = (AbstractStatisticalTokenDistance)CitationUtils.computeDistanceMetric (allPaperTrainingNodes); TFIDF tfidf = new TFIDF(); NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); TFIDF triGramDistanceMetric = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allPaperTrainingNodes, tfidf, triGramDistanceMetric); paperPairwiseClassifier = trainPairwiseClassifier (paperTrainingNodes, getPaperPipe(distanceMetric, triGramDistanceMetric)); venuePairwiseClassifier = trainPairwiseClassifier (venueTrainingNodes, getVenuePipe(distanceMetric, triGramDistanceMetric)); } // train solo clusterers AbstractPipeInputIterator paperInstanceIterator = new NodeClusterPairIterator (paperTrainingTruth, new java.util.Random (randomSeed.value()), positiveInstanceRatio.value(), sampleTrainingInstances.value(), numberTrainingInstances.value()); AbstractPipeInputIterator venueInstanceIterator = new NodeClusterPairIterator (venueTrainingTruth, new java.util.Random
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -