📄 tui_corefie.java
字号:
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

package edu.umass.cs.mallet.projects.seg_plus_coref.ie;

import bsh.EvalError;
import edu.umass.cs.mallet.base.fst.*;
import edu.umass.cs.mallet.base.pipe.*;
import edu.umass.cs.mallet.base.pipe.iterator.AbstractPipeInputIterator;
import edu.umass.cs.mallet.base.pipe.iterator.LineGroupIterator;
import edu.umass.cs.mallet.base.pipe.tsf.LexiconMembership;
import edu.umass.cs.mallet.base.pipe.tsf.OffsetConjunctions;
import edu.umass.cs.mallet.base.pipe.tsf.RegexMatches;
import edu.umass.cs.mallet.base.pipe.tsf.TokenText;
import edu.umass.cs.mallet.base.types.*;
import edu.umass.cs.mallet.base.util.CharSequenceLexer;
import edu.umass.cs.mallet.base.util.CommandOption;
import edu.umass.cs.mallet.base.util.MalletLogger;
import edu.umass.cs.mallet.projects.seg_plus_coref.BaseTUICRF;
import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.LineGroupIterator2;
import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.SGML2FieldsPipe;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.Reader;
import java.util.*;
import java.util.logging.Logger;
import java.util.regex.Pattern;
import java.util.regex.Matcher;
import gnu.trove.TIntArrayList;

/**
 * Command-line driver that trains and evaluates a CRF for segmenting
 * citation references (or paper headers), using coreference-cluster
 * information as additional features (see the option-list description
 * below: "Segmenting references based on coreference information.").
 *
 * Created: May 22, 2004
 *
 * @author <A HREF="mailto:casutton@cs.umass.edu">casutton@cs.umass.edu</A>
 * @version $Id: TUI_CorefIE.java,v 1.14 2004/06/15 14:39:58 casutton Exp $
 */
public class TUI_CorefIE extends BaseTUICRF
{
  // Record separators in the input file: index 0 = headers, index 1 = references.
  // Selected at runtime by headOrRefOption (see main).
  private static String[] SEPARATORS = new String[]{"<NEW_HEADER>", "<NEWREFERENCE>"};
//  private static String[] SEPERATOR = new String[] {"<NEW_HEADER>", "^$"};

  private static final Logger logger = MalletLogger.getLogger(TUI_CorefIE.class.getName());

  // ---- Command-line options (processed in main via CommandOption.List) ----

  // Path to a previously trained CRF (used only by the commented-out CorefIE code below).
  static CommandOption.File crfInputFileOption = new CommandOption.File
    (TUI_CorefIE.class, "crf-input-file", "FILENAME", true, null,
     "The name of the file to read the trained CRF for testing.", null);

  // The data file that is split into train/test clusters each repetition.
  static CommandOption.File inputFileOption = new CommandOption.File
    (TUI_CorefIE.class, "input-file", "FILENAME", true, null,
     "The name of the file containing the testing data.", null);

  // Chooses which SEPARATORS entry delimits instances in the input file.
  static CommandOption.Integer headOrRefOption = new CommandOption.Integer
    (TUI_CorefIE.class, "head-or-ref", "INTEGER", true, 0,
     "0 for header, 1 for reference", null);

  static CommandOption.Integer nBestChoice = new CommandOption.Integer
    (TUI_CorefIE.class, "nbestchoice", "INTEGER", true, 1,
     "N for N-best", null);

  // NOTE(review): arg-name string is "INTEGER" for a boolean option — looks like a
  // copy-paste slip; affects only the usage message, not behavior.
  static CommandOption.Boolean includeBibtexLexicons = new CommandOption.Boolean
    (TUI_CorefIE.class, "include-bibtex-lexicons", "INTEGER", true, false,
     "Whether to use BibTeX lexicons from Fuchun.", null);

  static CommandOption.Boolean excludingSingletons = new CommandOption.Boolean
    (TUI_CorefIE.class, "exclude-singletons", "boolean", true, true,
     "excluding singletons.", null);

  // NOTE(review): description string "excluding singletons." appears copy-pasted
  // from the option above; this flag actually gates cluster features (see main).
  static CommandOption.Boolean useClusterFeatures = new CommandOption.Boolean
    (TUI_CorefIE.class, "use-cluster-features", "boolean", true, true,
     "excluding singletons.", null);

  static CommandOption.Boolean useNegativeClusterFeatures = new CommandOption.Boolean
    (TUI_CorefIE.class, "use-negative-cluster-features", "boolean", true, false,
     "Whether to use features that say words AREN'T tagged in cluster.", null);

  static CommandOption.Boolean useNumClusterOccurences = new CommandOption.Boolean
    (TUI_CorefIE.class, "use-cluster-occurrences", "boolean", true, false,
     "Whether to use number of tagged word occurrences in cluster as features.", null);

  // "Bogus" features come from the instance's gold segmentation — an oracle/diagnostic mode.
  static CommandOption.Boolean useBogusClusterFeatures = new CommandOption.Boolean
    (TUI_CorefIE.class, "use-bogus-cluster-features", "boolean", true, false,
     "If true, use features from the instance's true segmentation.", null);

  static CommandOption.Boolean useSparseWeights = new CommandOption.Boolean
    (TUI_CorefIE.class, "use-sparse-weights", "boolean", true, false,
     "If true, use only input features that appear in training set.", null);

  static CommandOption.Integer clusterFeatureMinimum = new CommandOption.Integer
    (TUI_CorefIE.class, "cluster-feature-minimum", "INTEGER", true, 2,
     "Minimum number of coreferent citations that need to agree to create a cluster feature.", null);

  static CommandOption.Integer clusterSizeLimit = new CommandOption.Integer
    (TUI_CorefIE.class, "cluster-size-limit", "INTEGER", true, 10000,
     "cluster Size Limit", null);

  static CommandOption.Integer methodChoice = new CommandOption.Integer
    (TUI_CorefIE.class, "methodchoice", "INTEGER", true, 1,
     "method for canonical citation creation", null);

  // Controls which state topology is added to the CRF in main's switch.
  static CommandOption.Integer markovOrder = new CommandOption.Integer
    (TUI_CorefIE.class, "markov-order", "INTEGER", true, 0,
     "0 = states for all transitions, 1 = half labels, 2 = three-quarter labels", null);

  static CommandOption.Integer numRepsOption = new CommandOption.Integer
    (TUI_CorefIE.class, "num-reps", "INTEGER", true, 5,
     "Number of random test-training splits to try.", null);

  // Metadata key prefixes expected in the (commented-out) CorefIE path.
  static String refNoMeta = "reference_no=";
  static String clusterNoMeta = "cluster_no=";

  // Field labels evaluated by FieldF1Evaluator, with their SGML start/end tags
  // and per-field weights; all four arrays are index-aligned (13 entries each).
  static String[] FIELD_NAMES = new String[] {"author", "title", "date", "publisher",
    "location", "pages", "institution", "editor", "volume", "note", "booktitle",
    "tech", "journal"};
  static String[] startTags = new String[] {"<author>", "<title>", "<date>", "<publisher>",
    "<location>", "<pages>", "<institution>", "<editor>", "<volume>", "<note>",
    "<booktitle>", "<tech>", "<journal>"};
  static String[] endTags = new String[] {"</author>", "</title>", "</date>", "</publisher>",
    "</location>", "</pages>", "</institution>", "</editor>", "</volume>", "</note>",
    "</booktitle>", "</tech>", "</journal>"};
  static double[] tagWeight = new double[]{1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0,
    1.0, 1.0, 1.0, 1.0, 1.0};

  // NOTE(review): FIELD_NAMES/startTags/endTags/tagWeight each have 13 entries but
  // NumFields is 10 — confirm which count the (commented-out) CorefIE code expects.
  static int NumFields = 10;

  // Instance separator selected from SEPARATORS in main, per headOrRefOption.
  private static String separator;

  /**
   * Entry point.  For each of num-reps repetitions: reads the input file, groups
   * instances into coreference clusters, splits train/test BY CLUSTER (so no
   * cluster straddles the split), builds a feature pipe that optionally includes
   * cluster-derived features, trains a CRF4, and reports token, per-field F1,
   * per-class, and whole-instance accuracies on train, test, and the subset of
   * test clusters with >= 3 members.
   *
   * @param args command-line options (see the CommandOption fields above)
   * @throws Exception on I/O or training failure; unknown markov-order exits(1)
   */
  public static void main (String[] args) throws Exception
  {
    CommandOption.List options = new CommandOption.List("Segmenting references based on coreference information.", new CommandOption[0]);
    options.add(TUI_CorefIE.class);
    options.add(BaseTUICRF.class);
    options.process(args);
    initOutputDirectory ();
    options.logOptions(logger);

    long timeStart = System.currentTimeMillis();

    separator = SEPARATORS[headOrRefOption.value()];
    Random r = new Random (randomSeedOption.value);

    for (int rep = 0; rep < numRepsOption.value; rep++) {
      logger.info ("REPETITION "+rep);

      // This is convoluted because the instances need to be split by cluster:
      // read raw line-groups first, then regroup them into cluster instances
      // before splitting, so train and test never share a cluster.
      InstanceList rawList = new InstanceList(new Alphabet(), new Alphabet());
      rawList.add(new LineGroupIterator(new FileReader(inputFileOption.value), Pattern.compile(separator), true));
      InstanceList rawClusters = getInstanceListClusters(rawList, inputFileOption.value);

      InstanceList[] trainTest = rawClusters.split(r, new double[] { trainingPct.value, 1 - trainingPct.value, });
      InstanceList trainClusters = trainTest[0];
      InstanceList testClusters = trainTest[1];
      logger.info("Num train clusters = " + trainClusters.size());
      logger.info("Num test clusters = " + testClusters.size());
//      System.out.println("Train clusters: ");
//      printClusterList (trainClusters);
//      System.out.println("\n\n\nTest clusters: ");
//      printClusterList (testClusters);

      System.out.println("Creating allclustersegmentation");
      Pipe basePipe = makeBasePipe();
      // Segmentation of every cluster, used as the source of cluster features.
      AllClusterSegmentation allseg = new AllClusterSegmentation(rawClusters, basePipe);
//      System.out.println("ALL SEGMENTATION: ");
//      allseg.print ();

      //TODO: Add loop, coreferent segmentation aware pipe
      Pipe segmentationsPipe = makeSegmentationsPipe (allseg, useClusterFeatures.value, useNumClusterOccurences.value, useBogusClusterFeatures.value);
      Pipe thePipe = new SerialPipes(new Pipe[]{
        makeBasePipe(),
        segmentationsPipe,
        new TokenSequence2FeatureVectorSequence(),
//        new PrintInputAndTarget(),
      });

      // Flatten the cluster lists back into per-citation training/testing instances.
      InstanceList training = new InstanceList(thePipe);
      training.add(new ClusterListIterator(trainClusters));
      InstanceList testing = new InstanceList(thePipe);
      testing.add(new ClusterListIterator(testClusters));
      logger.info ("Number of training instances = "+training.size ());
      logger.info ("Number of testing instances = "+testing.size ());

      CRF4 crf = new CRF4(thePipe, null);
      crf.setUseSparseWeights(useSparseWeights.value);
      // State topology per the markov-order option (see its description above).
      switch (markovOrder.value) {
        case 0:
          crf.addStatesForLabelsConnectedAsIn(training);
          break;
        case 1:
          crf.addStatesForHalfLabelsConnectedAsIn(training);
          break;
        case 2:
          crf.addStatesForThreeQuarterLabelsConnectedAsIn(training);
          break;
        default:
          System.err.println("Unknown markov-order "+markovOrder.value);
          System.exit (1);
      }

      // Test subset restricted to clusters with at least 3 citations.
      InstanceList nontrivialTest = getNonTrivialTesting (thePipe, testClusters);

      TransducerEvaluator eval = new TokenAccuracyEvaluator();
      eval.setNumIterationsToWait(10);
      eval.setNumIterationsToSkip(5);
      crf.train(training, null, testing, eval);

      TransducerEvaluator fieldEval = new FieldF1Evaluator(FIELD_NAMES);
      fieldEval.test(crf, training, "Training", null);
      fieldEval.test(crf, testing, "Testing", null);
      fieldEval.test(crf, nontrivialTest, "Clusters>=3", null);

      TransducerEvaluator perClassEval = new PerClassAccuracyEvaluator();
      perClassEval.test(crf, training, "Training", null);
      perClassEval.test(crf, testing, "Testing", null);
      perClassEval.test(crf, nontrivialTest, "Clusters>=3", null);

      TransducerEvaluator instanceEval = new InstanceAccuracyEvaluator();
      instanceEval.test(crf, training, "Training", null);
      instanceEval.test(crf, testing, "Testing", null);
      instanceEval.test(crf, nontrivialTest, "Clusters>=3", null);

      writeOutput(crf, training, "-train-" + rep);
      writeOutput(crf, testing, "-test-" + rep);
      // NOTE(review): prefix lacks the leading "-" used by the train/test outputs
      // above — confirm the resulting file name is intended.
      writeOutput(crf, nontrivialTest, "test-gt-3-" + rep);
      writeCrf(crf, "-" + rep);
//      saveCrf(crf, "-" + rep);
    }

    /*
    Sequence[] oldPaths = null;
    Sequence[] paths = null;
    do {
      oldPaths = null;
      paths = getViterbiPaths ()
    } while ((oldPaths == null) || !(pathsEqual (oldPaths, paths)) );
    */

    /*
    CorefIE corefIE = new CorefIE(crfInputFileOption.value, refNoMeta, clusterNoMeta, startTags, endTags, tagWeight, NumFields);
    corefIE.loadCRF();
    corefIE.canonicalInstanceMaker();
    corefIE.setClusterSizeLimit(clusterSizeLimit.value());
    int N = nBestChoice.value();//1;
    corefIE.setMethod(methodChoice.value);// 1, 2, 3
    System.out.println("N=" + N);
    System.out.println("Canonical citation creation method: " + methodChoice.value());
    System.out.println("Excluding singletons: " + excludingSingletons.value());
    System.out.println("clusterSizeLimit: " + clusterSizeLimit.value());
    corefIE.viterbiCRF_SingleFile(ttestFileOption.value, true, SEPERATOR[headOrRefOption.value()], N, excludingSingletons.value());
    */

    long timeEnd = System.currentTimeMillis();
    // Milliseconds -> seconds.
    double timeElapse = (timeEnd - timeStart) / (1000.000);
    System.out.println("Time elapses " + timeElapse + " seconds for testing.");
  }

  /**
   * Keeps only the clusters in clusterList with more than 2 members (i.e. the
   * "Clusters>=3" subset reported in main) and flattens them through the given
   * pipe into individual instances.
   *
   * @param pipe        the feature pipe to run the surviving citations through
   * @param clusterList instances whose data is an InstanceList (one cluster each)
   * @return a new InstanceList of per-citation instances from non-trivial clusters
   */
  private static InstanceList getNonTrivialTesting (Pipe pipe, InstanceList clusterList)
  {
    InstanceList goodClusters = new InstanceList (new Alphabet(), new Alphabet());
    for (InstanceList.Iterator it = clusterList.iterator(); it.hasNext();) {
      Instance instance = (Instance) it.next();
      InstanceList cluster = (InstanceList) instance.getData ();
      if (cluster.size() > 2) {
        goodClusters.add (instance);
      }
    }
    InstanceList goodInstances = new InstanceList (pipe);
    goodInstances.add (new ClusterListIterator(goodClusters));
    return goodInstances;
  }

  // Regex character classes for feature pipes.
  // NOTE(review): the non-ASCII characters below are mojibake — the original
  // Latin-1 accented ranges (e.g. A-Z plus À-Þ) were corrupted when this file
  // was re-encoded, and the source is truncated mid-way through ALPHANUM.
  // Restore from a pristine copy of the file before relying on these patterns.
  private static String CAPS = "[A-Z珀蜷駛靅";
  private static String ALPHA = "[A-Z珀蜷駛靉-z垞潕棈昡";
  private static String ALPHANUM = "[A-Z珀蜷駛靉-z垞潕棈
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -