⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tui_corefie.java

📁 这是 MALLET（MAchine Learning for LanguagE Toolkit，一个 Java 机器学习工具包）的源代码。里面有许多内容。请大家慢慢琢磨。
💻 JAVA
字号:
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This software is provided under the terms of the Common Public License,   version 1.0, as published by http://www.opensource.org.  For further   information, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.projects.seg_plus_coref.ie;import bsh.EvalError;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.AbstractPipeInputIterator;import edu.umass.cs.mallet.base.pipe.iterator.LineGroupIterator;import edu.umass.cs.mallet.base.pipe.tsf.LexiconMembership;import edu.umass.cs.mallet.base.pipe.tsf.OffsetConjunctions;import edu.umass.cs.mallet.base.pipe.tsf.RegexMatches;import edu.umass.cs.mallet.base.pipe.tsf.TokenText;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.util.CharSequenceLexer;import edu.umass.cs.mallet.base.util.CommandOption;import edu.umass.cs.mallet.base.util.MalletLogger;import edu.umass.cs.mallet.projects.seg_plus_coref.BaseTUICRF;import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.LineGroupIterator2;import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.SGML2FieldsPipe;import java.io.File;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.Reader;import java.util.*;import java.util.logging.Logger;import java.util.regex.Pattern;import java.util.regex.Matcher;import gnu.trove.TIntArrayList;/** * Created: May 22, 2004 *  * @author <A HREF="mailto:casutton@cs.umass.edu>casutton@cs.umass.edu</A> * @version $Id: TUI_CorefIE.java,v 1.14 2004/06/15 14:39:58 casutton Exp $ */public class TUI_CorefIE extends BaseTUICRF {  private static String[] SEPARATORS = new String[]{"<NEW_HEADER>", "<NEWREFERENCE>"};//	private static String[] SEPERATOR = new String[] {"<NEW_HEADER>", "^$"};  private static 
final Logger logger = MalletLogger.getLogger(TUI_CorefIE.class.getName());  static CommandOption.File crfInputFileOption = new CommandOption.File          (TUI_CorefIE.class, "crf-input-file", "FILENAME", true, null,                  "The name of the file to read the trained CRF for testing.", null);  static CommandOption.File inputFileOption = new CommandOption.File          (TUI_CorefIE.class, "input-file", "FILENAME", true, null,                  "The name of the file containing the testing data.", null);  static CommandOption.Integer headOrRefOption = new CommandOption.Integer          (TUI_CorefIE.class, "head-or-ref", "INTEGER", true, 0,                  "0 for header, 1 for reference", null);  static CommandOption.Integer nBestChoice = new CommandOption.Integer          (TUI_CorefIE.class, "nbestchoice", "INTEGER", true, 1,                  "N for N-best", null);  static CommandOption.Boolean includeBibtexLexicons = new CommandOption.Boolean          (TUI_CorefIE.class, "include-bibtex-lexicons", "INTEGER", true, false,                  "Whether to use BibTeX lexicons from Fuchun.", null);  static CommandOption.Boolean excludingSingletons = new CommandOption.Boolean          (TUI_CorefIE.class, "exclude-singletons", "boolean", true, true,                  "excluding singletons.", null);  static CommandOption.Boolean useClusterFeatures = new CommandOption.Boolean          (TUI_CorefIE.class, "use-cluster-features", "boolean", true, true,                  "excluding singletons.", null);  static CommandOption.Boolean useNegativeClusterFeatures = new CommandOption.Boolean          (TUI_CorefIE.class, "use-negative-cluster-features", "boolean", true, false,                  "Whether to use features that say words AREN'T tagged in cluster.", null);  static CommandOption.Boolean useNumClusterOccurences = new CommandOption.Boolean          (TUI_CorefIE.class, "use-cluster-occurrences", "boolean", true, false,                  "Whether to use number of tagged word 
occurrences in cluster as features.", null);  static CommandOption.Boolean useBogusClusterFeatures = new CommandOption.Boolean          (TUI_CorefIE.class, "use-bogus-cluster-features", "boolean", true, false,                  "If true, use features from the instance's true segmentation.", null);  static CommandOption.Boolean useSparseWeights = new CommandOption.Boolean          (TUI_CorefIE.class, "use-sparse-weights", "boolean", true, false,                  "If true, use only input features that appear in training set.", null);  static CommandOption.Integer clusterFeatureMinimum = new CommandOption.Integer          (TUI_CorefIE.class, "cluster-feature-minimum", "INTEGER", true, 2,                  "Minimum number of coreferent citations that need to agree to create a cluster feature.", null);  static CommandOption.Integer clusterSizeLimit = new CommandOption.Integer          (TUI_CorefIE.class, "cluster-size-limit", "INTEGER", true, 10000,                  "cluster Size Limit", null);  static CommandOption.Integer methodChoice = new CommandOption.Integer          (TUI_CorefIE.class, "methodchoice", "INTEGER", true, 1,                  "method for canonical citation creation", null);  static CommandOption.Integer markovOrder = new CommandOption.Integer          (TUI_CorefIE.class, "markov-order", "INTEGER", true, 0,                  "0 = states for all transitions, 1 = half labels, 2 = three-quarter labels", null);  static CommandOption.Integer numRepsOption = new CommandOption.Integer          (TUI_CorefIE.class, "num-reps", "INTEGER", true, 5,                  "Number of random test-training splits to try.", null);  static String refNoMeta = "reference_no=";  static String clusterNoMeta = "cluster_no=";  static String[] FIELD_NAMES = new String[]  {"author", "title", "date", "publisher", "location", "pages",   "institution", "editor", "volume", "note", "booktitle", "tech", "journal"};  static String[] startTags = new String[]  {"<author>", "<title>", "<date>", 
"<publisher>", "<location>", "<pages>",   "<institution>", "<editor>", "<volume>", "<note>", "<booktitle>", "<tech>", "<journal>"};  static String[] endTags = new String[]  {"</author>", "</title>", "</date>", "</publisher>", "</location>", "</pages>",   "</institution>", "</editor>", "</volume>", "</note>", "</booktitle>", "</tech>", "</journal>"};  static double[] tagWeight = new double[]{1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};  static int NumFields = 10;  private static String separator;  public static void main (String[] args) throws Exception  {    CommandOption.List options = new CommandOption.List("Segmenting references based on coreference information.",            new CommandOption[0]);    options.add(TUI_CorefIE.class);    options.add(BaseTUICRF.class);    options.process(args);    initOutputDirectory ();    options.logOptions(logger);    long timeStart = System.currentTimeMillis();    separator = SEPARATORS[headOrRefOption.value()];    Random r = new Random (randomSeedOption.value);    for (int rep = 0; rep < numRepsOption.value; rep++) {      logger.info ("REPETITION "+rep);      // This is convoluted because the instances need to be split by cluster      InstanceList rawList = new InstanceList(new Alphabet(), new Alphabet());      rawList.add(new LineGroupIterator(new FileReader(inputFileOption.value), Pattern.compile(separator), true));      InstanceList rawClusters = getInstanceListClusters(rawList, inputFileOption.value);      InstanceList[] trainTest = rawClusters.split(r, new double[] {          trainingPct.value,          1 - trainingPct.value, });      InstanceList trainClusters = trainTest[0];      InstanceList testClusters = trainTest[1];      logger.info("Num train clusters = " + trainClusters.size());      logger.info("Num test clusters = " + testClusters.size());//    System.out.println("Train clusters: ");//    printClusterList (trainClusters);//    System.out.println("\n\n\nTest clusters: ");//    
printClusterList (testClusters);      System.out.println("Creating allclustersegmentation");      Pipe basePipe = makeBasePipe();      AllClusterSegmentation allseg = new AllClusterSegmentation(rawClusters, basePipe);//    System.out.println("ALL SEGMENTATION: ");//    allseg.print ();      //TODO: Add loop, coreferent segmentation aware pipe      Pipe segmentationsPipe = makeSegmentationsPipe (allseg, useClusterFeatures.value,                  useNumClusterOccurences.value, useBogusClusterFeatures.value);      Pipe thePipe = new SerialPipes(new Pipe[]{        makeBasePipe(),        segmentationsPipe,        new TokenSequence2FeatureVectorSequence(),//        new PrintInputAndTarget(),      });      InstanceList training = new InstanceList(thePipe);      training.add(new ClusterListIterator(trainClusters));      InstanceList testing = new InstanceList(thePipe);      testing.add(new ClusterListIterator(testClusters));      logger.info ("Number of training instances = "+training.size ());      logger.info ("Number of testing instances = "+testing.size ());      CRF4 crf = new CRF4(thePipe, null);      crf.setUseSparseWeights(useSparseWeights.value);      switch (markovOrder.value) {        case 0:          crf.addStatesForLabelsConnectedAsIn(training); break;        case 1:          crf.addStatesForHalfLabelsConnectedAsIn(training); break;        case 2:          crf.addStatesForThreeQuarterLabelsConnectedAsIn(training); break;        default:          System.err.println("Unknown markov-order "+markovOrder.value);          System.exit (1);      }      InstanceList nontrivialTest = getNonTrivialTesting (thePipe, testClusters);      TransducerEvaluator eval = new TokenAccuracyEvaluator();      eval.setNumIterationsToWait(10);      eval.setNumIterationsToSkip(5);      crf.train(training, null, testing, eval);      TransducerEvaluator fieldEval = new FieldF1Evaluator(FIELD_NAMES);      fieldEval.test(crf, training, "Training", null);      fieldEval.test(crf, testing, 
"Testing", null);      fieldEval.test(crf, nontrivialTest, "Clusters>=3", null);      TransducerEvaluator perClassEval = new PerClassAccuracyEvaluator();      perClassEval.test(crf, training, "Training", null);      perClassEval.test(crf, testing, "Testing", null);      perClassEval.test(crf, nontrivialTest, "Clusters>=3", null);      TransducerEvaluator instanceEval = new InstanceAccuracyEvaluator();      instanceEval.test(crf, training, "Training", null);      instanceEval.test(crf, testing, "Testing", null);      instanceEval.test(crf, nontrivialTest, "Clusters>=3", null);      writeOutput(crf, training, "-train-" + rep);      writeOutput(crf, testing, "-test-" + rep);      writeOutput(crf, nontrivialTest, "test-gt-3-" + rep);       writeCrf(crf, "-" + rep);//      saveCrf(crf, "-" + rep);    }    /*    Sequence[] oldPaths = null;    Sequence[] paths = null;    do {      oldPaths = null;      paths = getViterbiPaths ()    } while ((oldPaths == null) || !(pathsEqual (oldPaths, paths)) );      */    /*      CorefIE corefIE = new CorefIE(crfInputFileOption.value,              refNoMeta, clusterNoMeta, startTags, endTags, tagWeight, NumFields);      corefIE.loadCRF();      corefIE.canonicalInstanceMaker();      corefIE.setClusterSizeLimit(clusterSizeLimit.value());      int N = nBestChoice.value();//1;      corefIE.setMethod(methodChoice.value);// 1, 2, 3      System.out.println("N=" + N);      System.out.println("Canonical citation creation method: " + methodChoice.value());      System.out.println("Excluding singletons: " + excludingSingletons.value());      System.out.println("clusterSizeLimit: " + clusterSizeLimit.value());      corefIE.viterbiCRF_SingleFile(ttestFileOption.value, true, SEPERATOR[headOrRefOption.value()], N,              excludingSingletons.value());    */    long timeEnd = System.currentTimeMillis();    double timeElapse = (timeEnd - timeStart) / (1000.000);    System.out.println("Time elapses " + timeElapse + " seconds for testing.");  }  
/**
   * Builds the evaluation set restricted to clusters with more than two members.
   *
   * Each element of clusterList is expected to carry an InstanceList (the
   * cluster) as its data. Clusters of size greater than 2 are kept, and the kept
   * clusters are fed through a ClusterListIterator into a new InstanceList that
   * uses the given pipe.
   *
   * @param pipe        pipe used to construct the returned instance list
   * @param clusterList list whose instances each wrap one cluster
   * @return instances produced from the clusters that have more than two members
   */
  private static InstanceList getNonTrivialTesting (Pipe pipe, InstanceList clusterList)
  {
    InstanceList keptClusters = new InstanceList (new Alphabet(), new Alphabet());
    InstanceList.Iterator clusterIter = clusterList.iterator();
    while (clusterIter.hasNext()) {
      Instance clusterInstance = (Instance) clusterIter.next();
      InstanceList members = (InstanceList) clusterInstance.getData ();
      if (members.size() <= 2) {
        // Too small to count as non-trivial; skip it.
        continue;
      }
      keptClusters.add (clusterInstance);
    }

    InstanceList expanded = new InstanceList (pipe);
    expanded.add (new ClusterListIterator(keptClusters));
    return expanded;
  }

  // NOTE(review): the non-ASCII characters inside the character classes below look
  // mis-encoded in this copy, and the last declaration is cut off — confirm against
  // the original source before relying on these patterns.
  private static String CAPS = "[A-Z珀蜷駛靅";
  private static String ALPHA = "[A-Z珀蜷駛靉-z垞潕棈昡";
  private static String ALPHANUM = "[A-Z珀蜷駛靉-z垞潕棈

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -