📄 bencitationtuinoseg.java
字号:
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).http://www.cs.umass.edu/~mccallum/malletThis software is provided under the terms of the Common Public License,version 1.0, as published by http://www.opensource.org. For furtherinformation, see the file `LICENSE' included with this distribution. */package edu.umass.cs.mallet.projects.seg_plus_coref.coreference;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.classify.*;import edu.umass.cs.mallet.base.fst.*;import edu.umass.cs.mallet.base.util.*;import edu.umass.cs.mallet.base.pipe.*;import edu.umass.cs.mallet.base.pipe.iterator.*;import edu.umass.cs.mallet.base.pipe.tsf.*;import java.io.*;import java.util.*;import java.util.regex.*;import java.lang.reflect.Array;import com.wcohen.secondstring.*;import com.wcohen.secondstring.tokens.NGramTokenizer;import com.wcohen.secondstring.tokens.SimpleTokenizer;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.ExactFieldMatchPipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.PageMatchPipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.YearsWithinFivePipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.FieldStringDistancePipe;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.*;import edu.umass.cs.mallet.projects.seg_plus_coref.clustering.*;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEInterface;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.IEEvaluator;import edu.umass.cs.mallet.projects.seg_plus_coref.ie.CRFIO;//import edu.umass.cs.mallet.users.hay.canopy.Util;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructor;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructorSimple;//import edu.umass.cs.mallet.users.hay.canopy.IndexFiles;//import edu.umass.cs.mallet.users.hay.canopy.CanopyMaker;//import edu.umass.cs.mallet.users.hay.canopy.QueryConstructorAuthDateTitle;//import salvo.jesus.graph.WeightedGraph;//import org.apache.lucene.analysis.Analyzer;//import org.apache.lucene.analysis.SimpleAnalyzer;public class BenCitationTUINoSeg{ private static String[] SEPERATOR = new String[] {"<NEW_HEADER>", "<NEWREFERENCE>"}; private static CRF crf = null; private static Pipe pipe; private static IEInterface ieInterface; private static IEInterface ieInterface1; private static IEInterface ieInterface2; private static IEInterface ieInterface3; private static IEInterface ieInterface4; private static StringDistance softtfidf; private static StringDistance tfidf; private static Jaccard distanceMetricEditDist; private static StringDistance triGramDistanceMetric; static CommandOption.Boolean fullPartition = new CommandOption.Boolean (TUI.class, "full-partition", "FILENAME", false, false, "Use full partitioninig", null); static CommandOption.Boolean useWeightedAvg = new CommandOption.Boolean (TUI.class, "use-weighted-avg", "FILENAME", false, false, "Use weighted average", null); static CommandOption.String loadMEFile = new CommandOption.String (TUI.class, "load-me-file", "FILENAME", true, null, "The name of the MaxEnt model file.", null); static CommandOption.String outputFile = new CommandOption.String (TUI.class, "output-file", "FILENAME", true, null, "The name of the file where output clusters will be printed to.", null); static CommandOption.String crfInputFile = new CommandOption.String (TUI.class, "crf-input-file", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.String crfInputFile1 = new CommandOption.String (TUI.class, "crf-input-file-1", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.String crfInputFile2 = new CommandOption.String (TUI.class, "crf-input-file-2", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.String crfInputFile3 = new CommandOption.String (TUI.class, "crf-input-file-3", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.String crfInputFile4 = new CommandOption.String (TUI.class, "crf-input-file-4", "FILENAME", true, null, "The name of the file to read the trained CRF for testing.", null); static CommandOption.Boolean useCRF = new CommandOption.Boolean (TUI.class, "use-crf", "BOOL", false, false, "Use CRF or not.", null); static CommandOption.Boolean useMultipleCRFs = new CommandOption.Boolean (TUI.class, "use-multiple-crfs", "BOOL", false, false, "Use a separate crf for each data segment or not.", null); static CommandOption.Boolean useTreeModel = new CommandOption.Boolean (TUI.class, "use-tree-model", "BOOL", false, false, "Use and train tree model.", null); static CommandOption.Boolean useCorrelational = new CommandOption.Boolean (TUI.class, "use-correlational", "BOOL", false, false, "Use Correlational Clustering or not, if not uses Greedy.", null); static CommandOption.Boolean useFeatureInduction = new CommandOption.Boolean (TUI.class, "use-feature-induction", "BOOL", false, false, "Use Feature Induction or Not.", null); static CommandOption.Boolean useNBest = new CommandOption.Boolean (TUI.class, "use-n-best", "BOOL", false, false, "Use NBest or not.", null); static CommandOption.Boolean useTrueNumClusters = new CommandOption.Boolean (TUI.class, "use-true-num-clusters", "BOOL", false, false, "Use NBest or not.", null); static CommandOption.Boolean useOptimal = new CommandOption.Boolean (TUI.class, "use-optimal", "BOOL", false, false, "Use NBest or not.", null); static CommandOption.Integer optimalNBest = new CommandOption.Integer (TUI.class, "optimal-n-best", "INTEGER", true, -1, "Size of n, for searching for optimal n-best configuration.", null); static CommandOption.Integer rBeamSize = new CommandOption.Integer (TUI.class, "r-beam-size", "INTEGER", true, 10, "Size of n, for searching for optimal n-best configuration.", null); static CommandOption.String trainingDir1 = new CommandOption.String (TUI.class, "training-dir-1", "FILENAME", true, null, "Directory containing training files.", null); static CommandOption.String trainingDir2 = new CommandOption.String (TUI.class, "training-dir-2", "FILENAME", true, null, "Directory containing training files.", null); static CommandOption.String trainingDir3 = new CommandOption.String (TUI.class, "training-dir-3", "FILENAME", true, null, "Directory containing training files.", null); static CommandOption.String testingDir = new CommandOption.String (TUI.class, "testing-dir", "FILENAME", true, null, "Directory containing testing files.", null); static CommandOption.Integer searchIters = new CommandOption.Integer (TUI.class, "search-iters", "INTEGER", true, 3, "Number of search iterations.", null); static CommandOption.Integer searchReductions = new CommandOption.Integer (TUI.class, "search-reductions", "INTEGER", true, 5, "Number of search reductions.", null); static CommandOption.Integer numNBest = new CommandOption.Integer (TUI.class, "num-n-best", "INTEGER", true, 3, "Number of n-best candidates to store.", null); static CommandOption.Integer nthViterbi = new CommandOption.Integer (TUI.class, "nth-viterbi", "INTEGER", true, 0, "Number of n-best candidates to use .", null); static CommandOption.Boolean trainUsingLabeled = new CommandOption.Boolean (TUI.class, "train-using-labeled", "BOOL", true, false, "Train just using the labeled data, but test on CRF output", null); static final CommandOption.List commandOptions = new CommandOption.List ( "Training, testing and running information extraction on paper header or reference.", new CommandOption[] { useWeightedAvg, trainUsingLabeled, rBeamSize, loadMEFile, useTreeModel, fullPartition, outputFile, useOptimal, crfInputFile, crfInputFile1, crfInputFile2, crfInputFile3, crfInputFile4, useCRF, useMultipleCRFs, useFeatureInduction, useCorrelational, useNBest, optimalNBest, useTrueNumClusters, trainingDir1, trainingDir2, trainingDir3, testingDir, searchIters, searchReductions, numNBest, nthViterbi }); /* // this method simply places each node (citation) in a publication object // this needs to be reworked when we consider how Publications and Citations // actually interact - i.e. are Publications nodes in the graph - or just ciations protected static ArrayList computePublications (ArrayList nodes) { ArrayList pubs = new ArrayList(); for (int i=0; i<nodes.size(); i++) { pubs.add(new Publication ((Node)nodes.get(i))); } return pubs; }*/ protected static ArrayList computeNodes(ArrayList trainFileArray, IEInterface ieInterface) { return computeNodes(trainFileArray, ieInterface, false); } protected static ArrayList computeNodes(ArrayList trainFileArray, IEInterface ieInterface, boolean useCRFLocal) { System.out.println("Computing nodes..."); long timeStart = System.currentTimeMillis(); Reader reader; ArrayList nodes = new ArrayList(); HashMap hMap = new HashMap(); // keys are cluster IDs, values are publications int index = 0; for(int i=0; i<trainFileArray.size(); i++){ File file = (File)trainFileArray.get(i); String fileID = file.toString(); System.out.println(i + ": " + fileID ); try { reader = new FileReader (file); } catch (Exception e) { throw new IllegalArgumentException ("Can't read file "+file); } LineGroupIterator lineI = new LineGroupIterator (reader, Pattern.compile(SEPERATOR[1]), true); while(lineI.hasNext()){ String str = lineI.getLineGroup(); Integer id = new Integer(index++); String label = fileID; // <meta reference_no="10" cluster_no="2"></meta> String start_tag = "<meta"; // intentionally left off the end tag, because of attributes: String end_tag = "</meta>"; String s = SGMLStringOperation.locateField(start_tag, end_tag, str); String[] ss = s.split("\""); if (ss != null && ss.length == 5) { label = ss[3]; label.intern(); id = new Integer(ss[1]); } str = str.substring(str.indexOf(end_tag)+end_tag.length(), str.length()); str = str.intern(); //str = str.toLowerCase(); //nodes.add(new Node(new Citation(str, label, id.intValue()))); if (useCRFLocal) { nodes.add(new Citation(str, label, id.intValue(), ieInterface, numNBest.value(), nthViterbi.value())); } else { nodes.add(new Citation(str, label, id.intValue())); } //System.out.println("X" + str); //System.out.println("X" + label); //System.out.println("X" + id); lineI.nextLineGroup(); } } long timeEnd = System.currentTimeMillis(); double timeElapse = (timeEnd - timeStart)/(1000.000); System.out.println("Time elapses " + timeElapse + " seconds for computing nodes."); return nodes; } protected static ArrayList computeNodesWPubs(ArrayList trainFileArray, ArrayList publications, IEInterface ieInterface) { return computeNodesWPubs(trainFileArray, publications, ieInterface, false); } protected static ArrayList computeNodesWPubs(ArrayList trainFileArray, ArrayList publications, IEInterface ieInterface, boolean useCRFLocal) { System.out.println("Computing nodes..."); long timeStart = System.currentTimeMillis(); Reader reader; ArrayList nodes = new ArrayList(); HashMap hMap = new HashMap(); // keys are cluster IDs, values are publications int index = 0; for(int i=0; i<trainFileArray.size(); i++){ File file = (File)trainFileArray.get(i); String fileID = file.toString(); System.out.println(i + ": " + fileID ); try { reader = new FileReader (file); } catch (Exception e) { throw new IllegalArgumentException ("Can't read file "+file); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -