📄 clusterpapersandvenues.java
字号:
ieInterface1 = new IEInterface(crfFile1); ieInterface1.loadCRF(crfFile1); File crfFile2 = new File(crfInputFile2.value()); ieInterface2 = new IEInterface(crfFile2); ieInterface2.loadCRF(crfFile2); File crfFile3 = new File(crfInputFile3.value()); ieInterface3 = new IEInterface(crfFile3); ieInterface3.loadCRF(crfFile3); File crfFile4 = new File(crfInputFile4.value()); ieInterface4 = new IEInterface(crfFile4); ieInterface4.loadCRF(crfFile4); } else { File crfFile = new File(crfInputFile.value()); ieInterface = new IEInterface(crfFile); ieInterface.loadCRF(crfFile); } } } /** Create citation nodes of type "type" from training files */ private static ArrayList[] createNodesFromTraining (String type) { FileIterator trainFI_1 = null; FileIterator trainFI_2 = null; FileIterator trainFI_3 = null; if (useCRF.value() == true) { trainFI_1 = new FileIterator (trainingDir1.value(), new RegexFileFilter(Pattern.compile(".*"))); if (trainingDir2.value() != null) trainFI_2 = new FileIterator (trainingDir2.value(), new RegexFileFilter(Pattern.compile(".*"))); if (trainingDir3.value() != null) trainFI_3 = new FileIterator (trainingDir3.value(), new RegexFileFilter(Pattern.compile(".*"))); } else { trainFI_1 = new FileIterator (trainingDir1.value(), new RegexFileFilter(Pattern.compile(".*"))); if (trainingDir2.value() != null) trainFI_2 = new FileIterator (trainingDir2.value(), new RegexFileFilter(Pattern.compile(".*"))); if (trainingDir3.value() != null) trainFI_3 = new FileIterator (trainingDir3.value(), new RegexFileFilter(Pattern.compile(".*"))); } ArrayList trainFileArray1 = trainFI_1.getFileArray(); ArrayList pubs1 = new ArrayList(); System.out.println("Number of files 1: " + trainFileArray1.size()); ArrayList nodes1; if (useMultipleCRFs.value() == true) { if (useTreeModel.value()) throw new UnsupportedOperationException ("tree model unsupported"); //nodes1 = CitationUtils.computeNodesWPubs(trainFileArray1, pubs1, ieInterface1, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); else nodes1 = CitationUtils.computeNodes(trainFileArray1, ieInterface1, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); } else { if (useTreeModel.value()) throw new UnsupportedOperationException ("tree model unsupported"); //nodes1 = CitationUtils.computeNodesWPubs(trainFileArray1, pubs1, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); else nodes1 = CitationUtils.computeNodes(trainFileArray1, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); } ArrayList nodes2 = null; ArrayList nodes3 = null; ArrayList pubs2 = null; ArrayList pubs3 = null; if (trainFI_2 != null) { ArrayList trainFileArray2 = trainFI_2.getFileArray(); pubs2 = new ArrayList (); System.out.println("Number of files 2: " + trainFileArray2.size()); if (useMultipleCRFs.value() == true) { if (useTreeModel.value()) throw new UnsupportedOperationException ("tree model unsupported"); //nodes2 = CitationUtils.computeNodesWPubs(trainFileArray2, pubs2, ieInterface2, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); else nodes2 = CitationUtils.computeNodes(trainFileArray2, ieInterface2, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); } else { if (useTreeModel.value()) throw new UnsupportedOperationException ("tree model unsupported"); //nodes2 = CitationUtils.computeNodesWPubs(trainFileArray2, pubs2, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); else nodes2 = CitationUtils.computeNodes(trainFileArray2, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); } } if (trainFI_3 != null) { ArrayList trainFileArray3 = trainFI_3.getFileArray(); pubs3 = new ArrayList(); System.out.println("Number of files 3: " + trainFileArray3.size()); //nodes3 = computeNodesWPubs(trainFileArray3, pubs3, ieInterface3); if (useMultipleCRFs.value() == true) { if (useTreeModel.value()) throw new UnsupportedOperationException ("tree model unsupported"); //nodes3 = CitationUtils.computeNodesWPubs(trainFileArray3, pubs3, ieInterface3, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); else nodes3 = CitationUtils.computeNodes(trainFileArray3, ieInterface3, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); } else { if (useTreeModel.value()) throw new UnsupportedOperationException ("tree model unsupported"); //nodes3 = CitationUtils.computeNodesWPubs(trainFileArray3, pubs3, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); else nodes3 = CitationUtils.computeNodes(trainFileArray3, ieInterface, !trainUsingLabeled.value(), numNBest.value(), nthViterbi.value(), type); } System.out.println(" There are " + nodes3.size() + " training nodes"); } ArrayList[] ret = null; int numLists = 1; if (nodes2 != null) numLists++; if (nodes3 != null) numLists++; if (numLists == 3) ret = new ArrayList[] {nodes1, nodes2, nodes3}; else if (numLists == 2) ret = new ArrayList[] {nodes1, nodes2}; else ret = new ArrayList[] {nodes1}; return ret; } /** Create citation nodes of type "type" from testing files */ private static ArrayList createNodesFromTesting (String type) { FileIterator testFI = null; if (useCRF.value() == true) testFI = new FileIterator (testingDir.value(), new RegexFileFilter(Pattern.compile(".*"))); else testFI = new FileIterator (testingDir.value(), new RegexFileFilter(Pattern.compile(".*"))); ArrayList testFileArray = testFI.getFileArray(); ArrayList testPubList = new ArrayList(); ArrayList test_nodes; if (useMultipleCRFs.value() == true) { test_nodes = CitationUtils.computeNodes(testFileArray,ieInterface4, false, numNBest.value(), nthViterbi.value(), type); } else { if (useTreeModel.value()) throw new UnsupportedOperationException ("tree model unsupported"); //test_nodes = CitationUtils.computeNodesWPubs(testFileArray, testPubList, ieInterface, useCRF.value(), numNBest.value(), nthViterbi.value(), type); else test_nodes = CitationUtils.computeNodes(testFileArray, ieInterface, useCRF.value(), numNBest.value(), nthViterbi.value(),type); } return test_nodes; } private static InstanceList getTrainingList (ArrayList[] nodes, Pipe p) { InstanceList ilist = new InstanceList(); if (loadMEFile.value() == null) { InstanceList ilist1 = CitationUtils.makePairs(p, nodes[0]); ilist.add(ilist1); if (nodes.length > 1) { InstanceList ilist2 = CitationUtils.makePairs(p, nodes[1]); ilist.add(ilist2); } if (nodes.length > 2) { InstanceList ilist3 = CitationUtils.makePairs(p, nodes[2]); ilist.add(ilist3); } } return ilist; } private static Pipe getPaperPipe (AbstractStatisticalTokenDistance distanceMetric, StringDistance triGramDistanceMetric) { Pipe p = new SerialPipes (new Pipe[] { new ExactFieldMatchPipe(Citation.corefFields), new PageMatchPipe(), new YearsWithinFivePipe(), //new FieldStringDistancePipe(new NeedlemanWunsch(), //Citation.corefFields, "EDIST"), //new FieldStringDistancePipe(softtfidf, Citation.corefFields, "softTFIDF"), new FieldStringDistancePipe(triGramDistanceMetric, Citation.corefFields, "trigramTFIDF"), //new PlainFieldPipe (distanceMetric, distanceMetricEditDist), new GlobalPipe(distanceMetric), //new TitlePipe(distanceMetric), new AuthorPipe(distanceMetric), //new VenueClusterPipe(), //new JournalPipe(distanceMetric), //new PagesPipe(distanceMetric), new HeuristicPipe(Citation.corefFields), new InterFieldPipe(), //new HeuristicPipe(Citation.corefFields), //new DatePipe(distanceMetric), //new FuchunPipe(distanceMetricEditDist), new NodePair2FeatureVector (), new Target2Label (), }); return p; } private static Pipe getVenuePipe (AbstractStatisticalTokenDistance distanceMetric, StringDistance triGramDistanceMetric) { Pipe p = new SerialPipes (new Pipe[] { new ExactFieldMatchPipe(Citation.corefFields), new PageMatchPipe(), new YearsWithinFivePipe(), //new FieldStringDistancePipe(new NeedlemanWunsch(), //Citation.corefFields, "EDIST"), //new FieldStringDistancePipe(softtfidf, Citation.corefFields, "softTFIDF"), new FieldStringDistancePipe(triGramDistanceMetric, Citation.corefFields, "trigramTFIDF"), //new PlainFieldPipe (distanceMetric, distanceMetricEditDist), new GlobalPipe(distanceMetric), //new TitlePipe(distanceMetric), new AuthorPipe(distanceMetric), //new JournalPipe(distanceMetric), ///new BooktitlePipe(distanceMetric), new VenuePipe(distanceMetric), new VenueAcronymPipe(), //new PagesPipe(distanceMetric), new HeuristicPipe(Citation.corefFields), new InterFieldPipe(), //new HeuristicPipe(Citation.corefFields), //new DatePipe(distanceMetric), //new FuchunPipe(distanceMetricEditDist), new NodePair2FeatureVector (), new Target2Label (), // new PrintInputAndTarget (), }); return p; } private static TFIDF getDistanceMetric (ArrayList allnodes) { //make distance metrics TFIDF tfidf = new TFIDF(); NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); TFIDF ret = new TFIDF (nGramTokenizer); CitationUtils.makeDistMetric(allnodes, tfidf, ret); return ret; } private static void readCluster (File f) { } protected static void printClustersToFile (Collection citations, String file) { try { BufferedWriter out = new BufferedWriter(new FileWriter(file)); printClustersAsReceived (citations, out); out.close(); } catch (Exception e) {e.printStackTrace();} } protected static void printClustersAsReceived (Collection citations, BufferedWriter out) { int refNum = 1; int clNum = 1; for (Iterator it = citations.iterator(); it.hasNext();) { Collection cl = (Collection)it.next(); for (Iterator i2 = cl.iterator(); i2.hasNext(); ) { Citation c = (Citation)i2.next(); String lab = (String)c.getLabel(); try { out.write("<NEWREFERENCE>\n"); out.write("<meta reference_no=\"" + refNum + "\" cluster_no=\"" + clNum + "\" true_id=\"" + lab + "\"></meta>"); out.write(c.getOrigString()); } catch (Exception e) {} refNum++; } clNum++; } } protected static void printCollectionReferences (Collection collection) { Iterator i1 = collection.iterator(); while (i1.hasNext()) { Iterator i2 = ((Collection)i1.next()).iterator(); while (i2.hasNext()) { Object o = i2.next(); if (o instanceof Node) { Node n = (Node)o; System.out.println("Node: " + n); System.out.println("Node label: " + n.getLabel()); System.out.println("Node index: " + n.getIndex()); } else { System.out.println("Node: " + o); } } } } public static double scoreCitations(List citations) { double score = 0.0; for (Iterator i = citations.iterator(); i.hasNext(); ) { score += (double)((Citation)i.next()).getScore(); } return score/(double)citations.size(); } /* This method will create a collection of collections from the citation nodes */ /* protected static Collection makeCollections (ArrayList nodes) { HashMap map = new HashMap(); // keep an index of node label values to collections Collection collection = new LinkedHashSet(); for (int i=0; i<nodes.size(); i++) { Node n = (Node)nodes.get(i); Object o1 = n.getLabel(); Collection c = (Collection)map.get(o1); if (c != null) { c.add(n); //System.out.println("adding new node " + n + " to existing collection"); } else { Collection newC = new LinkedHashSet(); System.out.println("Creating new collection"); newC.add(n); map.put(o1, newC); } } Iterator i1 = map.values().iterator(); while (i1.hasNext()) { collection.add((Collection)i1.next()); } return collection; }*/ /* protected static List runCanopies(List files) throws Exception { double loose = 0.3; double tight = 0.7; String indexName = "/tmp/index"; Analyzer analyzer = new SimpleAnalyzer(); //Analyzer analyzer = new NGramAnalyzer(); //Analyzer analyzer = new TriGramAnalyzer(); //QueryConstructor queryConstructor = new QueryConstructorSimple(analyzer); QueryConstructor queryConstructor = new QueryConstructorAuthDateTitle(analyzer); IndexFiles.indexFiles(files, indexName, analyzer); CanopyMaker cm = new CanopyMaker(indexName, queryConstructor); cm.setLooseThreshold(loose); cm.setTightThreshold(tight); cm.makeCanopies(); Util.allScores(cm); return Util.getUniquePairsFromSets(Util.convertIds(cm.getCanopies(), cm.getDocIdToDocno())); } */}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -