// File: bencitationtuinoseg.java
// (code-viewer residue, not part of the Java source: "Font size:")
LineGroupIterator lineI = new LineGroupIterator (reader, Pattern.compile(SEPERATOR[1]), true); while(lineI.hasNext()){ String str = lineI.getLineGroup(); Integer id = new Integer(index++); String label = fileID; // <meta reference_no="10" cluster_no="2"></meta> String start_tag = "<meta"; // intentionally left off the end tag, because of attributes: String end_tag = "</meta>"; String s = SGMLStringOperation.locateField(start_tag, end_tag, str); String[] ss = s.split("\""); if (ss != null && ss.length == 5) { label = ss[3]; label.intern(); id = new Integer(ss[1]); } str = str.substring(str.indexOf(end_tag)+end_tag.length(), str.length()); str = str.intern(); //str = str.toLowerCase(); Citation cit = null; if (useCRFLocal) { cit = new Citation(str, label, id.intValue(), ieInterface, numNBest.value(), nthViterbi.value()); } else { cit = new Citation(str, label, id.intValue()); } nodes.add(cit); Publication p = (Publication)hMap.get(label); // look up publication that this // belongs to if (p != null) { p.addNewCitation (cit); // add citation to publication } else { p = new Publication (cit); // create new publication with this citation hMap.put(label, p); // add publication to hash map publications.add(p); } lineI.nextLineGroup(); } } long timeEnd = System.currentTimeMillis(); double timeElapse = (timeEnd - timeStart)/(1000.000); System.out.println("Time elapses " + timeElapse + " seconds for computing nodes."); return nodes; } public static StringDistance computeDistanceMetric (ArrayList nodes) { ArrayList allStrings = new ArrayList(); StringDistance tfidf = new TFIDF (); for (int i=0; i < nodes.size(); i++) { //Citation c = (Citation)((Node)nodes.get(i)).getObject(); Citation c = (Citation)nodes.get(i); allStrings.addAll(c.getAllStringsWrapped()); } tfidf.accumulateStatistics(allStrings.iterator()); return tfidf; //return new SoftTFIDF(tfidf); } private static void makeDistMetric(List list) { List nodes = new ArrayList(); Iterator iter = list.iterator(); while 
(iter.hasNext()) { nodes.add((Citation)iter.next()); } NGramTokenizer nGramTokenizer = new NGramTokenizer(3,3,false, new SimpleTokenizer(true, true)); ArrayList allStrings = new ArrayList(); tfidf = new TFIDF (); //softtfidf = new SoftTFIDF(new JaroWinkler(), 0.8); triGramDistanceMetric = new TFIDF(nGramTokenizer); for (int i=0; i < nodes.size(); i++) { Citation c = (Citation)nodes.get(i); allStrings.addAll(c.getAllStringsWrapped()); } tfidf.accumulateStatistics(allStrings.iterator()); triGramDistanceMetric.accumulateStatistics(allStrings.iterator()); //softtfidf.accumulateStatistics(allStrings.iterator()); } public static void main (String[] args) throws FileNotFoundException { commandOptions.process (args); String[] startTags = new String[]{"<author>"}; String[] endTags = new String[]{"</author>"}; boolean oldCluster = false; boolean newCluster = true;//true; if (useCRF.value() == true) { if (useMultipleCRFs.value() == true) { System.out.println("Initializing CRF"); File crfFile1 = new File(crfInputFile1.value()); ieInterface1 = new IEInterface(crfFile1); ieInterface1.loadCRF(crfFile1); File crfFile2 = new File(crfInputFile2.value()); ieInterface2 = new IEInterface(crfFile2); ieInterface2.loadCRF(crfFile2); File crfFile3 = new File(crfInputFile3.value()); ieInterface3 = new IEInterface(crfFile3); ieInterface3.loadCRF(crfFile3); File crfFile4 = new File(crfInputFile4.value()); ieInterface4 = new IEInterface(crfFile4); ieInterface4.loadCRF(crfFile4); } else { File crfFile = new File(crfInputFile.value()); ieInterface = new IEInterface(crfFile); ieInterface.loadCRF(crfFile); } } if (useNBest.value() == true) { System.out.println("Using n-best CRF"); } FileIterator trainFI_1 = null; FileIterator trainFI_2 = null; FileIterator trainFI_3 = null; if (useCRF.value() == true) { trainFI_1 = new FileIterator (trainingDir1.value(), new RegexFileFilter(Pattern.compile(".*"))); if (trainingDir2.value() != null) trainFI_2 = new FileIterator (trainingDir2.value(), new 
RegexFileFilter(Pattern.compile(".*"))); if (trainingDir3.value() != null) trainFI_3 = new FileIterator (trainingDir3.value(), new RegexFileFilter(Pattern.compile(".*"))); } else { trainFI_1 = new FileIterator (trainingDir1.value(), new RegexFileFilter(Pattern.compile(".*"))); if (trainingDir2.value() != null) trainFI_2 = new FileIterator (trainingDir2.value(), new RegexFileFilter(Pattern.compile(".*"))); if (trainingDir3.value() != null) trainFI_3 = new FileIterator (trainingDir3.value(), new RegexFileFilter(Pattern.compile(".*"))); } ArrayList trainFileArray1 = trainFI_1.getFileArray(); ArrayList pubs1 = new ArrayList(); System.out.println("Number of files 1: " + trainFileArray1.size()); //ArrayList nodes1 = computeNodesWPubs(trainFileArray1, pubs1, //ieInterface1); ArrayList nodes1; if (useMultipleCRFs.value() == true) { if (useTreeModel.value()) nodes1 = computeNodesWPubs(trainFileArray1, pubs1, ieInterface1, !trainUsingLabeled.value()); else nodes1 = computeNodes(trainFileArray1, ieInterface1, !trainUsingLabeled.value()); } else { if (useTreeModel.value()) nodes1 = computeNodesWPubs(trainFileArray1, pubs1, ieInterface, !trainUsingLabeled.value()); else nodes1 = computeNodes(trainFileArray1, ieInterface, !trainUsingLabeled.value()); } ArrayList nodes2 = null; ArrayList nodes3 = null; ArrayList pubs2 = null; ArrayList pubs3 = null; if (trainFI_2 != null) { ArrayList trainFileArray2 = trainFI_2.getFileArray(); pubs2 = new ArrayList (); System.out.println("Number of files 2: " + trainFileArray2.size()); if (useMultipleCRFs.value() == true) { if (useTreeModel.value()) nodes2 = computeNodesWPubs(trainFileArray2, pubs2, ieInterface2, !trainUsingLabeled.value()); else nodes2 = computeNodes(trainFileArray2, ieInterface2, !trainUsingLabeled.value()); } else { if (useTreeModel.value()) nodes2 = computeNodesWPubs(trainFileArray2, pubs2, ieInterface, !trainUsingLabeled.value()); else nodes2 = computeNodes(trainFileArray2, ieInterface, !trainUsingLabeled.value()); } } if 
(trainFI_3 != null) { ArrayList trainFileArray3 = trainFI_3.getFileArray(); pubs3 = new ArrayList(); System.out.println("Number of files 3: " + trainFileArray3.size()); //nodes3 = computeNodesWPubs(trainFileArray3, pubs3, ieInterface3); if (useMultipleCRFs.value() == true) { if (useTreeModel.value()) nodes3 = computeNodesWPubs(trainFileArray3, pubs3, ieInterface3, !trainUsingLabeled.value()); else nodes3 = computeNodes(trainFileArray3, ieInterface3, !trainUsingLabeled.value()); } else { if (useTreeModel.value()) nodes3 = computeNodesWPubs(trainFileArray3, pubs3, ieInterface, !trainUsingLabeled.value()); else nodes3 = computeNodes(trainFileArray3, ieInterface, !trainUsingLabeled.value()); } System.out.println(" There are " + nodes3.size() + " training nodes"); } FileIterator testFI = null; if (useCRF.value() == true) testFI = new FileIterator (testingDir.value(), new RegexFileFilter(Pattern.compile(".*"))); else testFI = new FileIterator (testingDir.value(), new RegexFileFilter(Pattern.compile(".*"))); ArrayList testFileArray = testFI.getFileArray(); ArrayList testPubList = new ArrayList(); ArrayList test_nodes; if (useMultipleCRFs.value() == true) { test_nodes = computeNodes(testFileArray,ieInterface4); } else { if (useTreeModel.value()) test_nodes = computeNodesWPubs(testFileArray, testPubList, ieInterface, useCRF.value()); else test_nodes = computeNodes(testFileArray, ieInterface, useCRF.value()); } //double testingCRFscore = scoreCitations (test_nodes); //ArrayList test_nodes = computeNodesWPubs(testFileArray, testPubList, ieInterface4); ArrayList allnodes = new ArrayList(); // all nodes, both training and // test allnodes.addAll(nodes1); if (nodes2 != null) allnodes.addAll(nodes2); if (nodes3 != null) allnodes.addAll(nodes3); //allnodes.addAll(test_nodes); //double trainingCRFscore = scoreCitations(allnodes); //System.out.println("CRF Score for Training Citations: " + trainingCRFscore); //System.out.println("CRF Score for Testing Citations: " + 
testingCRFscore); //make distance metrics makeDistMetric(allnodes); System.out.println("finished computing nodes, about to compute distanceMetric params "); // compute the string distance using SecondString utilities // this will serve as a useful feature // Possible extension (later): build different string metrics for // different fields - this will then be an array of them AbstractStatisticalTokenDistance distanceMetric = (AbstractStatisticalTokenDistance)computeDistanceMetric (allnodes); Pipe instancePipe = new SerialPipes (new Pipe[] { //new ExactFieldMatchPipe(Citation.corefFields), //new PageMatchPipe(), //new YearsWithinFivePipe(), //new FieldStringDistancePipe(new NeedlemanWunsch(), //Citation.corefFields, "EDIST"), //new FieldStringDistancePipe(softtfidf, Citation.corefFields, "softTFIDF"), //new FieldStringDistancePipe(triGramDistanceMetric, Citation.corefFields, "trigramTFIDF"), //new PlainFieldPipe (distanceMetric, distanceMetricEditDist), new GlobalPipe(distanceMetric), //new TitlePipe(distanceMetric), //new AuthorPipe(distanceMetric), //new JournalPipe(distanceMetric), //new PagesPipe(distanceMetric), //new HeuristicPipe(Citation.corefFields), //new InterFieldPipe(), //new HeuristicPipe(Citation.corefFields), //new DatePipe(distanceMetric), //new FuchunPipe(distanceMetricEditDist), new NodePair2FeatureVector (), new Target2Label (), }); InstanceList ilist = new InstanceList(); if (loadMEFile.value() == null) { InstanceList ilist1 = makePairs(instancePipe, nodes1); ilist.add(ilist1); if (nodes2 != null) { InstanceList ilist2 = makePairs(instancePipe, nodes2); ilist.add(ilist2); } if (nodes3 != null) { InstanceList ilist3 = makePairs(instancePipe, nodes3); ilist.add(ilist3); } } FeatureInducer fi = null; // try doing some feature induction now if (useFeatureInduction.value()) { RankedFeatureVector.Factory gainFactory = null; gainFactory = new InfoGain.Factory(); fi = new FeatureInducer (gainFactory, ilist, 10); fi.induceFeaturesFor(ilist, false, 
false); } TreeModel tmodel = null; if (useTreeModel.value()) { if (pubs2 != null && pubs3 != null) { tmodel = new TreeModel(instancePipe, nodes1, nodes2, nodes3, pubs1, pubs3, pubs3); } else { tmodel = new TreeModel(instancePipe, nodes1, pubs1); } //tmodel.setMultiTree (true); } //List pairsFromCanopy = Util.readPairsFromFile("/tmp/pairs"); //InstanceList ilistToCluster = makePairs(instancePipe, nodes, pairsFromCanopy); InstanceList itestlist = makePairs(instancePipe, test_nodes);
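/*
 * Code-viewer residue (not part of the Java source) - keyboard shortcuts:
 *   Copy code:          Ctrl + C
 *   Search code:        Ctrl + F
 *   Full-screen mode:   F11
 *   Toggle theme:       Ctrl + Shift + D
 *   Show shortcuts:     ?
 *   Increase font size: Ctrl + =
 *   Decrease font size: Ctrl + -
 */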