
📄 bencitationtuinoseg.java

📁 This is a Java implementation of MATLAB functionality. There is a lot of material inside; please take your time to study it.
💻 JAVA
📖 Page 1 of 3
			LineGroupIterator lineI = new LineGroupIterator(reader, Pattern.compile(SEPERATOR[1]), true);
			while (lineI.hasNext()) {
				String str = lineI.getLineGroup();
				Integer id = new Integer(index++);
				String label = fileID;
				// <meta reference_no="10" cluster_no="2"></meta>
				String start_tag = "<meta"; // intentionally left off the end tag, because of attributes:
				String end_tag = "</meta>";
				String s = SGMLStringOperation.locateField(start_tag, end_tag, str);
				String[] ss = s.split("\"");
				if (ss != null && ss.length == 5) {
					label = ss[3];
					label = label.intern(); // was "label.intern();" -- the result must be assigned for interning to take effect
					id = new Integer(ss[1]);
				}
				str = str.substring(str.indexOf(end_tag) + end_tag.length(), str.length());
				str = str.intern();
				//str = str.toLowerCase();
				Citation cit = null;
				if (useCRFLocal) {
					cit = new Citation(str, label, id.intValue(), ieInterface,
					                   numNBest.value(), nthViterbi.value());
				} else {
					cit = new Citation(str, label, id.intValue());
				}
				nodes.add(cit);
				Publication p = (Publication) hMap.get(label); // look up the publication this citation belongs to
				if (p != null) {
					p.addNewCitation(cit); // add citation to existing publication
				} else {
					p = new Publication(cit); // create a new publication with this citation
					hMap.put(label, p); // add publication to hash map
					publications.add(p);
				}
				lineI.nextLineGroup();
			}
		}
		long timeEnd = System.currentTimeMillis();
		double timeElapse = (timeEnd - timeStart) / 1000.0;
		System.out.println("Time elapsed: " + timeElapse + " seconds computing nodes.");
		return nodes;
	}

	public static StringDistance computeDistanceMetric(ArrayList nodes) {
		ArrayList allStrings = new ArrayList();
		StringDistance tfidf = new TFIDF();
		for (int i = 0; i < nodes.size(); i++) {
			//Citation c = (Citation)((Node)nodes.get(i)).getObject();
			Citation c = (Citation) nodes.get(i);
			allStrings.addAll(c.getAllStringsWrapped());
		}
		tfidf.accumulateStatistics(allStrings.iterator());
		return tfidf;
		//return new SoftTFIDF(tfidf);
	}

	private static void makeDistMetric(List list) {
		List nodes = new ArrayList();
		Iterator iter = list.iterator();
		while (iter.hasNext()) {
			nodes.add((Citation) iter.next());
		}
		NGramTokenizer nGramTokenizer =
			new NGramTokenizer(3, 3, false, new SimpleTokenizer(true, true));
		ArrayList allStrings = new ArrayList();
		tfidf = new TFIDF();
		//softtfidf = new SoftTFIDF(new JaroWinkler(), 0.8);
		triGramDistanceMetric = new TFIDF(nGramTokenizer);
		for (int i = 0; i < nodes.size(); i++) {
			Citation c = (Citation) nodes.get(i);
			allStrings.addAll(c.getAllStringsWrapped());
		}
		tfidf.accumulateStatistics(allStrings.iterator());
		triGramDistanceMetric.accumulateStatistics(allStrings.iterator());
		//softtfidf.accumulateStatistics(allStrings.iterator());
	}

	public static void main(String[] args) throws FileNotFoundException {
		commandOptions.process(args);
		String[] startTags = new String[]{"<author>"};
		String[] endTags = new String[]{"</author>"};
		boolean oldCluster = false;
		boolean newCluster = true;
		if (useCRF.value() == true) {
			if (useMultipleCRFs.value() == true) {
				System.out.println("Initializing CRF");
				File crfFile1 = new File(crfInputFile1.value());
				ieInterface1 = new IEInterface(crfFile1);
				ieInterface1.loadCRF(crfFile1);
				File crfFile2 = new File(crfInputFile2.value());
				ieInterface2 = new IEInterface(crfFile2);
				ieInterface2.loadCRF(crfFile2);
				File crfFile3 = new File(crfInputFile3.value());
				ieInterface3 = new IEInterface(crfFile3);
				ieInterface3.loadCRF(crfFile3);
				File crfFile4 = new File(crfInputFile4.value());
				ieInterface4 = new IEInterface(crfFile4);
				ieInterface4.loadCRF(crfFile4);
			} else {
				File crfFile = new File(crfInputFile.value());
				ieInterface = new IEInterface(crfFile);
				ieInterface.loadCRF(crfFile);
			}
		}
		if (useNBest.value() == true) {
			System.out.println("Using n-best CRF");
		}
		FileIterator trainFI_1 = null;
		FileIterator trainFI_2 = null;
		FileIterator trainFI_3 = null;
		if (useCRF.value() == true) {
			trainFI_1 = new FileIterator(trainingDir1.value(), new RegexFileFilter(Pattern.compile(".*")));
			if (trainingDir2.value() != null)
				trainFI_2 = new FileIterator(trainingDir2.value(), new RegexFileFilter(Pattern.compile(".*")));
			if (trainingDir3.value() != null)
				trainFI_3 = new FileIterator(trainingDir3.value(), new RegexFileFilter(Pattern.compile(".*")));
		} else {
			// note: this branch is currently identical to the branch above
			trainFI_1 = new FileIterator(trainingDir1.value(), new RegexFileFilter(Pattern.compile(".*")));
			if (trainingDir2.value() != null)
				trainFI_2 = new FileIterator(trainingDir2.value(), new RegexFileFilter(Pattern.compile(".*")));
			if (trainingDir3.value() != null)
				trainFI_3 = new FileIterator(trainingDir3.value(), new RegexFileFilter(Pattern.compile(".*")));
		}
		ArrayList trainFileArray1 = trainFI_1.getFileArray();
		ArrayList pubs1 = new ArrayList();
		System.out.println("Number of files 1: " + trainFileArray1.size());
		//ArrayList nodes1 = computeNodesWPubs(trainFileArray1, pubs1, ieInterface1);
		ArrayList nodes1;
		if (useMultipleCRFs.value() == true) {
			if (useTreeModel.value())
				nodes1 = computeNodesWPubs(trainFileArray1, pubs1, ieInterface1, !trainUsingLabeled.value());
			else
				nodes1 = computeNodes(trainFileArray1, ieInterface1, !trainUsingLabeled.value());
		} else {
			if (useTreeModel.value())
				nodes1 = computeNodesWPubs(trainFileArray1, pubs1, ieInterface, !trainUsingLabeled.value());
			else
				nodes1 = computeNodes(trainFileArray1, ieInterface, !trainUsingLabeled.value());
		}
		ArrayList nodes2 = null;
		ArrayList nodes3 = null;
		ArrayList pubs2 = null;
		ArrayList pubs3 = null;
		if (trainFI_2 != null) {
			ArrayList trainFileArray2 = trainFI_2.getFileArray();
			pubs2 = new ArrayList();
			System.out.println("Number of files 2: " + trainFileArray2.size());
			if (useMultipleCRFs.value() == true) {
				if (useTreeModel.value())
					nodes2 = computeNodesWPubs(trainFileArray2, pubs2, ieInterface2, !trainUsingLabeled.value());
				else
					nodes2 = computeNodes(trainFileArray2, ieInterface2, !trainUsingLabeled.value());
			} else {
				if (useTreeModel.value())
					nodes2 = computeNodesWPubs(trainFileArray2, pubs2, ieInterface, !trainUsingLabeled.value());
				else
					nodes2 = computeNodes(trainFileArray2, ieInterface, !trainUsingLabeled.value());
			}
		}
		if (trainFI_3 != null) {
			ArrayList trainFileArray3 = trainFI_3.getFileArray();
			pubs3 = new ArrayList();
			System.out.println("Number of files 3: " + trainFileArray3.size());
			//nodes3 = computeNodesWPubs(trainFileArray3, pubs3, ieInterface3);
			if (useMultipleCRFs.value() == true) {
				if (useTreeModel.value())
					nodes3 = computeNodesWPubs(trainFileArray3, pubs3, ieInterface3, !trainUsingLabeled.value());
				else
					nodes3 = computeNodes(trainFileArray3, ieInterface3, !trainUsingLabeled.value());
			} else {
				if (useTreeModel.value())
					nodes3 = computeNodesWPubs(trainFileArray3, pubs3, ieInterface, !trainUsingLabeled.value());
				else
					nodes3 = computeNodes(trainFileArray3, ieInterface, !trainUsingLabeled.value());
			}
			System.out.println(" There are " + nodes3.size() + " training nodes");
		}
		FileIterator testFI = null;
		if (useCRF.value() == true)
			testFI = new FileIterator(testingDir.value(), new RegexFileFilter(Pattern.compile(".*")));
		else
			// note: both branches currently construct the same iterator
			testFI = new FileIterator(testingDir.value(), new RegexFileFilter(Pattern.compile(".*")));
		ArrayList testFileArray = testFI.getFileArray();
		ArrayList testPubList = new ArrayList();
		ArrayList test_nodes;
		if (useMultipleCRFs.value() == true) {
			test_nodes = computeNodes(testFileArray, ieInterface4);
		} else {
			if (useTreeModel.value())
				test_nodes = computeNodesWPubs(testFileArray, testPubList, ieInterface, useCRF.value());
			else
				test_nodes = computeNodes(testFileArray, ieInterface, useCRF.value());
		}
		//double testingCRFscore = scoreCitations (test_nodes);
		//ArrayList test_nodes = computeNodesWPubs(testFileArray, testPubList, ieInterface4);
		ArrayList allnodes = new ArrayList(); // all nodes, both training and test
		allnodes.addAll(nodes1);
		if (nodes2 != null)
			allnodes.addAll(nodes2);
		if (nodes3 != null)
			allnodes.addAll(nodes3);
		//allnodes.addAll(test_nodes);
		//double trainingCRFscore = scoreCitations(allnodes);
		//System.out.println("CRF Score for Training Citations: " + trainingCRFscore);
		//System.out.println("CRF Score for Testing Citations: " + testingCRFscore);

		// make distance metrics
		makeDistMetric(allnodes);
		System.out.println("finished computing nodes, about to compute distanceMetric params ");

		// compute the string distance using SecondString utilities
		// this will serve as a useful feature
		// Possible extension (later): build different string metrics for
		// different fields - this will then be an array of them
		AbstractStatisticalTokenDistance distanceMetric =
			(AbstractStatisticalTokenDistance) computeDistanceMetric(allnodes);

		Pipe instancePipe = new SerialPipes(new Pipe[] {
			//new ExactFieldMatchPipe(Citation.corefFields),
			//new PageMatchPipe(),
			//new YearsWithinFivePipe(),
			//new FieldStringDistancePipe(new NeedlemanWunsch(), Citation.corefFields, "EDIST"),
			//new FieldStringDistancePipe(softtfidf, Citation.corefFields, "softTFIDF"),
			//new FieldStringDistancePipe(triGramDistanceMetric, Citation.corefFields, "trigramTFIDF"),
			//new PlainFieldPipe (distanceMetric, distanceMetricEditDist),
			new GlobalPipe(distanceMetric),
			//new TitlePipe(distanceMetric),
			//new AuthorPipe(distanceMetric),
			//new JournalPipe(distanceMetric),
			//new PagesPipe(distanceMetric),
			//new HeuristicPipe(Citation.corefFields),
			//new InterFieldPipe(),
			//new HeuristicPipe(Citation.corefFields),
			//new DatePipe(distanceMetric),
			//new FuchunPipe(distanceMetricEditDist),
			new NodePair2FeatureVector(),
			new Target2Label(),
		});

		InstanceList ilist = new InstanceList();
		if (loadMEFile.value() == null) {
			InstanceList ilist1 = makePairs(instancePipe, nodes1);
			ilist.add(ilist1);
			if (nodes2 != null) {
				InstanceList ilist2 = makePairs(instancePipe, nodes2);
				ilist.add(ilist2);
			}
			if (nodes3 != null) {
				InstanceList ilist3 = makePairs(instancePipe, nodes3);
				ilist.add(ilist3);
			}
		}

		FeatureInducer fi = null;
		// try doing some feature induction now
		if (useFeatureInduction.value()) {
			RankedFeatureVector.Factory gainFactory = null;
			gainFactory = new InfoGain.Factory();
			fi = new FeatureInducer(gainFactory, ilist, 10);
			fi.induceFeaturesFor(ilist, false, false);
		}

		TreeModel tmodel = null;
		if (useTreeModel.value()) {
			if (pubs2 != null && pubs3 != null) {
				tmodel = new TreeModel(instancePipe, nodes1, nodes2, nodes3,
				                       pubs1, pubs3, pubs3); // note: pubs2 is never passed here; possibly intended as (pubs1, pubs2, pubs3)
			} else {
				tmodel = new TreeModel(instancePipe, nodes1, pubs1);
			}
			//tmodel.setMultiTree (true);
		}
		//List pairsFromCanopy = Util.readPairsFromFile("/tmp/pairs");
		//InstanceList ilistToCluster = makePairs(instancePipe, nodes, pairsFromCanopy);
		InstanceList itestlist = makePairs(instancePipe, test_nodes);
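
A minimal usage sketch (not part of the original listing): it shows how the TF-IDF metric returned by computeDistanceMetric above could be used to score a pair of citation strings. It assumes the Citation class and the SecondString StringDistance/TFIDF classes referenced in the listing are on the classpath, that StringDistance exposes a score(String, String) convenience method, and the citation strings, labels, and ids below are made up for illustration.

	// Hedged sketch -- not part of the listing. Citation and computeDistanceMetric come
	// from the code above; the example data is hypothetical, and score(String, String)
	// is assumed to be available on the SecondString StringDistance type.
	public static void demoDistanceMetric() {
		ArrayList nodes = new ArrayList();
		// two variants of the same (made-up) reference, sharing the label "paper_1"
		nodes.add(new Citation("A. Smith. Learning to cluster citations. ICML, 1999.", "paper_1", 0));
		nodes.add(new Citation("Smith, A. Learning to Cluster Citations. In Proc. ICML, 1999.", "paper_1", 1));

		// accumulate TF-IDF statistics over all wrapped field strings of the citations
		StringDistance metric = computeDistanceMetric(nodes);

		// score a candidate pair of strings; a higher score means the strings are more similar
		double sim = metric.score("Learning to cluster citations", "Learning to Cluster Citations");
		System.out.println("TF-IDF similarity: " + sim);
	}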
