conditionalclusterer.java

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Java 代码 · 共 452 行 · 第 1/2 页
JAVA
452 行
/* Copyright (C) 2002 Dept. of Computer Science, Univ. of Massachusetts, Amherst   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).   http://www.cs.umass.edu/~mccallum/mallet   This program toolkit free software; you can redistribute it and/or   modify it under the terms of the GNU General Public License as   published by the Free Software Foundation; either version 2 of the   License, or (at your option) any later version.   This program is distributed in the hope that it will be useful, but   WITHOUT ANY WARRANTY; without even the implied warranty of   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  For more   details see the GNU General Public License and the file README-LEGAL.   You should have received a copy of the GNU General Public License   along with this program; if not, write to the Free Software   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA   02111-1307, USA. *//**	 @author Aron Culotta */package edu.umass.cs.mallet.projects.seg_plus_coref.condclust.cluster;import edu.umass.cs.mallet.projects.seg_plus_coref.coreference.*;import edu.umass.cs.mallet.projects.seg_plus_coref.condclust.types.*;import edu.umass.cs.mallet.base.cluster.*;import edu.umass.cs.mallet.base.types.*;import edu.umass.cs.mallet.base.classify.*;import edu.umass.cs.mallet.base.pipe.*;import java.util.*;/** Trains the conditional clusterer to predict "yes" or "no" for a * NodeClusterPair; i.e. does this nodes belong in this cluster?*/public class ConditionalClusterer {	Classifier classifier;	Pipe pipe;	double threshold;	/**  cache previous similarity calculations to save time */	LRUCache simCache;  	/** measure cache performance */	int cacheAccesses;	int cacheHits;	/** cache clusters already evaluated */	HashMap evalCache;		private static double MIN = Double.NEGATIVE_INFINITY;	private static double MAX = Double.POSITIVE_INFINITY;		public ConditionalClusterer (Pipe _pipe, Classifier _classifier, double _threshold) {		this.pipe = _pipe;		this.classifier = _classifier;		this.threshold = _threshold;		this.simCache = new LRUCache(10000, 100);		this.cacheAccesses = 0;		this.cacheHits = 0;		this.evalCache = new HashMap ();	}	public ConditionalClusterer (Pipe _pipe, Classifier _classifier) {		this (_pipe, _classifier, 0.0);	}	public Classifier getClassifier () {return this.classifier;}		/** Cluster papers and venues jointly. */	public Collection clusterPapersAndVenues (ArrayList _papers, ArrayList _venues,																						Collection paperTrueClustering, Collection venueTrueClustering,																						Classifier paperClusterClassifier, Classifier venueClusterClassifier,																						Random r) {		ArrayList papers = (ArrayList)_papers.clone();		ArrayList venues = (ArrayList)_venues.clone();		Collection paperClustering = new ArrayList ();		Collection venueClustering = new ArrayList ();		HashMap paper2Venue = getPaper2VenueHash (papers, venues);		while (papers.size() > 0) {			int index = r.nextInt (papers.size());			PaperCitation paper =  (PaperCitation) papers.get(index);			VenueCitation venue = (VenueCitation) paper2Venue.get (paper);			if (venue == null) {				placePaperInBestClusterWithoutVenue (paper, paperClustering, venueClustering, paperClusterClassifier,																						 paper2Venue);				papers.remove (index);				continue;			}			Collection closestPaperCluster = null;			Collection closestVenueCluster = null;			double maxVal = this.MIN;			Iterator paperClusterIterator = paperClustering.iterator();			while (paperClusterIterator.hasNext()) {				Collection paperCluster = (Collection) paperClusterIterator.next();				Collection venueCluster = getVenueClusterForPaperCluster (paperCluster, venueClustering, paper2Venue);				if (venueCluster == null) // this paper cluster has no venue					continue;				VenuePaperCluster vpc = new VenuePaperCluster (paper, venue, paperCluster, venueCluster);				Instance inst = new Instance (vpc, "unknown", vpc, classifier.getInstancePipe());				Labeling labeling = classifier.classify (inst).getLabeling();				double val = (labeling.labelAtLocation(0).equals("yes")) ?										 (labeling.valueAtLocation(0) - labeling.valueAtLocation(1)) :										 (labeling.valueAtLocation(1) - labeling.valueAtLocation(0));				if (val > maxVal) {					closestPaperCluster = paperCluster;					closestVenueCluster = venueCluster;					maxVal = val;				}			}			// if classifier says "no" to all cluster pairs, add the paper			// to a new cluster, and the venue to the closest cluster (or			// its own cluster)			if (closestPaperCluster == null || maxVal < threshold) {				createSoloCluster (paper, paperClustering);				placeNodeInClosestCluster (venue, venueClustering, venueClusterClassifier);				papers.remove (index);			}			else {				closestPaperCluster.add (paper);				closestVenueCluster.add (venue);				papers.remove (index);			}		}		paperClustering.addAll (venueClustering);		return paperClustering;	}	private void placePaperInBestClusterWithoutVenue (PaperCitation paper, Collection paperClustering,																										Collection venueClustering,																										Classifier paperClassifier, HashMap paper2Venue) {		Iterator iter = paperClustering.iterator();		Collection closestCluster = null;		double closestValue = -9999999.9;				while (iter.hasNext()) {			Collection cluster = (Collection)iter.next();			if (getVenueClusterForPaperCluster (cluster, venueClustering, paper2Venue) == null)				continue;			double val = getSimilarityToCluster (paper, cluster, paperClassifier);			if (val > closestValue) {				closestValue = val;				closestCluster = cluster;			}		}		if (closestCluster != null && closestValue > threshold) { // add to existing cluster			System.err.println ("Adding node to preexisting cluster with value " + closestValue);			closestCluster.add (paper);		}		else { // create separate cluster			Collection newC = new LinkedHashSet ();			newC.add (paper);			paperClustering.add (newC);		}	}		private void createSoloCluster (Object node, Collection clustering) {		Collection cl = new LinkedHashSet ();		cl.add (node);		clustering.add (cl);	}		private Collection getVenueClusterForPaperCluster (Collection paperCluster, Collection venueClustering,																										 HashMap paper2Venue) {		Iterator iter = paperCluster.iterator();		PaperCitation paper = (PaperCitation) iter.next();		VenueCitation venue = (VenueCitation) paper2Venue.get (paper);		if (venue == null)			return null;		Collection venueCluster = findClusterForNode (venue, venueClustering);		if (venueCluster == null)			throw new IllegalArgumentException ("Expected to find cluster for venue, but didn't: " + venue);		return venueCluster;	}	private Collection findClusterForNode (Object n, Collection clustering) {		Iterator iter = clustering.iterator ();		while (iter.hasNext()) {			Collection c = (Collection) iter.next();			if (c.contains (n))				return c;		}		return null;	}	private HashMap getPaper2VenueHash (ArrayList papers, ArrayList venues) {		HashMap hash = new HashMap ();		for (int i=0; i < papers.size(); i++) {			PaperCitation paper = (PaperCitation) papers.get (i);			Object venue = findVenueForPaper (paper, venues);			if (venue != null)				hash.put (paper, venue);		}		return hash;	}	private Object findVenueForPaper (PaperCitation paper, ArrayList venues) {		String venueID = paper.getField (Citation.venueID);		if (venueID == "")			return null;		for (int i=0; i < venues.size(); i++) {			VenueCitation venue = (VenueCitation) venues.get(i);			String currVenueID = venue.getField (Citation.venueID);			if (venueID.equals (currVenueID))				return venue;		}		throw new IllegalArgumentException ("Can't find venue for paper " + paper);	}			/** Greedily cluster by adding node to Clustering that is (a)	 * closest to an existing cluster, or (b) farthest from all existing	 * clusters (i.e. closest to being a new cluster) */	public Collection cluster (ArrayList _nodes, Collection trueClustering) {		ArrayList nodes = (ArrayList)_nodes.clone();		Collection clustering = new ArrayList ();		this.evalCache = new HashMap ();		this.simCache = new LRUCache (10000, 100);		// initialize with one singleton cluster, randomly chosen for now,		// but could imagine picking "easiest" node (most similar, most		// dissimilar to others?)		Random r =  new Random(1);		int index = r.nextInt (nodes.size());
conditionalclusterer.java - 源码说明

本页面展示了「mallet是自然语言处理、机器学习领域的一个开源项目。」中的 conditionalclusterer.java 源码文件，采用 Java 编程语言编写，共 452 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与mallet相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?