// mpckmeans.java
          calculateObjectiveFunction(true);
          System.out.println((float)m_Objective + " - Objective function after metric update");
          System.out.println("\tvar=" + ((float)m_objVariance)
                             + "\tC=" + ((float)m_objCannotLinks)
                             + "\tM=" + ((float)m_objMustLinks)
                             + "\tLOG=" + ((float)m_objNormalizer)
                             + "\tREG=" + ((float)m_objRegularizer));
        }
        if (m_ConstraintsHash.size() > 0) {
          m_maxCLPenalties = calculateMaxCLPenalties();
        }
      }
      if (fincoh != null) {
        printConstraintIncoherence(fincoh);
      }
      converged = convergenceCheck(m_OldObjective, m_Objective);
      m_Iterations++;
    } // end of the main iteration loop (convergence is checked above)

    if (fincoh != null) {
      fincoh.close();
    }
    System.out.println("Converged!");
    System.err.print("Its\t" + m_Iterations + "\t");
    if (m_verbose) {
      System.out.println("Done clustering; top cluster features: ");
      for (int i = 0; i < m_NumClusters; i++) {
        System.out.println("Centroid " + i);
        // map sorted in descending order of attribute value
        TreeMap map = new TreeMap(Collections.reverseOrder());
        Instance centroid = m_ClusterCentroids.instance(i);
        for (int j = 0; j < centroid.numValues(); j++) {
          Attribute attr = centroid.attributeSparse(j);
          map.put(new Double(centroid.value(attr)), attr.name());
        }
        // print the five attributes with the largest centroid values
        Iterator it = map.entrySet().iterator();
        for (int j = 0; j < 5 && it.hasNext(); j++) {
          Map.Entry entry = (Map.Entry) it.next();
          System.out.println("\t" + entry.getKey() + "\t" + entry.getValue());
        }
      }
    }
  }

  /**
   * Prints the incoherence of the current constraint set: the fraction of
   * (must-link, cannot-link) constraint pairs whose distances are out of
   * order under the current metric.
   *
   * @param fincoh stream to print to; if null, prints a summary to stdout
   * @exception Exception if distances could not be computed
   */
  public void printConstraintIncoherence(PrintStream fincoh) throws Exception {
    Object[] array = m_ConstraintsHash.entrySet().toArray();
    int numML = 0, numCL = 0;
    double incoh = 0;
    m_numViolations = 0;
    System.out.println("NumConstraints: " + array.length);
    for (int i = 0; i < array.length; i++) {
      Map.Entry con1 = (Map.Entry) array[i];
      InstancePair pair1 = (InstancePair) con1.getKey();
      int link1 = ((Integer) con1.getValue()).intValue();
      double dist1 = m_metric.distance(m_Instances.instance(pair1.first),
                                       m_Instances.instance(pair1.second));
      if (link1 == InstancePair.MUST_LINK) {
        numML++;
      } else if (link1 == InstancePair.CANNOT_LINK) {
        numCL++;
      }
      for (int j = i + 1; j < array.length; j++) {
        Map.Entry con2 = (Map.Entry) array[j];
        InstancePair pair2 = (InstancePair) con2.getKey();
        int link2 = ((Integer) con2.getValue()).intValue();
        double dist2 = m_metric.distance(m_Instances.instance(pair2.first),
                                         m_Instances.instance(pair2.second));
        // a (ML, CL) pair is violated if the must-linked points are further
        // apart than the cannot-linked points
        if (link1 == InstancePair.MUST_LINK) {
          if (link2 == InstancePair.CANNOT_LINK && dist1 > dist2) {
            m_numViolations++;
          }
        } else if (link1 == InstancePair.CANNOT_LINK) {
          if (link2 == InstancePair.MUST_LINK && dist1 < dist2) {
            m_numViolations++;
          }
        }
      }
    }
    // guard against division by zero when either constraint type is absent
    incoh = (numML > 0 && numCL > 0)
      ? (m_numViolations * 1.0) / (numCL * numML) : 0;
    if (fincoh != null) {
      fincoh.println("Iterations\t" + (m_Iterations+1) + "\tIncoh\t" + incoh);
    } else {
      System.out.println((m_Iterations+1) + "\tNumViolations\t" + m_numViolations
                         + "\tNumTotalCL\t" + numCL + "\tNumTotalML\t" + numML);
    }
  }
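  /* Worked example (added for illustration; not part of the original source):
   * since every (must-link, cannot-link) pair is visited exactly once by the
   * i < j double loop above, the incoherence printed is
   *
   *   incoherence = #{ (m, c) : dist(m) > dist(c) } / (numML * numCL)
   *
   * E.g. with must-link distances {0.2, 0.9} and cannot-link distance {0.5},
   * only the pair (0.9, 0.5) is out of order, so incoherence = 1/(2*1) = 0.5.
   */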
  /** Reset the value of the objective function and all of its components */
  public void resetObjective() {
    m_Objective = 0;
    m_objVariance = 0;
    m_objCannotLinks = 0;
    m_objMustLinks = 0;
    m_objNormalizer = 0;
    m_objRegularizer = 0;
  }
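  /* Sketch (added for illustration; not part of the original source): the
   * maxima returned by calculateMaxCLPenalties() below are typically used to
   * turn a violated cannot-link pair (xi, xj) assigned to cluster c into a
   * penalty that is large when the pair is close and small when it is far
   * apart. Assuming the m_metrics and m_maxCLPenalties fields of this class,
   * a minimal version of that use (cannotLinkPenalty is hypothetical):
   *
   *   double cannotLinkPenalty(Instance xi, Instance xj, int c, double w)
   *       throws Exception {
   *     // w is the constraint weight
   *     double d = m_metrics[c].penaltySymmetric(xi, xj);
   *     return w * Math.max(0, m_maxCLPenalties[c] - d);
   *   }
   */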
  /**
   * Go through the cannot-link constraints and find the current maximum
   * distance.
   *
   * @return an array of maximum weighted distances; if a single metric is
   * used, the maximum distance is calculated over the entire dataset
   */
  // TODO: the non-datasetWide case is currently not debugged!
  protected double[] calculateMaxCLPenalties() throws Exception {
    double[] maxPenalties = new double[m_NumClusters];
    double[][] minValues = null;
    double[][] maxValues = null;
    int[] attrIdxs = null;

    m_maxCLPoints = new Instance[m_NumClusters][2];
    m_maxCLDiffInstances = new Instance[m_NumClusters];
    for (int i = 0; i < m_NumClusters; i++) {
      m_maxCLPoints[i][0] = new Instance(m_Instances.numAttributes());
      m_maxCLPoints[i][1] = new Instance(m_Instances.numAttributes());
      m_maxCLPoints[i][0].setDataset(m_Instances);
      m_maxCLPoints[i][1].setDataset(m_Instances);
      m_maxCLDiffInstances[i] = new Instance(m_Instances.numAttributes());
      m_maxCLDiffInstances[i].setDataset(m_Instances);
    }

    // TEMPORARY PLUG: this was supposed to take care of WeightedDotp,
    // but it turns out that with weighting similarity can be > 1:
    // if (m_metric.m_fixedMaxDistance) {
    //   for (int i = 0; i < m_NumClusters; i++) {
    //     maxPenalties[i] = m_metric.getMaxDistance();
    //   }
    //   return maxPenalties;
    // }

    // extrema are tracked relative to the zero-initialized arrays
    minValues = new double[m_NumClusters][m_metrics[0].getNumAttributes()];
    maxValues = new double[m_NumClusters][m_metrics[0].getNumAttributes()];
    attrIdxs = m_metrics[0].getAttrIndxs();

    // Temporary plug: on the first iteration, when no instances have been
    // assigned to clusters yet, dataset-wide (not cluster-wide!) minima and
    // maxima are used even in the case with multiple metrics
    boolean datasetWide = true;
    if (m_useMultipleMetrics && m_Iterations > 0) {
      datasetWide = false;
    }

    // TODO: Mahalanobis - check with getMaxPoints
    if (m_metric instanceof WeightedMahalanobis) {
      if (m_useMultipleMetrics) {
        for (int i = 0; i < m_metrics.length; i++) {
          double[][] maxPoints =
            ((WeightedMahalanobis)m_metrics[i]).getMaxPoints(m_ConstraintsHash, m_Instances);
          minValues[i] = maxPoints[0];
          maxValues[i] = maxPoints[1];
        }
      } else {
        // a single metric: all clusters share the same extreme points
        double[][] maxPoints =
          ((WeightedMahalanobis)m_metric).getMaxPoints(m_ConstraintsHash, m_Instances);
        for (int i = 0; i < m_metrics.length; i++) {
          minValues[i] = maxPoints[0];
          maxValues[i] = maxPoints[1];
        }
      }
    } else {
      // find the enclosing hypercube for WeightedEuclidean etc.
      // by going through all points
      for (int i = 0; i < m_Instances.numInstances(); i++) {
        Instance instance = m_Instances.instance(i);
        for (int j = 0; j < attrIdxs.length; j++) {
          double val = instance.value(attrIdxs[j]);
          if (datasetWide) {
            if (val < minValues[0][j]) { minValues[0][j] = val; }
            if (val > maxValues[0][j]) { maxValues[0][j] = val; }
          } else {
            // cluster-specific minima and maxima are needed
            int cluster = m_ClusterAssignments[i];
            if (val < minValues[cluster][j]) { minValues[cluster][j] = val; }
            if (val > maxValues[cluster][j]) { maxValues[cluster][j] = val; }
          }
        }
      }
    }

    // fill in the extreme points
    if (datasetWide) {
      for (int i = 0; i < attrIdxs.length; i++) {
        m_maxCLPoints[0][0].setValue(attrIdxs[i], minValues[0][i]);
        m_maxCLPoints[0][1].setValue(attrIdxs[i], maxValues[0][i]);
      }
      // must copy these over all clusters - just for the first iteration
      for (int j = 1; j < m_NumClusters; j++) {
        for (int i = 0; i < attrIdxs.length; i++) {
          m_maxCLPoints[j][0].setValue(attrIdxs[i], minValues[0][i]);
          m_maxCLPoints[j][1].setValue(attrIdxs[i], maxValues[0][i]);
        }
      }
    } else { // cluster-specific
      for (int j = 0; j < m_NumClusters; j++) {
        for (int i = 0; i < attrIdxs.length; i++) {
          m_maxCLPoints[j][0].setValue(attrIdxs[i], minValues[j][i]);
          m_maxCLPoints[j][1].setValue(attrIdxs[i], maxValues[j][i]);
        }
      }
    }

    // calculate the maximum penalties from the extreme points
    if (datasetWide) {
      maxPenalties[0] = m_metrics[0].penaltySymmetric(m_maxCLPoints[0][0],
                                                      m_maxCLPoints[0][1]);
      m_maxCLDiffInstances[0] = m_metrics[0].createDiffInstance(m_maxCLPoints[0][0],
                                                                m_maxCLPoints[0][1]);
      for (int i = 1; i < maxPenalties.length; i++) {
        maxPenalties[i] = maxPenalties[0];
        m_maxCLDiffInstances[i] = m_maxCLDiffInstances[0];
      }
    } else {
      // cluster-specific: each cluster's penalty comes from its own metric
      for (int j = 0; j < m_NumClusters; j++) {
        maxPenalties[j] = m_metrics[j].penaltySymmetric(m_maxCLPoints[j][0],
                                                        m_maxCLPoints[j][1]);
        m_maxCLDiffInstances[j] = m_metrics[j].createDiffInstance(m_maxCLPoints[j][0],
                                                                  m_maxCLPoints[j][1]);
      }
    }
    System.out.println("Recomputed max CL penalties");
    return maxPenalties;
  }

  /**
   * Checks if the instance has to be normalized and classifies the
   * instance using the current clustering.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an integer if the class
   * is enumerated, otherwise the predicted value
   * @exception Exception if the instance could not be classified successfully
   */
  public int clusterInstance(Instance instance) throws Exception {
    return assignInstanceToCluster(instance);
  }
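  /* Note (added for illustration; not part of the original source): the
   * lookup below is a transductive trick. Each training instance is hashed
   * by the checksum sum_i c_i * x_i over its non-class attributes, where c_i
   * are the coefficients stored in m_checksumCoeffs; instances that collide
   * on the checksum are then compared attribute by attribute. A minimal
   * standalone sketch of the hashing step:
   *
   *   double checksum(double[] values, double[] coeffs, int classIdx) {
   *     double sum = 0;
   *     for (int i = 0; i < values.length; i++) {
   *       if (i != classIdx) { sum += coeffs[i] * values[i]; }
   *     }
   *     return sum; // used as a hash key, rounded to float precision below
   *   }
   */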
  /**
   * Looks up the instance in the checksum hash, assuming transductive
   * clustering.
   *
   * @param instance instance to be looked up
   * @return the index of the cluster to which the instance was assigned,
   * or -1 if the instance has not been clustered
   */
  protected int lookupInstanceCluster(Instance instance) throws Exception {
    int classIdx = instance.classIndex();
    double checksum = 0;
    // need to normalize using the original metric,
    // since the cluster data was normalized the same way
    if (m_metric.doesNormalizeData()) {
      if (m_Trainable == TRAINING_INTERNAL) {
        m_metric.resetMetric();
      }
      m_metric.normalizeInstanceWeighted(instance);
    }
    double[] values1 = instance.toDoubleArray();
    for (int i = 0; i < values1.length; i++) {
      if (i != classIdx) {
        checksum += m_checksumCoeffs[i] * values1[i];
      }
    }
    Object list = m_checksumHash.get(new Double((float)checksum));
    if (list != null) {
      // go through the list of instances with the same checksum
      // and find one that is equivalent
      ArrayList checksumList = (ArrayList) list;
      for (int i = 0; i < checksumList.size(); i++) {
        int instanceIdx = ((Integer) checksumList.get(i)).intValue();
        Instance listInstance = m_Instances.instance(instanceIdx);
        double[] values2 = listInstance.toDoubleArray();
        boolean equal = true;
        for (int j = 0; j < values1.length && equal; j++) {
          if (j != classIdx && (float)values1[j] != (float)values2[j]) {
            equal = false;
          }
        }
        if (equal) {
          return m_ClusterAssignments[instanceIdx];
        }
      }
    }
    return -1;
  }

  /**
   * Classifies all instances using the current clustering, moving
   * must-linked points together (Xing's approach).
   *
   * @return the number of instances that moved to a different cluster
   * @exception Exception if an instance could not be classified successfully
   */
  public int assignAllInstancesToClusters() throws Exception {
    int numInstances = m_Instances.numInstances();
    boolean[] instanceAlreadyAssigned = new boolean[numInstances];
    int moved = 0;
    if (!m_isOfflineMetric) {
      System.err.println("WARNING!!!\n\nThis code should not be called if the"
                         + " metric is not a BarHillelMetric or XingMetric!!!\n\n");
    }
    for (int i = 0; i < numInstances; i++) {
      instanceAlreadyAssigned[i] = false;
    }
    // now process the points that are not in must-link neighborhood sets
    for (int instIdx = 0; instIdx < numInstances; instIdx++) {
      if (instanceAlreadyAssigned[instIdx]) {
        continue; // was already in some ML neighborhood
      }
      // find the nearest centroid
      int bestCluster = 0;
      double bestDistance = Double.POSITIVE_INFINITY;
      for (int centroidIdx = 0; centroidIdx < m_NumClusters; centroidIdx++) {
        double sqDistance = m_metric.distance(m_Instances.instance(instIdx),
                                              m_ClusterCentroids.instance(centroidIdx));
        if (sqDistance < bestDistance) {
          bestDistance = sqDistance;
          bestCluster = centroidIdx;
        }
      }
      // accumulate objective function value
      // m_Objective += bestDistance;
      // do we need to reassign the point?
      if (m_ClusterAssignments[instIdx] != bestCluster) {
        m_ClusterAssignments[instIdx] = bestCluster;
        instanceAlreadyAssigned[instIdx] = true;
        moved++;
      }
    }
    return moved;
  }

  /**
   * Classifies the instance using the current clustering, without
   * considering constraints.
   *
   * @param instance the instance to be assigned to a cluster
   * @return the number of the assigned cluster as an integer if the class
   * is enumerated, otherwise the predicted value
   * @exception Exception if the instance could not be classified successfully
   */
  public int assignInstanceToCluster(Instance instance) throws Exception {
    int lookupCluster;
    // convert the instance if the metric requires it, then look it up
    if (m_metric instanceof InstanceConverter) {
      Instance newInstance = ((InstanceConverter)m_metric).convertInstance(instance);
      lookupCluster = lookupInstanceCluster(newInstance);
    } else {
      lookupCluster = lookupInstanceCluster(instance);
    }
    if (lookupCluster >= 0) {
      return lookupCluster;
    }
    throw new Exception("ACHTUNG!!!\n\nCouldn't look up the instance!!!"
                        + " Size of hash = " + m_checksumHash.size());
  }
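/* Usage sketch (added for illustration; the class and variable names below
 * are assumptions about the surrounding code, only clusterInstance() is
 * taken from the methods above):
 *
 *   MPCKMeans clusterer = new MPCKMeans();
 *   // ... configure the metric, the number of clusters, and the pairwise
 *   // constraints, then build the clusterer on the training data ...
 *   for (int i = 0; i < data.numInstances(); i++) {
 *     int cluster = clusterer.clusterInstance(data.instance(i));
 *   }
 *
 * Note that clusterInstance() is transductive: assignInstanceToCluster()
 * only succeeds for instances that were present during training (found via
 * the checksum hash) and throws an Exception otherwise.
 */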