hac.java
      default:
        throw new Exception("Unknown linkage type!");
    }
    return distance;
  }

  /**
   * Set the verbosity level of the clusterer
   * @param verbose messages on (true) or off (false)
   */
  public void setVerbose(boolean verbose) {
    m_verbose = verbose;
  }

  /**
   * Get the verbosity level of the clusterer
   * @return verbose messages on (true) or off (false)
   */
  public boolean getVerbose() {
    return m_verbose;
  }

  /**
   * Returns an enumeration describing the available options
   *
   * @return an enumeration of all the available options
   */
  public Enumeration listOptions() {
    Vector newVector = new Vector(2);
    newVector.addElement(new Option("\tThreshold.\n"
                                    + "\t(default=MAX_DOUBLE)",
                                    "T", 1, "-T <0-MAX_DOUBLE>"));
    newVector.addElement(new Option("\tNumber of clusters.\n"
                                    + "\t(default=-1)",
                                    "N", 1, "-N <-1-MAX_INT>"));
    return newVector.elements();
  }

  /**
   * Parses a given list of options.
   *
   * Valid options are:<p>
   *
   * -N <number of clusters> <br>
   * Number of clusters. (default=-1) <p>
   *
   * @param options the list of options as an array of strings
   * @exception Exception if an option is not supported
   */
  public void setOptions(String[] options) throws Exception {
    String optionString = Utils.getOption('N', options);
    if (optionString.length() != 0) {
      setNumClusters(Integer.parseInt(optionString));
    }
  }

  /**
   * Gets the current settings of Greedy Agglomerative Clustering
   *
   * @return an array of strings suitable for passing to setOptions()
   */
  public String[] getOptions() {
    String[] options = new String[70];
    int current = 0;
    options[current++] = "-N";
    options[current++] = "" + m_numClusters;
    if (m_linkingType == SINGLE_LINK) {
      options[current++] = "-I";
    } else if (m_linkingType == COMPLETE_LINK) {
      options[current++] = "-C";
    } else if (m_linkingType == GROUP_AVERAGE) {
      options[current++] = "-G";
    }
    if (m_seedable) {
      options[current++] = "-S";
    }
    options[current++] = "-M";
    options[current++] = m_metric.getClass().getName();
    if (m_metric instanceof OptionHandler) {
      String[] metricOptions = ((OptionHandler) m_metric).getOptions();
      for (int i = 0; i < metricOptions.length; i++) {
        options[current++] = metricOptions[i];
      }
    }
    // pad the unused slots of the fixed-size array with empty strings
    while (current < options.length) {
      options[current++] = "";
    }
    return options;
  }

  /**
   * Train the clusterer using specified parameters
   *
   * @param instances Instances to be used for training
   */
  public void trainClusterer(Instances instances) throws Exception {
    if (m_metric instanceof LearnableMetric
        && ((LearnableMetric) m_metric).getTrainable()) {
      ((LearnableMetric) m_metric).learnMetric(instances);
    } else {
      throw new Exception("Metric is not trainable");
    }
  }

  /** Returns the objective function; needed for compatibility with SemiSupClusterer */
  public double objectiveFunction() {
    return Double.NaN;
  }

  /** Return the number of clusters */
  public int getNumClusters() {
    return m_numClusters;
  }

  /**
   * A duplicate function to conform to the Clusterer abstract class.
   * @return the number of clusters
   */
  public int numberOfClusters() {
    return getNumClusters();
  }
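  // Usage sketch for the option handlers above (illustrative, not part of the
  // original source). setOptions() only parses -N; getOptions() additionally
  // reports the linkage flag (-I/-C/-G), seeding (-S), and the metric class
  // (-M), padding the rest of its fixed-size array with empty strings:
  //
  //   HAC hac = new HAC(new WeightedDotP(numAttributes)); // numAttributes: hypothetical
  //   hac.setOptions(new String[] { "-N", "5" });          // request 5 clusters
  //   String[] opts = hac.getOptions();                    // begins {"-N", "5", ...}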
  /**
   * Get an array of random indices out of n possible values.
   * If the number of requested indices is larger than maxIdx, returns
   * maxIdx permuted values.
   * @param numIdxs number of indexes to return
   * @param maxIdx the maximum index of the set
   * @return an array of indexes
   */
  public static int[] randomSubset(int numIdxs, int maxIdx) {
    Random r = new Random(maxIdx + numIdxs);
    int[] indeces = new int[maxIdx];
    for (int i = 0; i < maxIdx; i++) {
      indeces[i] = i;
    }
    // permute the indeces randomly
    for (int i = 0; i < indeces.length; i++) {
      int idx = r.nextInt(maxIdx);
      int temp = indeces[idx];
      indeces[idx] = indeces[i];
      indeces[i] = temp;
    }
    int[] returnIdxs = new int[Math.min(numIdxs, maxIdx)];
    for (int i = 0; i < returnIdxs.length; i++) {
      returnIdxs[i] = indeces[i];
    }
    return returnIdxs;
  }
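  // Example for randomSubset() above (illustrative, not part of the original
  // source). Because the Random seed is derived from the arguments
  // (maxIdx + numIdxs), repeated calls with the same arguments return the
  // same "random" subset:
  //
  //   int[] sample = randomSubset(7, 10);  // 7 distinct values in [0, 10)
  //   int[] all    = randomSubset(99, 10); // capped at 10 permuted values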
  // Main method for testing this class
  public static void main(String[] argv) {
    try {
      //////// Iris data
      // String datafile = "/u/ml/software/weka-latest/data/iris.arff";
      // String datafile = "/u/mbilenko/ml/tivoli/user-features-GroupClassGrad.arff";
      String datafile = "/u/mbilenko/ml/tivoli/data/user-features-processClass.arff";
      // String datafile = "/u/mbilenko/weka/data/glass.arff";

      // set up the data
      FileReader reader = new FileReader(datafile);
      Instances data = new Instances(reader);

      // filter out bad attributes for tivoli clustering
      String[] filteredProcesses = {
        "pico", "twm", "Xvnc", "lpr", "fvwm2", "xclock", "FvwmButtons",
        "FvwmPager", "ymessenger.bin", "vim", "vi", "xemacs", "xscreensaver",
        "gnome-panel", "gnome-settings-daemon", "gconfd-2", "xlock", "kdesud",
        "ssh", "tasklist_applet", "panel", "gnome-session", "gnome-smproxy",
        "MozillaFirebird-bin", "nautilus", "mutt", "mixer_applet2", "metacity",
        "bonobo-activation-server", "csh", "nautilus-throbber", "xmms",
        "realplay", "konqueror", "knode", "kdesktop_lock", "kwrapper", "artsd",
        "esd", "gnome-panel", "gnome-terminal", "mail", "gnome-name-service",
        "deskguide_applet", "sawfish", "gaim", "konsole", "opera",
        "enlightenment", "6", "wmaker"
      };
      System.out.println("filtered=" + filteredProcesses.length);
      int[] descrIndeces = new int[filteredProcesses.length];
      for (int i = 0; i < descrIndeces.length; i++) {
        Attribute attr = data.attribute(filteredProcesses[i]);
        System.out.println(i + ": " + attr);
        descrIndeces[i] = attr.index();
      }
      Remove attributeFilter = new Remove();
      attributeFilter.setAttributeIndicesArray(descrIndeces);
      attributeFilter.setInvertSelection(false);
      attributeFilter.setInputFormat(data);
      data = Filter.useFilter(data, attributeFilter);

      // Make the last attribute be the class
      int theClass = data.numAttributes();
      data.setClassIndex(theClass - 1); // starts with 0
      // int numClusters = data.numClasses();

      Instances clusterData = new Instances(data);
      clusterData.deleteClassAttribute();
      WeightedEuclidean euclidean = new WeightedEuclidean(clusterData.numAttributes());
      WeightedDotP dotp = new WeightedDotP(clusterData.numAttributes());
      // HAC hac = new HAC(euclidean);
      HAC hac = new HAC(dotp);
      hac.setVerbose(false);
      clusterData = hac.filterInstanceDescriptions(clusterData);

      // cluster without seeding
      System.out.println("\nClustering the user data ...\n");
      hac.setLinkingType(new SelectedTag(COMPLETE_LINK, TAGS_LINKING));

      // trim the instances
      // int i = 6;
      // while (i < clusterData.numInstances()) {
      //   clusterData.delete(i);
      // }

      // cluster with seeding
      // ArrayList seedArray = new ArrayList();
      // for (int i = 0; i < 19; i++) {
      //   seedArray.add(clusterData.instance(i));
      // }
      // seedArray.add(clusterData.instance(0));
      // seedArray.add(clusterData.instance(1));
      // seedArray.add(clusterData.instance(2));
      // seedArray.add(clusterData.instance(3));
      // seedArray.add(clusterData.instance(4));
      // seedArray.add(clusterData.instance(50));
      // seedArray.add(clusterData.instance(51));
      // seedArray.add(clusterData.instance(52));
      // seedArray.add(clusterData.instance(53));
      // seedArray.add(clusterData.instance(54));
      // seedArray.add(clusterData.instance(100));
      // seedArray.add(clusterData.instance(101));
      // seedArray.add(clusterData.instance(102));
      // seedArray.add(clusterData.instance(103));
      // seedArray.add(clusterData.instance(104));
      // Seeder seeder = new Seeder(clusterData, data);
      // seeder.setVerbose(false);
      // seeder.createSeeds(seedArray);
      // HashMap seedHash = seeder.getSeeds();
      // hac.setSeedHash(seedHash);

      HashMap classInstanceHash = new HashMap();
      // get the data for each class
      for (int i = 0; i < data.numInstances(); i++) {
        Instance instance = data.instance(i);
        Integer classValue = new Integer((int) instance.classValue());
        if (classInstanceHash.containsKey(classValue)) {
          ArrayList classList = (ArrayList) classInstanceHash.get(classValue);
          classList.add(new Integer(i));
          System.out.println("Seen class; now has " + classList.size() + " elements");
        } else {
          // unseen class
          System.out.println("Unseen class " + classValue);
          ArrayList classList = new ArrayList();
          classList.add(new Integer(i));
          classInstanceHash.put(classValue, classList);
        }
      }

      // sample from the classes that have more than 1 instance
      double seedProportion = 0.7;
      ArrayList seedArray = new ArrayList();
      Iterator iterator = classInstanceHash.entrySet().iterator();
      while (iterator.hasNext()) {
        Map.Entry entry = (Map.Entry) iterator.next();
        ArrayList classList = (ArrayList) entry.getValue();
        System.out.println("Classlist for " + entry.getKey() + " has "
                           + classList.size() + " elements\n");
        if (classList.size() > 1) {
          int[] seedIndeces = randomSubset((int) ((classList.size() + 0.0) * seedProportion),
                                           classList.size());
          System.out.println("Seeding for class " + entry.getKey()
                             + " using " + seedIndeces.length);
          for (int i = 0; i < seedIndeces.length; i++) {
            seedArray.add(clusterData.instance(((Integer) (classList.get(seedIndeces[i]))).intValue()));
            System.out.println("Adding seed " + classList.get(seedIndeces[i]));
          }
        }
      }
      Seeder seeder = new Seeder(clusterData, data);
      seeder.setVerbose(false);
      seeder.createSeeds(seedArray);
      HashMap seedHash = seeder.getSeeds();
      hac.setSeedHash(seedHash);
      hac.buildClusterer(clusterData, 1);
      hac.printClusters();
      // System.out.println("Cluster assignments: ");
      // for (int i = 0; i < hac.m_clusterAssignments.length; i++) {
      //   System.out.print(i + ":" + hac.m_clusterAssignments[i] + " ");
      // }
      // System.out.println("\n\n");
      // for (int j = 0; j < clusterData.numInstances(); j++) {
      //   System.out.println(j + ":" + hac.clusterInstance(clusterData.instance(j)));
      // }
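      // Worked example for the per-class seed sampling above (illustrative
      // numbers, not from the original source): a class with 10 instances and
      // seedProportion = 0.7 gives (int) (10 * 0.7) = 7, so randomSubset(7, 10)
      // selects 7 of its members as seeds; singleton classes contribute no
      // seeds at all.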
      ////////////////////////////////////////////////////
      // HI-DIM TESTING
      ////////////////////////////////////////////////////
      //////// Text data - 300 documents
      // datafile = "/u/ml/software/weka-latest/data/20newsgroups/different-100_fromCCS.arff";
      // System.out.println("\nClustering diff-100 newsgroup data with seeding, using constrained HAC...\n");
      // // set up the data
      // reader = new FileReader(datafile);
      // data = new Instances(reader);
      // System.out.println("Initial data has size: " + data.numInstances());
      // // Make the last attribute be the class
      // theClass = data.numAttributes();
      // data.setClassIndex(theClass - 1); // starts with 0
      // numClusters = data.numClasses();
      // WeightedDotP dotp = new WeightedDotP(data.numAttributes());
      // hac = new HAC(dotp);
      // // cluster with seeding
      // Instances seeds = new Instances(data, 0, 5);
      // seeds.add(data.instance(100));
      // seeds.add(data.instance(101));
      // seeds.add(data.instance(102));
      // seeds.add(data.instance(103));
      // seeds.add(data.instance(104));
      // seeds.add(data.instance(200));
      // seeds.add(data.instance(201));
      // seeds.add(data.instance(202));
      // seeds.add(data.instance(203));
      // seeds.add(data.instance(204));
      // System.out.println("Labeled data has size: " + seeds.numInstances()
      //                    + ", number of attributes: " + data.numAttributes());
      // data.delete(204);
      // data.delete(203);
      // data.delete(202);
      // data.delete(201);
      // data.delete(200);
      // data.delete(104);
      // data.delete(103);
      // data.delete(102);
      // data.delete(101);
      // data.delete(100);
      // data.delete(4);
      // data.delete(3);
      // data.delete(2);
      // data.delete(1);
      // data.delete(0);
      // System.out.println("Unlabeled data has size: " + data.numInstances());
      // // Remove the class labels before clustering
      // clusterData = new Instances(data);
      // // clusterData.deleteAttributeAt(theClass-1);
      // clusterData.deleteClassAttribute();
      // hac.setVerbose(false);
      // hac.setSeedable(true);
      // hac.buildClusterer(seeds, clusterData, theClass, numClusters);
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
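Below is a minimal end-to-end usage sketch assembled only from calls that appear in main() above. It is not part of the original file: the ARFF path and the HACDemo class name are placeholders, HAC and WeightedDotP are assumed to be on the classpath (e.g. in the same package), and COMPLETE_LINK and TAGS_LINKING are assumed to be public static members of HAC; the arguments to buildClusterer mirror the original call.

import java.io.FileReader;

import weka.core.Instances;
import weka.core.SelectedTag;

public class HACDemo {
  public static void main(String[] args) throws Exception {
    // load a dataset and treat its last attribute as the class
    Instances data = new Instances(new FileReader("mydata.arff")); // placeholder path
    data.setClassIndex(data.numAttributes() - 1);

    // strip the class attribute before clustering, as main() above does
    Instances clusterData = new Instances(data);
    clusterData.deleteClassAttribute();

    // complete-link HAC over a weighted dot-product metric
    HAC hac = new HAC(new WeightedDotP(clusterData.numAttributes()));
    hac.setVerbose(true);
    hac.setLinkingType(new SelectedTag(HAC.COMPLETE_LINK, HAC.TAGS_LINKING));

    // build and display the clustering (same arguments as the original call)
    hac.buildClusterer(clusterData, 1);
    hac.printClusters();
  }
}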