📄 xmeans.java

📁 数据挖掘中聚类的算法
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
   * @return the likelihood estimate   */  protected double logLikelihoodEstimate(int numInst, 				       Instance center, 				       double distortion, 				       int numCent) {    // R(n) num of instances of the center -> numInst    // K num of centers -> not used    //    //todo take the diff comments away    double loglike = 0;    /* if is new */    if (numInst > 1) {      /* diff variance is new */      //      // distortion = Sum over instances x of the center(x-center)      // different to paper; sum should be squared      //      // (Sum of distances to center) / R(n) - 1.0      // different to paper; should be R(n)-K      double variance =  distortion / (numInst - 1.0);         //      //  -R(n)/2 * log(pi*2)      //      double p1 = - (numInst / 2.0) * Math.log(Math.PI * 2.0);      /* diff	 thats how we had it	 double p2 = -((ni * center.numAttributes()) / 2) * distortion;      */      //      // -(R(n)*M)/2 * log(variance)       //      double p2 = - (numInst * center.numAttributes()) / 2 * Math.log(variance);            /* diff	 thats how we had it, the difference is a bug in x-means	 double p3 = - (numInst - numCent) / 2;      */      //      // -(R(n)-1)/2      //      double p3 = - (numInst - 1.0) / 2.0;            //      // R(n)*log(R(n))      //      double p4 = numInst * Math.log(numInst);            /* diff x-means doesn't have this part 	 double p5 = - numInst * Math.log(numInstTotal);      */            /*	loglike = -(ni / 2) * Math.log(Math.PI * 2) 	- (ni * center.numAttributes()) / 2.0) * logdistortion	- (ni - k) / 2.0 	+ ni * Math.log(ni) 	- ni * Math.log(r);      */      loglike = p1 + p2 + p3 + p4; // diff + p5;      //the log(r) is something that can be reused.      //as is the log(2 PI), these could provide extra speed up later on.      //since distortion is so expensive to compute, I only do that once.    }    return loglike;  }    /**   * Calculates the maximum likelihood estimate for the variance.   * @param instOfCent indices of instances to each center   * @param centers the centers   * @return the list of distortions distortion.   */  protected double[] distortion(int[][] instOfCent, Instances centers) {    double[] distortion = new double[centers.numInstances()];    for (int i = 0; i < centers.numInstances(); i++) {      distortion[i] = 0.0;      for (int j = 0; j < instOfCent[i].length; j++) {        distortion[i] += m_DistanceF.distance(m_Instances            .instance(instOfCent[i][j]), centers.instance(i));      }    }    /*     * diff not done in x-means res *= 1.0 / (count - centers.numInstances());     */    return distortion;  }    /**   * Clusters an instance.   *    * @param instance   *          the instance to assign a cluster to.   * @param centers   *          the centers to cluster the instance to.   * @return a cluster index.   */  protected int clusterProcessedInstance(Instance instance, Instances centers) {        double minDist = Integer.MAX_VALUE;    int bestCluster = 0;    for (int i = 0; i < centers.numInstances(); i++) {      double dist = m_DistanceF.distance(instance, centers.instance(i));      if (dist < minDist) {        minDist = dist;        bestCluster = i;      }    }    ;    return bestCluster;  }    /**   * Clusters an instance that has been through the filters.   *    * @param instance   *          the instance to assign a cluster to   * @return a cluster number   */  protected int clusterProcessedInstance(Instance instance) {    double minDist = Integer.MAX_VALUE;    int bestCluster = 0;    for (int i = 0; i < m_NumClusters; i++) {      double dist = m_DistanceF          .distance(instance, m_ClusterCenters.instance(i));      if (dist < minDist) {        minDist = dist;        bestCluster = i;      }    }    return bestCluster;  }  /**   * Classifies a given instance.   *   * @param instance the instance to be assigned to a cluster   * @return the number of the assigned cluster as an integer   * if the class is enumerated, otherwise the predicted value   * @throws Exception if instance could not be classified   * successfully   */  public int clusterInstance(Instance instance) throws Exception {    m_ReplaceMissingFilter.input(instance);    Instance inst = m_ReplaceMissingFilter.output();    return clusterProcessedInstance(inst);  }  /**   * Returns the number of clusters.   *   * @return the number of clusters generated for a training dataset.   */  public int numberOfClusters() {    return m_NumClusters;  }  /**   * Returns an enumeration describing the available options.    * @return an enumeration of all the available options   **/  public Enumeration listOptions() {    Vector result = new Vector();        result.addElement(new Option(	"\tmaximum number of overall iterations\n"	+ "\t(default 1).", 	"I", 1, "-I <num>"));        result.addElement(new Option(	"\tmaximum number of iterations in the kMeans loop in\n"	+ "\tthe Improve-Parameter part \n"	+ "\t(default 1000).", 	"M", 1, "-M <num>"));        result.addElement(new Option(	"\tmaximum number of iterations in the kMeans loop\n"	+ "\tfor the splitted centroids in the Improve-Structure part \n"	+ "\t(default 1000).",	"J", 1, "-J <num>"));        result.addElement(new Option(	"\tminimum number of clusters\n"	+ "\t(default 2).", 	"L", 1, "-L <num>"));        result.addElement(new Option(	"\tmaximum number of clusters\n"	+ "\t(default 4).",	"H", 1, "-H <num>"));        result.addElement(new Option(	"\tdistance value for binary attributes\n"	+ "\t(default 1.0).",	"B", 1, "-B <value>"));        result.addElement(new Option(	"\tUses the KDTree internally\n"	+ "\t(default no).",	"use-kdtree", 0, "-use-kdtree"));        result.addElement(new Option(	"\tFull class name of KDTree class to use, followed\n"	+ "\tby scheme options.\n"	+ "\teg: \"weka.core.neighboursearch.kdtrees.KDTree -P\"\n"	+ "\t(default no KDTree class used).",	"K", 1, "-K <KDTree class specification>"));        result.addElement(new Option(	"\tcutoff factor, takes the given percentage of the splitted \n"	+ "\tcentroids if none of the children win\n"	+ "\t(default 0.0).",	"C", 1, "-C <value>"));        result.addElement(new Option(	"\tFull class name of Distance function class to use, followed\n"	+ "\tby scheme options.\n" +	"\t(default weka.core.EuclideanDistance).",	"D", 1, "-D <distance function class specification>"));        result.addElement(new Option(	"\tfile to read starting centers from (ARFF format).",	"N", 1, "-N <file name>"));        result.addElement(new Option(	"\tfile to write centers to (ARFF format).",	"O", 1, "-O <file name>"));        result.addElement(new Option(	"\tThe debug level.\n"	+ "\t(default 0)",	"U", 1, "-U <int>"));        result.addElement(new Option(	"\tThe debug vectors file.",	"Y", 1, "-Y <file name>"));        Enumeration en = super.listOptions();    while (en.hasMoreElements())      result.addElement(en.nextElement());        return result.elements();  }  /**   * Returns the tip text for this property.   * @return tip text for this property    */  public String minNumClustersTipText() {    return "set minimum number of clusters";  }  /**   * Sets the minimum number of clusters to generate.   *   * @param n the minimum number of clusters to generate   */  public void setMinNumClusters(int n) {    if (n <= m_MaxNumClusters) {      m_MinNumClusters = n;    }  }  /**   * Gets the minimum number of clusters to generate.   * @return the minimum number of clusters to generate   */  public int getMinNumClusters() {    return m_MinNumClusters;  }  /**   * Returns the tip text for this property.   * @return tip text for this property    */  public String maxNumClustersTipText() {    return "set maximum number of clusters";  }  /**   * Sets the maximum number of clusters to generate.   * @param n the maximum number of clusters to generate   */  public void setMaxNumClusters(int n) {    if (n >= m_MinNumClusters) {      m_MaxNumClusters = n;    }  }    /**   * Gets the maximum number of clusters to generate.   * @return the maximum number of clusters to generate   */  public int getMaxNumClusters() {    return m_MaxNumClusters;  }  /**   * Returns the tip text for this property.   * @return tip text for this property    */  public String maxIterationsTipText() {    return "the maximum number of iterations to perform";  }  /**   * Sets the maximum number of iterations to perform.   * @param i the number of iterations   * @throws Exception if i is less than 1   */  public void setMaxIterations(int i) throws Exception {    if (i < 0)       throw new Exception("Only positive values for iteration number" +                           " allowed (Option I).");     m_MaxIterations = i;  }  /**   * Gets the maximum number of iterations.   * @return the number of iterations   */  public int getMaxIterations() {    return  m_MaxIterations;  }  /**   * Returns the tip text for this property.   * @return tip text for this property    */  public String maxKMeansTipText() {    return "the maximum number of iterations to perform in KMeans";  }  /**   * Set the maximum number of iterations to perform in KMeans.   * @param i the number of iterations   */  public void setMaxKMeans(int i) {    m_MaxKMeans = i;    m_MaxKMeansForChildren = i;  }  /**   * Gets the maximum number of iterations in KMeans.   * @return the number of iterations   */  public int getMaxKMeans() {    return  m_MaxKMeans;  }  /**   * Returns the tip text for this property.   * @return tip text for this property    */  public String maxKMeansForChildrenTipText() {    return "the maximum number of iterations KMeans that is performed on the child centers";  }  /**   * Sets the maximum number of iterations KMeans that is performed    * on the child centers.   * @param i the number of iterations   */  public void setMaxKMeansForChildren(int i) {    m_MaxKMeansForChildren = i;  }  /**   * Gets the maximum number of iterations in KMeans.   * @return the number of iterations   */  public int getMaxKMeansForChildren() {    return  m_MaxKMeansForChildren;  }  /**   * Returns the tip text for this property.   * @return tip text for this property    */  public String cutOffFactorTipText() {    return "the cut-off factor to use";  }  /**   * Sets a new cutoff factor.   * @param i the new cutoff factor   */  public void setCutOffFactor(double i) {    m_CutOffFactor = i;  }  /**   * Gets the cutoff factor.   * @return the cutoff factor   */  public double getCutOffFactor() {    return  m_CutOffFactor;  }  /**   * Returns the tip text for this property.   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String binValueTipText() {    return "Set the value that represents true in the new attributes.";  }    /**   * Gets value that represents true in a new numeric attribute.   * (False is always represented by 0.0.)   * @return the value that represents true in a new numeric attribute   */  public double getBinValue() {    return m_BinValue;  }  /**   * Sets the distance value between true and false of binary attributes.   * and  "same" and "different" of nominal attributes       * @param value the distance   */  public void setBinValue(double value) {    m_BinValue = value;  }  /**   * Returns the tip text for this property.   *    * @return 		tip text for this property suitable for   * 			displaying in the explorer/experimenter gui   */  public String distanceFTipText() {    return "The distance function to use.";  }  /**   * gets the "binary" distance value.
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -