📄 xmeans.java

📁 Java 编写的多种数据挖掘算法包括聚类、分类、预处理等
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
   * @param numInst number of instances that belong to the center   * @param center the center   * @param distortion distortion    * @param numCent number of centers    * @return the likelihood estimate   */  private double logLikelihoodEstimate(int numInst, 				       Instance center, 				       double distortion, 				       int numCent) {    // R(n) num of instances of the center -> numInst    // K num of centers -> not used    //    //todo take the diff comments away    double loglike = 0;    /* if is new */    if (numInst > 1) {      /* diff variance is new */      //      // distortion = Sum over instances x of the center(x-center)      // different to paper; sum should be squared      //      // (Sum of distances to center) / R(n) - 1.0      // different to paper; should be R(n)-K      double variance =  distortion / (numInst - 1.0);         //      //  -R(n)/2 * log(pi*2)      //      double p1 = - (numInst / 2.0) * Math.log(Math.PI * 2.0);      /* diff	 thats how we had it	 double p2 = -((ni * center.numAttributes()) / 2) * distortion;      */      //      // -(R(n)*M)/2 * log(variance)       //      double p2 = - (numInst * center.numAttributes()) / 2 * Math.log(variance);            /* diff	 thats how we had it, the difference is a bug in x-means	 double p3 = - (numInst - numCent) / 2;      */      //      // -(R(n)-1)/2      //      double p3 = - (numInst - 1.0) / 2.0;            //      // R(n)*log(R(n))      //      double p4 = numInst * Math.log(numInst);            /* diff x-means doesn't have this part 	 double p5 = - numInst * Math.log(numInstTotal);      */            /*	loglike = -(ni / 2) * Math.log(Math.PI * 2) 	- (ni * center.numAttributes()) / 2.0) * logdistortion	- (ni - k) / 2.0 	+ ni * Math.log(ni) 	- ni * Math.log(r);      */      loglike = p1 + p2 + p3 + p4; // diff + p5;      //the log(r) is something that can be reused.      //as is the log(2 PI), these could provide extra speed up later on.      
//since distortion is so expensive to compute, I only do that once.    }    return loglike;  }    /**   * Calculates the maximum likelihood estimate for the variance.   * @param instOfCent indices of instances to each center   * @param centers the centers   * @return the list of distortions distortion.   */  private double [] distortion(int[][] instOfCent, Instances centers)     throws Exception {    double [] distortion = new double [centers.numInstances()];    for (int i = 0; i < centers.numInstances(); i++) {      distortion[i] = 0.0;      for (int j = 0; j < instOfCent[i].length; j++) {	distortion[i] += m_DistanceF.distance(                                 m_Instances.instance(instOfCent[i][j]), 				 centers.instance(i));      }    }    /* diff not done in x-means    res *= 1.0 / (count - centers.numInstances());    */    return distortion;  }    /**   * Clusters an instance.   * @param instance the instance to assign a cluster to.   * @param centers the centers to cluster the instance to.   * @return a cluster index.   */  private int clusterProcessedInstance(Instance instance, Instances centers)throws Exception{        double minDist = Integer.MAX_VALUE;    int bestCluster = 0;    for (int i = 0; i < centers.numInstances(); i++) {      double dist = m_DistanceF.distance(instance, centers.instance(i));      if (dist < minDist) {	minDist = dist;     	bestCluster = i;          }                         };                             return bestCluster;  }    /**   * Clusters an instance that has been through the filters.   
*   * @param instance the instance to assign a cluster to   * @return a cluster number   */  private int clusterProcessedInstance(Instance instance) throws Exception {    double minDist = Integer.MAX_VALUE;    int bestCluster = 0;    for (int i = 0; i < m_NumClusters; i++) {      double dist = m_DistanceF.distance(instance, m_ClusterCenters.instance(i));      if (dist < minDist) {	minDist = dist;	bestCluster = i;      }    }    return bestCluster;  }  /**   * Classifies a given instance.   *   * @param instance the instance to be assigned to a cluster   * @return the number of the assigned cluster as an integer   * if the class is enumerated, otherwise the predicted value   * @throws if instance could not be classified   * successfully   */  public int clusterInstance(Instance instance) throws Exception {    m_ReplaceMissingFilter.input(instance);    Instance inst = m_ReplaceMissingFilter.output();    return clusterProcessedInstance(inst);  }  /**   * Returns the number of clusters.   *   * @return the number of clusters generated for a training dataset.   */  public int numberOfClusters() {    return m_NumClusters;  }  /**   * Returns an enumeration describing the available options.    
* @return an enumeration of all the available options   **/  public Enumeration listOptions() {    Vector newVector = new Vector(4);     newVector.addElement(new Option(       "\tmaximum number of overall iterations\n" +       "\t(default = 1).",        "I", 1, "-I <num>"));     newVector.addElement(new Option(       "\tmaximum number of iterations in the kMeans loop in\n" +       "\tthe Improve-Parameter part \n"+       "\t(default = 1000).",        "M", 1, "-M <num>"));     newVector.addElement(new Option(       "\tmaximum number of iterations in the kMeans loop\n" +       "\tfor the splitted centroids in the Improve-Structure part \n"+       "\t(default = 1000).",       "J", 1, "-J <num>"));     newVector.addElement(new Option(       "\tminimum number of clusters\n" +       "\t(default = 2).",        "L", 1, "-L <num>"));     newVector.addElement(new Option(       "\tmaximum number of clusters\n" +       "\t(default = 4).",       "H", 1, "-H <num>"));     newVector.addElement(new Option(       "\tdistance value for binary attributes\n" +       "\t(default = 1.0).",       "V", 1, "-V <value>"));     newVector.addElement(new Option(       "\tFull class name of KDTree class to use, followed\n" +       "\tby scheme options.\n" +       "\teg: \"weka.core.KDTree -P\"\n" +       "\t(default = no KDTree class used).",       "K", 1, "-K <KDTree class specification>"));     newVector.addElement(new Option(       "\tcutoff factor, takes the given percentage of the splitted \n" +       "\tcentroids if none of the children win\n" +       "\t(default = 0.0).",       "C", 1, "-C <value>"));     newVector.addElement(new Option(       "\tFull class name of Distance function class to use, followed\n" +       "\tby scheme options.\n" +       "\teg: \"weka.core.MahalanobisDistance\"\n" +       "\t(default = weka.core.EuclideanDistance).",       "K", 1, "-K <distance function class specification>"));     newVector.addElement(new Option(       "\tfile to read starting centers from 
(ARFF format).",       "N", 1, "-N <file name>"));     newVector.addElement(new Option(       "\tfile to write centers to (ARFF format).",       "O", 1, "-O <file name>"));     newVector.addElement(new Option(       "\trandom number seed (default 10).",       "S", 1, "-S <num>"));     return  newVector.elements();  }  /**   * Returns the tip text for this property   * @return tip text for this property    */  public String minNumClustersTipText() {    return "set minimum number of clusters";  }  /**   * Returns the tip text for this property   * @return tip text for this property    */  public String maxNumClustersTipText() {    return "set maximum number of clusters";  }  /**   * Sets the maximum number of iterations to perform.   * @param i the number of iterations   * @throws Exception if i is less than 1   */  public void setMaxIterations(int i) throws Exception {    if (i < 0)       throw new Exception("Only positive values for iteration number" +                           " allowed (Option I).");     m_MaxIterations = i;  }  /**   * Gets the maximum number of iterations.   * @return the number of iterations   */  public int getMaxIterations() {    return  m_MaxIterations;  }  /**   * Set the maximum number of iterations to perform in KMeans   * @param i the number of iterations   */  public void setMaxKMeans(int i) {    m_MaxKMeans = i;    m_MaxKMeansForChildren = i;  }  /**   * Gets the maximum number of iterations in KMeans.   * @return the number of iterations   */  public int getMaxKMeans() {    return  m_MaxKMeans;  }  /**   * Sets the maximum number of iterations KMeans that is performed    * on the child centers.   * @param i the number of iterations   */  public void setMaxKMeansForChildren(int i) throws Exception {    m_MaxKMeansForChildren = i;  }  /**   * Gets the maximum number of iterations in KMeans.   
* @return the number of iterations   */  public int getMaxKMeansForChildren() {    return  m_MaxKMeansForChildren;  }  /**   * Sets a new cutoff factor.   * @param i the new cutoff factor   */  public void setCutOffFactor(double i) throws Exception {    m_CutOffFactor = i;  }  /**   * Gets the cutoff factor.   * @return the cutoff factor   */  public double getCutOffFactor() {    return  m_CutOffFactor;  }  /**   * Sets the minimum number of clusters to generate.   *   * @param n the minimum number of clusters to generate   */  public void setMinNumClusters(int n) {    if (n <= m_MaxNumClusters) {      m_MinNumClusters = n;    }  }  /**   * Sets the maximum number of clusters to generate.   * @param n the maximum number of clusters to generate   */  public void setMaxNumClusters(int n) {    if (n >= m_MinNumClusters) {      m_MaxNumClusters = n;    }  }  /**   * Returns the tip text for this property   * @return tip text for this property suitable for   * displaying in the explorer/experimenter gui   */  public String binValueTipText() {    return "Set the value that represents true in the new attributes.";  }  /**   * Gets value that represents true in a new numeric attribute.   * (False is always represented by 0.0.)   * @return the value that represents true in a new numeric attribute   */  public double getBinValue() {    return m_BinValue;  }  /**   * Sets the distance e value between true and false of binary attributes    * and  "same" and "different" of nominal attributes       * @param double value   */  public void setBinValue(double value) {    m_BinValue = value;  }  /**   * gets the "binary" distance value    * @param distanceF the distance function with all options set   */  public void setDistanceF(EuclideanDistance distanceF) {    m_DistanceF = distanceF;  }  /**   * Gets the distance function.   
* @return the distance function
   */
  public EuclideanDistance getDistanceF() {
    return this.m_DistanceF;
  }

  /**
   * Builds the distance function specification string: the class name of
   * the distance function, followed by its options (joined into one
   * string) when the function is an OptionHandler.
   *
   * @return the distance function specification string
   */
  protected String getDistanceFSpec() {
    EuclideanDistance distFunc = getDistanceF();
    String className = distFunc.getClass().getName();
    if (!(distFunc instanceof OptionHandler)) {
      return className;
    }
    String[] opts = ((OptionHandler) distFunc).getOptions();
    return className + " " + Utils.joinOptions(opts);
  }

  /**
   * Sets a file name for a file that has the random vektors stored.
   * Just used for debugging reasons.
   *
   * @param fileName file name for the file to read the random vektors from
   */
  public void setDebugVektorsFile(String fileName) {
    this.m_DebugVektorsFile = fileName;
  }

  /**
   * Initialises the debug vektor input: opens the debug vektors file,
   * builds the debug vektor instances from it and resets the read index
   * to the first instance.
   *
   * @throws Exception if the file cannot be opened or read
   */
  public void initDebugVektorsInput() throws Exception {
    // NOTE(review): the reader is kept in a field and is not closed here --
    // confirm that it is closed elsewhere.
    FileReader fileReader = new FileReader(m_DebugVektorsFile);
    m_DebugVektorsInput = new BufferedReader(fileReader);
    m_DebugVektors = new Instances(m_DebugVektorsInput);
    m_DebugVektorsIndex = 0;
  }

  /**
   * Read an instance from debug vektors file.
   *
   * @param model the data model for the instance
   * @return the next debug vektor, with its dataset set to model
   * @throws Exception if all debug vektors have already been handed out
   */
  public Instance getNextDebugVektorsInstance(Instances model)
    throws Exception {
    boolean exhausted = m_DebugVektorsIndex >= m_DebugVektors.numInstances();
    if (exhausted) {
      throw new Exception("no more prefabricated Vektors");
    }
    Instance next = m_DebugVektors.instance(m_DebugVektorsIndex);
    next.setDataset(model);
    // Advance only after the instance has been prepared successfully.
    m_DebugVektorsIndex += 1;
    return next;
  }

  /**
   * Sets the name of the file to read the list of centers from.
   *
   * @param fileName file name of file to read centers from
   */
  public void setInputCenterFile(String fileName) {
    this.m_InputCenterFile = fileName;
  }
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -