📄 clusterevaluation.java

📁 一个数据挖掘系统的源码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
	    throw new Exception("Can only do class based evaluation on the "
				+"training data");
	  }

	  if (objectInputFileName.length() != 0) {
	    throw new Exception("Can't load a clusterer and do class based "
				+"evaluation");
	  }
	}

	if (theClass != -1) {
	  if (theClass < 1
	      || theClass > train.numAttributes()) {
	    throw new Exception("Class is out of range!");
	  }
	  if (!train.attribute(theClass-1).isNominal()) {
	    throw new Exception("Class must be nominal!");
	  }
	  train.setClassIndex(theClass-1);
	}

      if (objectInputFileName.length() != 0) {
	objectInputStream = new ObjectInputStream(new FileInputStream(objectInputFileName));
      }

      if (objectOutputFileName.length() != 0) {
	objectOutputStream = new
	  ObjectOutputStream(new FileOutputStream(objectOutputFileName));
      }
    }
    catch (Exception e) {
      throw  new Exception("ClusterEvaluation: " + e.getMessage() + '.');
    }

    // Save options
    if (options != null) {
      savedOptions = new String[options.length];
      System.arraycopy(options, 0, savedOptions, 0, options.length);
    }

    if (objectInputFileName.length() != 0) {
      Utils.checkForRemainingOptions(options);
    }

    // Set options for clusterer
    if (clusterer instanceof OptionHandler) {
      ((OptionHandler)clusterer).setOptions(options);
    }

    Utils.checkForRemainingOptions(options);

    if (objectInputFileName.length() != 0) {
      // Load the clusterer from file
      clusterer = (Clusterer)objectInputStream.readObject();
      objectInputStream.close();
    }
    else {
      // Build the clusterer if no object file provided
      if (theClass == -1) {
	clusterer.buildClusterer(train);
      } else {
	AttributeFilter removeClass = new AttributeFilter();
	removeClass.setAttributeIndices(""+theClass);
	removeClass.setInvertSelection(false);
	removeClass.setInputFormat(train);
	Instances clusterTrain = Filter.useFilter(train, removeClass);
	clusterer.buildClusterer(clusterTrain);
	ClusterEvaluation ce = new ClusterEvaluation();
	ce.setClusterer(clusterer);
	ce.evaluateClusterer(train);

	return "\n\n=== Clustering stats for training data ===\n\n" + ce.clusterResultsToString();
      }
    }

    /* Output cluster predictions only (for the test data if specified,
       otherwise for the training data */
    if (printClusterAssignments) {
      return  printClusterings(clusterer, train, testFileName, attributesToOutput);
    }

    text.append(clusterer.toString());
    text.append("\n\n=== Clustering stats for training data ===\n\n"
      + printClusterStats(clusterer, data));

    if (testFileName.length() != 0) {
      text.append("\n\n=== Clustering stats for testing data ===\n\n"
		  + printClusterStats(clusterer, testFileName));
    }

    if ((clusterer instanceof DistributionClusterer) &&
	(doXval == true) &&
	(testFileName.length() == 0) &&
	(objectInputFileName.length() == 0)) {
      // cross validate the log likelihood on the training data
      random = new Random(seed);
      random.setSeed(seed);
      train.randomize(random);
      text.append(crossValidateModel(clusterer.getClass().getName()
				     , train, folds, savedOptions));
    }

    // Save the clusterer if an object output file is provided
    if (objectOutputFileName.length() != 0) {
      objectOutputStream.writeObject(clusterer);
      objectOutputStream.flush();
      objectOutputStream.close();
    }

    return  text.toString();
  }

/**
 * Evaluates a clusterer on a percentage split of the supplied data.
 *
 * The first <code>percentage</code> percent of the instances in
 * <code>data</code> (taken in their existing order; nothing is shuffled
 * before the split) are used to build the clusterer and the remaining
 * instances form the test set. Cluster statistics are reported for both
 * parts.
 *
 * Options read from the option array:
 * "-h" requests help (reported by throwing an exception whose message
 * contains the option summary).
 * "-l" names an object file from which a clusterer is loaded instead of
 * being built; it is only accepted together with "-p".
 * "-d" names an object file to which the built clusterer is saved.
 * "-p" takes an attribute range; if present, only the cluster assignments
 * are output ("-p 0" lists no attributes).
 * "-T" names a test file; it is handed to the cluster assignment output
 * and disables cross-validation. The test statistics of this method are
 * always computed on the held-out part of <code>data</code>.
 * "-x" sets the number of folds and requests a cross-validation of the
 * log likelihood on the training split (distribution clusterers only).
 * "-s" sets the random seed used for that cross-validation.
 * "-c" selects a nominal class attribute ("first", "last" or a 1-based
 * index) for a class based evaluation on the training split; it cannot
 * be combined with "-x", "-T" or "-l".
 * Any remaining options are passed on to the clusterer.
 *
 * @param clusterer machine learning clusterer
 * @param options the array of string containing the options
 * @param data the instances to split into a training and a test part
 * @param percentage the percentage (0 to 100) of <code>data</code> used
 * for training
 * @exception Exception if model could not be evaluated successfully
 * @return a string describing the results
 */
  public static String evaluateClustererInstancePercentage (Clusterer clusterer,
                                          String[] options, Instances data, int percentage)
    throws Exception
  {
    int seed = 1, folds = 10, sizeOfTrainFile = 0, sizeOfTestFile = 0;
    boolean doXval = false;
    Instances train = null;
    Instances test = null;
    Random random;
    String testFileName, seedString, foldsString, objectInputFileName, objectOutputFileName, attributeRangeString;
    String[] savedOptions = null;
    boolean printClusterAssignments = false;
    Range attributesToOutput = null;
    ObjectInputStream objectInputStream = null;
    ObjectOutputStream objectOutputStream = null;
    StringBuffer text = new StringBuffer();
    int theClass = -1; // 1-based class attribute index; -1 = no class based evaluation

    try {
      // Reject impossible splits up front instead of failing somewhere
      // inside the Instances copy with an unrelated message.
      if (percentage < 0 || percentage > 100) {
        throw new Exception("Percentage must be between 0 and 100 (was "
                            + percentage + ").");
      }

      // Multiply in long so that very large data sets cannot overflow int.
      sizeOfTrainFile = (int) ((long) data.numInstances() * percentage / 100);
      sizeOfTestFile = data.numInstances() - sizeOfTrainFile;
      train = new Instances(data, 0, sizeOfTrainFile);
      test = new Instances(data, sizeOfTrainFile, sizeOfTestFile);

      if (Utils.getFlag('h', options)) {
        throw new Exception("Help requested.");
      }

      // Get basic options (options the same for all clusterers)
      objectInputFileName = Utils.getOption('l', options);
      objectOutputFileName = Utils.getOption('d', options);
      testFileName = Utils.getOption('T', options);

      // Check -p option
      try {
        attributeRangeString = Utils.getOption('p', options);
      }
      catch (Exception e) {
        throw new Exception(e.getMessage() + "\nNOTE: the -p option has changed. " +
                            "It now expects a parameter specifying a range of attributes " +
                            "to list with the predictions. Use '-p 0' for none.", e);
      }
      if (attributeRangeString.length() != 0) {
        printClusterAssignments = true;
        if (!attributeRangeString.equals("0")) {
          attributesToOutput = new Range(attributeRangeString);
        }
      }

      if ((objectInputFileName.length() != 0) && !printClusterAssignments) {
        throw new Exception("Can't use both train and model file "
                            + "unless -p specified.");
      }

      seedString = Utils.getOption('s', options);
      if (seedString.length() != 0) {
        seed = Integer.parseInt(seedString);
      }

      foldsString = Utils.getOption('x', options);
      if (foldsString.length() != 0) {
        folds = Integer.parseInt(foldsString);
        doXval = true;
      }
    }
    catch (Exception e) {
      // Append the option summary so the caller can show usage information.
      throw new Exception('\n' + e.getMessage()
                          + makeOptionString(clusterer), e);
    }

    // Everything below may hold open object streams; the finally block
    // guarantees they are released on early returns and on failures.
    try {
      try {
        String classString = Utils.getOption('c', options);
        if (classString.length() != 0) {
          if (classString.compareTo("last") == 0) {
            theClass = train.numAttributes();
          } else if (classString.compareTo("first") == 0) {
            theClass = 1;
          } else {
            theClass = Integer.parseInt(classString);
          }
          if (doXval || testFileName.length() != 0) {
            throw new Exception("Can only do class based evaluation on the "
                                + "training data");
          }

          if (objectInputFileName.length() != 0) {
            throw new Exception("Can't load a clusterer and do class based "
                                + "evaluation");
          }
        }

        if (theClass != -1) {
          if (theClass < 1 || theClass > train.numAttributes()) {
            throw new Exception("Class is out of range!");
          }
          if (!train.attribute(theClass - 1).isNominal()) {
            throw new Exception("Class must be nominal!");
          }
          train.setClassIndex(theClass - 1);
        }

        if (objectInputFileName.length() != 0) {
          objectInputStream =
            new ObjectInputStream(new FileInputStream(objectInputFileName));
        }

        if (objectOutputFileName.length() != 0) {
          objectOutputStream =
            new ObjectOutputStream(new FileOutputStream(objectOutputFileName));
        }
      }
      catch (Exception e) {
        throw new Exception("ClusterEvaluation: " + e.getMessage() + '.', e);
      }

      // Save options. This happens after the general options above were
      // parsed; presumably Utils.getOption/getFlag consume what they read,
      // so only scheme specific options are left -- TODO confirm.
      if (options != null) {
        savedOptions = new String[options.length];
        System.arraycopy(options, 0, savedOptions, 0, options.length);
      }

      if (objectInputFileName.length() != 0) {
        Utils.checkForRemainingOptions(options);
      }

      // Set options for clusterer
      if (clusterer instanceof OptionHandler) {
        ((OptionHandler)clusterer).setOptions(options);
      }

      Utils.checkForRemainingOptions(options);

      if (objectInputFileName.length() != 0) {
        // Load the clusterer from file.
        // NOTE(review): this is plain Java deserialization of a user named
        // file; only model files from trusted sources should be loaded.
        clusterer = (Clusterer)objectInputStream.readObject();
        objectInputStream.close();
        objectInputStream = null;
      }
      else if (theClass == -1) {
        // Build the clusterer on the training split
        clusterer.buildClusterer(train);
      }
      else {
        // Class based evaluation: cluster without the class attribute, then
        // evaluate against the training split that still carries the class.
        AttributeFilter removeClass = new AttributeFilter();
        removeClass.setAttributeIndices("" + theClass);
        removeClass.setInvertSelection(false);
        removeClass.setInputFormat(train);
        Instances clusterTrain = Filter.useFilter(train, removeClass);
        clusterer.buildClusterer(clusterTrain);
        ClusterEvaluation ce = new ClusterEvaluation();
        ce.setClusterer(clusterer);
        ce.evaluateClusterer(train);

        return "\n\n=== Clustering stats for training data ===\n\n"
          + ce.clusterResultsToString();
      }

      /* Output cluster predictions only (for the test file if specified,
         otherwise for the training split) */
      if (printClusterAssignments) {
        return printClusterings(clusterer, train, testFileName, attributesToOutput);
      }

      text.append(clusterer.toString());
      // Training statistics are computed on the training split only, i.e.
      // on exactly the instances the clusterer was built from.
      text.append("\n\n=== Clustering stats for training data ===\n\n"
                  + printClusterStats(clusterer, train));

      // The held-out remainder of the data always serves as the test set.
      text.append("\n\n=== Clustering stats for testing data ===\n\n"
                  + printClusterStats(clusterer, test));

      if ((clusterer instanceof DistributionClusterer)
          && doXval
          && (testFileName.length() == 0)
          && (objectInputFileName.length() == 0)) {
        // cross validate the log likelihood on the training split
        random = new Random(seed);
        train.randomize(random);
        text.append(crossValidateModel(clusterer.getClass().getName(),
                                       train, folds, savedOptions));
      }

      // Save the clusterer if an object output file is provided
      if (objectOutputFileName.length() != 0) {
        objectOutputStream.writeObject(clusterer);
        objectOutputStream.flush();
        objectOutputStream.close();
        objectOutputStream = null;
      }

      return text.toString();
    }
    finally {
      // Best-effort cleanup of streams that are still open because of an
      // early return or an exception. Failures while closing are ignored
      // on purpose so they cannot mask the original outcome.
      if (objectInputStream != null) {
        try {
          objectInputStream.close();
        }
        catch (Exception ignored) {
          // nothing sensible can be done here
        }
      }
      if (objectOutputStream != null) {
        try {
          objectOutputStream.close();
        }
        catch (Exception ignored) {
          // nothing sensible can be done here
        }
      }
    }
  }

  /**
   * Performs a cross-validation
   * for a distribution clusterer on a set of instances.
   *
   * @param clustererString a string naming the class of the clusterer
   * @param data the data on which the cross-validation is to be
   * performed
   * @param numFolds the number of folds for the cross-validation
   * @param options the options to the clusterer
   * @return a string containing the cross validated log likelihood
   * @exception Exception if a clusterer could not be generated
   */
  public static String crossValidateModel (String clustererString,
					   Instances data,
					   int numFolds,
					   String[] options)
    throws Exception
  {
    Clusterer clusterer = null;
    Instances train, test;
    String[] savedOptions = null;
    double foldAv;
    double CvAv = 0.0;
    double[] tempDist;
    StringBuffer CvString = new StringBuffer();

    if (options != null) {
      savedOptions = new String[options.length];
    }

    data = new Instances(data);

    for (int i = 0; i < numFolds; i++) {
      // create clusterer
      try {
	clusterer = (Clusterer)Class.forName(clustererString).newInstance();
      }
      catch (Exception e) {
	throw  new Exception("Can't find class with name "
			     + clustererString + '.');
      }

      if (!(clusterer instanceof DistributionClusterer)) {
	throw  new Exception(clustererString
			     + " must be a distrinbution "
			     + "clusterer.");
      }

      // Save options
      if (options != null) {
	System.arraycopy(options, 0, savedOptions, 0, options.length);
      }

      // Parse options
      if (clusterer instanceof OptionHandler) {
	try {
	  ((OptionHandler)clusterer).setOptions(savedOptions);
	  Utils.checkForRemainingOptions(savedOptions);
	}
	catch (Exception e) {
	  throw  new Exception("Can't parse given options in "
			       + "cross-validation!");
	}
      }

      // Build and test classifier
      train = data.trainCV(numFolds, i);
      clusterer.buildClusterer(train);
      test = data.testCV(numFolds, i);
      foldAv = 0.0;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -