// File: simplestats.java
// (Code-viewer residue removed: "Font size:" label.)
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Valentine Stepanenko (ValentineStepanenko@zsoft.ru)
* @author Victor Borichev
* @version 1.0
*/
package com.prudsys.pdm.Models.Statistics;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Utils.IntVector;
/**
 * Calculates some simple statistical characteristics of the attributes of a
 * mining input stream. It is not based on the usual XELOPES algorithm
 * architecture (MiningModel, MiningSettings, MiningAlgorithm) but is simpler
 * to use.
 *
 * <p>Usage: set the input stream with {@link #setInputStream}, call
 * {@link #runCalculation()} or {@link #runCalculation(boolean)}, then query
 * single values with {@link #getCalculatedValue}.
 */
public class SimpleStats
{
  // -----------------------------------------------------------------------
  //  Constants representing the statistical characteristics to calculate
  // -----------------------------------------------------------------------
  /** Sum. */
  public static final String STAT_SUM = "sum";

  /** Minimum. */
  public static final String STAT_MIN = "min";

  /** Maximum. */
  public static final String STAT_MAX = "max";

  /** Mean. */
  public static final String STAT_MEAN = "mean";

  /** Mode. */
  public static final String STAT_MODE = "mode";

  /** Count (number of vectors read, including those with missing values). */
  public static final String STAT_COUNT = "count";

  /** Missing Count. */
  public static final String STAT_MISSING_COUNT = "missingCount";

  /** Entropy. */
  public static final String STAT_ENTROPY = "entropy";

  //-----Advanced statistics requiring more than one scan through data-----
  /** Variance. */
  public static final String STAT_VARIANCE = "variance";

  /** Standard deviation. */
  public static final String STAT_DEVIATION = "standardDeviation";

  /** Natural logarithm of 2, used to express entropies in bits. */
  private static final double LOG_2 = Math.log(2);

  // -----------------------------------------------------------------------
  //  Variables declarations
  // -----------------------------------------------------------------------
  /** Source stream of statistics calculation. */
  private MiningInputStream inputStream = null;

  /** Array of sums for numeric attributes. Zero for categorical attributes. */
  private double[] sums;

  /** Array of minima for numeric attributes. Zero for categorical attributes. */
  private double[] mins;

  /** Array of maxima for numeric attributes. Zero for categorical attributes. */
  private double[] maxs;

  /** Array of mean values for numeric and modes for categorical attributes. */
  private double[] meanModes;

  /** Number of all counts including missing values. */
  private int nTotalCount = 0;

  /** Array of counts. Missing values are not counted. */
  private int[] nCounts;

  /** Array of entropies for categorical attributes. Zero for numeric attributes. */
  private double[] entropies;

  //-----Advanced statistics requiring more than one scan through data-----
  /**
   * Array of variances for numeric attributes. Zero for categorical attributes.
   * Null unless the last calculation was run with advanced statistics.
   */
  private double[] variances;

  // -----------------------------------------------------------------------
  //  Constructor
  // -----------------------------------------------------------------------
  /**
   * Empty constructor.
   */
  public SimpleStats()
  {
  }

  // -----------------------------------------------------------------------
  //  Getter and setter methods
  // -----------------------------------------------------------------------
  /**
   * Returns mining input stream for calculations.
   *
   * @return mining input stream for calculations
   */
  public MiningInputStream getInputStream()
  {
    return inputStream;
  }

  /**
   * Sets mining input stream for calculations.
   *
   * @param inputStream mining input stream for calculations
   */
  public void setInputStream(MiningInputStream inputStream)
  {
    this.inputStream = inputStream;
  }

  /**
   * Returns statistics value for given mining attribute and name
   * of statistical characteristics.
   *
   * <p>{@link #STAT_MEAN} and {@link #STAT_MODE} read the same slot: it holds
   * the mean for numeric attributes and the index of the most frequent
   * category otherwise. {@link #STAT_COUNT} is the number of vectors read and
   * does not depend on the attribute. {@link #STAT_VARIANCE} and
   * {@link #STAT_DEVIATION} are only available after
   * {@code runCalculation(true)}.
   *
   * @param statAtt mining attribute for statistics
   * @param statName name of statistical characteristics
   * @return value of statistical characteristics
   * @throws MiningException wrong attribute or statistics not available
   */
  public double getCalculatedValue(MiningAttribute statAtt, String statName)
      throws MiningException {
    if (inputStream == null) {
      throw new MiningException("no mining input stream set");
    }
    int ind = inputStream.getMetaData().getAttributeIndex(statAtt);
    if (ind < 0) {
      throw new MiningException("mining attribute does not exist");
    }

    // Constant-first equals() so that a null statName ends in the
    // "unknown identifier" exception instead of a NullPointerException.
    if (STAT_SUM.equals(statName)) {
      return valueAt(sums, ind);
    }
    if (STAT_MIN.equals(statName)) {
      return valueAt(mins, ind);
    }
    if (STAT_MAX.equals(statName)) {
      return valueAt(maxs, ind);
    }
    if (STAT_MEAN.equals(statName) || STAT_MODE.equals(statName)) {
      return valueAt(meanModes, ind);
    }
    if (STAT_COUNT.equals(statName)) {
      return nTotalCount;
    }
    if (STAT_MISSING_COUNT.equals(statName)) {
      if (nCounts == null || ind >= nCounts.length) {
        throw new MiningException("statistical characteristics not available");
      }
      return (nTotalCount - nCounts[ind]);
    }
    if (STAT_ENTROPY.equals(statName)) {
      return valueAt(entropies, ind);
    }
    // -- Advanced stats:
    if (STAT_VARIANCE.equals(statName)) {
      return valueAt(variances, ind);
    }
    if (STAT_DEVIATION.equals(statName)) {
      return Math.sqrt(valueAt(variances, ind));
    }
    throw new MiningException("unknown identifier of statistical characteristics");
  }

  /**
   * Reads one entry of a result array, reporting a missing array (statistics
   * not calculated) or an out-of-range index as a mining exception.
   *
   * @param values result array, may be null
   * @param ind attribute index, assumed non-negative
   * @return the stored value
   * @throws MiningException the array is missing or too short
   */
  private static double valueAt(double[] values, int ind) throws MiningException {
    if (values == null || ind >= values.length) {
      throw new MiningException("statistical characteristics not available");
    }
    return values[ind];
  }

  // -----------------------------------------------------------------------
  //  Statistics calculations
  // -----------------------------------------------------------------------
  /**
   * Runs calculation of simple statistical characteristics of all
   * attributes.
   *
   * @exception MiningException can't calculate statistics
   */
  public void runCalculation() throws MiningException {
    runCalculation(false);
  }

  /**
   * Runs calculation of statistical characteristics of all attributes.
   * Can also calculate advanced statistical characteristics, i.e.
   * characteristics which require more than one scan through the data.
   *
   * <p>The first scan collects sum, minimum, maximum and count for numeric
   * attributes and per-category frequencies for categorical attributes;
   * means, modes and entropies (base 2) are derived from these. Values for
   * which {@code Category.isMissingValue} is true are ignored. If
   * {@code advanced} is set, a second scan accumulates the squared deviations
   * from the mean and divides them by the number of non-missing values.
   * The input stream is reset before each scan and once more at the end.
   * Results of any previous run, including variances, are discarded.
   *
   * @param advanced also calculate advanced statistical characteristics
   * @exception MiningException can't calculate statistics
   */
  public void runCalculation(boolean advanced) throws MiningException {
    if (inputStream == null) {
      throw new MiningException("no mining input stream set");
    }

    // Abbreviations:
    MiningDataSpecification metaData = inputStream.getMetaData();
    int nAtt = metaData.getAttributesNumber();

    // Initializations:
    sums = new double[nAtt];
    mins = new double[nAtt];
    maxs = new double[nAtt];
    meanModes = new double[nAtt];
    entropies = new double[nAtt];
    nCounts = new int[nAtt];
    nTotalCount = 0;
    // Drop variances of an earlier advanced run so they cannot be mistaken
    // for results of this run.
    variances = null;

    // Attribute kinds are resolved once instead of once per data cell.
    // catNumbs[i] stays null for attributes that are not categorical.
    boolean[] isNumeric = new boolean[nAtt];
    IntVector[] catNumbs = new IntVector[nAtt];
    for (int i = 0; i < nAtt; i++) {
      MiningAttribute attribute = metaData.getMiningAttribute(i);
      isNumeric[i] = attribute instanceof NumericAttribute;
      if (attribute instanceof CategoricalAttribute) {
        int nCateg = ((CategoricalAttribute) attribute).getCategoriesNumber();
        catNumbs[i] = new IntVector(nCateg);
        for (int j = 0; j < nCateg; j++) {
          catNumbs[i].addElement(0);
        }
      }
    }

    // Data scan:
    inputStream.reset();
    while (inputStream.next()) {
      MiningVector miningVector = inputStream.read();
      for (int i = 0; i < nAtt; i++) {
        double value = miningVector.getValue(i);
        if (Category.isMissingValue(value)) {
          continue;
        }
        if (isNumeric[i]) {
          // Sum:
          sums[i] = sums[i] + value;
          // Min and max; the first value replaces the initial zeros:
          if (nCounts[i] == 0) {
            mins[i] = value;
            maxs[i] = value;
          }
          else if (value < mins[i]) {
            mins[i] = value;
          }
          else if (value > maxs[i]) {
            maxs[i] = value;
          }
          // Count:
          nCounts[i] = nCounts[i] + 1;
        }
        else if (catNumbs[i] != null) {
          // The value is used as the category index.
          int j = (int) value;
          if (j < 0) {
            throw new MiningException("invalid category index " + j
                                      + " for attribute " + i);
          }
          // Categories that appeared after initialization: grow the counters
          // up to and including index j.
          while (catNumbs[i].size() <= j) {
            catNumbs[i].addElement(0);
          }
          catNumbs[i].setElementAt(catNumbs[i].IntegerAt(j) + 1, j);
          nCounts[i] = nCounts[i] + 1;
        }
      }
      nTotalCount = nTotalCount + 1;
    }

    // Calculate means, modes, and entropies:
    for (int i = 0; i < nAtt; i++) {
      if (isNumeric[i]) {
        if (nCounts[i] > 0) {
          meanModes[i] = sums[i] / nCounts[i];
        }
        else {
          meanModes[i] = 0.0;
        }
      }
      else if (catNumbs[i] != null) {
        int max = 0;
        int ind = 0;
        double entropy = 0.0;
        for (int j = 0; j < catNumbs[i].size(); j++) {
          int ncat = catNumbs[i].IntegerAt(j);
          // A category that never occurred contributes nothing to the
          // entropy; evaluating 0 * log(0) would yield NaN instead.
          if (ncat == 0) {
            continue;
          }
          if (ncat > max) {
            max = ncat;
            ind = j;
          }
          // ncat > 0 implies nCounts[i] > 0, so the division is safe.
          double p = ((double) ncat) / nCounts[i];
          entropy = entropy - p * Math.log(p) / LOG_2;
        }
        meanModes[i] = ind;
        entropies[i] = entropy;
      }
    }
    inputStream.reset();
    if (!advanced) {
      return;
    }

    //-----Advanced statistics requiring more than one scan through data-----
    // Accumulate locally and publish only once the second scan succeeded.
    double[] squaredDeviations = new double[nAtt];

    // Second data scan:
    while (inputStream.next()) {
      MiningVector miningVector = inputStream.read();
      for (int i = 0; i < nAtt; i++) {
        if (!isNumeric[i]) {
          continue;
        }
        double value = miningVector.getValue(i);
        if (!Category.isMissingValue(value)) {
          double delta = value - meanModes[i];
          squaredDeviations[i] = squaredDeviations[i] + delta * delta;
        }
      }
    }

    // Calculate variances (divisor is the number of non-missing values):
    for (int i = 0; i < nAtt; i++) {
      if (isNumeric[i]) {
        if (nCounts[i] > 1) {
          squaredDeviations[i] = squaredDeviations[i] / nCounts[i];
        }
        else {
          squaredDeviations[i] = 0.0;
        }
      }
    }
    variances = squaredDeviations;
    inputStream.reset();
  }
}
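/*
 * Code-viewer residue (not part of the source), translated:
 * Keyboard shortcuts
 *   Copy code:            Ctrl + C
 *   Search code:          Ctrl + F
 *   Fullscreen mode:      F11
 *   Toggle theme:         Ctrl + Shift + D
 *   Show shortcuts:       ?
 *   Increase font size:   Ctrl + =
 *   Decrease font size:   Ctrl + -
 */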