/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Valentine Stepanenko (valentine.stepanenko@zsoft.ru)
* @author Victor Borichev
* @version 1.0
*/
package com.prudsys.pdm.Models.Statistics;
import com.prudsys.pdm.Adapters.PmmlVersion20.Array;
import com.prudsys.pdm.Adapters.PmmlVersion20.Counts;
import com.prudsys.pdm.Adapters.PmmlVersion20.DiscrStats;
import com.prudsys.pdm.Adapters.PmmlVersion20.ModelStats;
import com.prudsys.pdm.Adapters.PmmlVersion20.NumericInfo;
import com.prudsys.pdm.Adapters.PmmlVersion20.Quantile;
import com.prudsys.pdm.Adapters.PmmlVersion20.UnivariateStats;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningTreeNode;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Models.Statistics.Algorithms.Statistics;
/**
 * Calculates statistics for one group.
 *
 * The special group types are subclasses of this class, namely:
 * AllGroup (no grouping),
 * CategoricalGroup (categorical grouping attribute),
 * NumericGroup (numeric grouping attribute),
 * TimeGroup (special numeric time attribute).
 *
 * The Group class extends MiningTreeNode to allow
 * multidimensional grouping (OLAP).
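 *
 * A minimal usage sketch of the two-pass flow implemented below; the concrete
 * subclass, the data array and the MiningVector.getValue(int) accessor are
 * assumptions for illustration, not prescribed by this class:
 * <pre>
 *   Group group = ...;                        // e.g. an AllGroup or CategoricalGroup instance
 *   MiningVector[] data = ...;                // vectors to analyze
 *   int attr = ...;                           // index of the analyzed attribute
 *   group.initSimpleStat();
 *   for (int i = 0; i < data.length; i++)
 *       group.processVector(data[i], data[i].getValue(attr), Statistics.NUMERIC);
 *   group.computeSimpleStat();                // mean, min, max, range
 *   group.initAdvancedStat();
 *   for (int i = 0; i < data.length; i++)
 *       group.advancedProcessVector(data[i], data[i].getValue(attr), Statistics.NUMERIC);
 *   group.computeAdvancedStat();              // quartiles, variance, skewness
 * </pre>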
*/
public abstract class Group extends MiningTreeNode {
public MiningDataSpecification metaData;
int groupAttr;
// parent, children and the leaf flag are inherited from MiningTreeNode
long count, missCount;
// categorical values
java.util.Hashtable categories;
// univariate statistics
double sum, min, max, mean;
double quart25,quart50,quart75;
double variance,standart,skewness,excess,range,quartRange,var2,var3,var4,varcoeff,meanmed;
// multivariate statistics
boolean meanCounted;
double xmean,ymean,xsum,ysum,xsum2,ysum2,xysum;
double correlation;
java.util.Hashtable contingency;
java.util.Vector values;
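/**
 * Creates a leaf group below the given parent for the given grouping attribute.
 *
 * @param parent    parent group
 * @param groupAttr index of the grouping attribute
 */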
public Group(Group parent, int groupAttr) {
this.parent = parent;
this.groupAttr = groupAttr;
leaf = true;
}
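/**
 * Sets the child groups, registers this group as their parent and marks
 * this group as a non-leaf node.
 *
 * @param children child groups
 */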
public void setChildren(Group[] children) {
this.children = children;
for(int i=0;i<children.length;i++)
children[i].parent = this;
leaf = false;
}/*
void setParent(Group parent) {
this.parent = parent;
}*/
public int getGroupAttribute() {
return groupAttr;
}/*
public int getChildCount() {
return children.length;
}
public Group getChildAt(int index) {
return children[index];
}
public Group getParent() {
return parent;
}
public boolean isLeaf() {
if(children==null||children.length == 0) return true;
else return false;
}*/
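/**
 * Resets the simple univariate statistics (count, missing count, sum, mean,
 * minimum, maximum, category counters) in this group and all child groups.
 */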
public void initSimpleStat() {
count = missCount = 0;
sum = 0.;
mean = Double.NaN;
min = Double.MAX_VALUE;
max = -Double.MAX_VALUE;
categories = null;
if(children!=null&&children.length!=0)
{
for(int i=0;i<children.length;i++)
((Group)children[i]).initSimpleStat();
}
}
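/**
 * Resets the advanced univariate statistics (quartiles, variance, standard
 * deviation, skewness, excess) in this group and all child groups.
 */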
public void initAdvancedStat() {
quart25 = quart50 = quart75 = Double.NaN;
variance = var2 = var3 = var4 = 0.;
skewness = excess = standart = varcoeff = meanmed = Double.NaN;
if(children!=null&&children.length!=0)
{
for(int i=0;i<children.length;i++)
((Group)children[i]).initAdvancedStat();
}
}
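/**
 * Resets the bivariate numeric statistics (means, centered sums and
 * correlation) in this group and all child groups.
 */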
public void initMultiStatNum() {
meanCounted = false;
count = 0;
xmean = ymean = Double.NaN;
xsum = ysum = xsum2 = ysum2 = xysum = 0.;
correlation = Double.NaN;
if(children!=null&&children.length!=0)
{
for(int i=0;i<children.length;i++)
((Group)children[i]).initMultiStatNum();
}
}
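/**
 * Updates the simple statistics of this group, and recursively of the child
 * group the vector falls into, with one mining vector.
 *
 * @param vec       vector to process
 * @param statValue value of the analyzed attribute in vec; NaN is counted as missing
 * @param attrType  Statistics.NUMERIC for numeric attributes, Statistics.CATEGORICAL otherwise
 * @return true if the vector was assigned to this group and, if children exist, to one of them
 */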
public boolean processVector(MiningVector vec, double statValue, int attrType) {
if(notInGroup(vec)) return false;
count++;
if(!Double.isNaN(statValue))
{
if(attrType==Statistics.NUMERIC)
{
sum += statValue;
if(statValue<min) min = statValue;
if(statValue>max) max = statValue;
}
else
{
if(categories == null) categories = new java.util.Hashtable();
Double category = new Double(statValue);
Long counter = (Long)categories.get(category);
if(counter == null) categories.put(category,new Long(1));
else categories.put(category,new Long(counter.longValue()+1));
}
}
else missCount++;
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).processVector(vec,statValue,attrType)) return true;
return false;
}
return true;
}
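/**
 * Derives mean and range from the sums and extrema accumulated by
 * processVector, in this group and all child groups.
 */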
public void computeSimpleStat() {
mean = sum/(double)count;
range = max - min;
if(children!=null&&children.length!=0)
for(int i=0;i<children.length;i++)
((Group)children[i]).computeSimpleStat();
}
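/**
 * Second statistics pass: stores the attribute value for the quartile
 * computation and accumulates the central moments around the mean of the
 * first pass.
 *
 * @return true if the vector was assigned to this group and, if children exist, to one of them
 */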
public boolean advancedProcessVector(MiningVector vec, double statValue, int attrType) {
if(notInGroup(vec)) return false;
if(values == null) values = new java.util.Vector();
values.add(new Double(statValue));
if(attrType==Statistics.NUMERIC)
{
// central moments around the mean of the first pass; they feed the
// variance and skewness computed in computeAdvancedStat()
double diff = statValue - mean;
double diff2 = diff*diff;
double diff3 = diff2*diff;
double diff4 = diff3*diff;
variance += diff;
var2 += diff2;
var3 += diff3;
var4 += diff4;
}
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).advancedProcessVector(vec,statValue,attrType)) return true;
return false;
}
return true;
}
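/**
 * Derives quartiles, interquartile range, variance, standard deviation,
 * skewness and variation coefficient from the values collected by
 * advancedProcessVector, in this group and all child groups.
 */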
public void computeAdvancedStat() {
if(values!=null)
{
int size = values.size();
double[] vals = new double[size];
for(int i=0;i<size;i++) vals[i] = ((Double)values.get(i)).doubleValue();
java.util.Arrays.sort(vals);
size--;
quart25 = vals[size/4];
quart50 = vals[size/2];
quart75 = vals[size*3/4];
quartRange = quart75 - quart25;
}
double N = (double)count;
// sample variance from the sum of squared deviations
variance = var2/(N-1.);
standart = java.lang.Math.sqrt(variance);
// skewness: third central moment normalized by the 3/2 power of the second
double sk = var2/N;
skewness = var3/(N*java.lang.Math.sqrt(sk*sk*sk));
varcoeff = standart/mean;
values = null;
if(children!=null&&children.length!=0)
for(int i=0;i<children.length;i++)
((Group)children[i]).computeAdvancedStat();
}
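/**
 * Updates the contingency table of two categorical attributes with one
 * (x, y) value pair.
 */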
public boolean multiProcessVectorCat(MiningVector vec, double x, double y) {
if(notInGroup(vec)) return false;
if(contingency == null) contingency = new java.util.Hashtable();
ContingencyEntry entry = new ContingencyEntry(x,y);
ContingencyEntry old = (ContingencyEntry)contingency.get(entry);
if(old!=null)
{
old.increment();
}
else
{
entry.increment();
contingency.put(entry,entry);
}
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).multiProcessVectorCat(vec,x,y)) return true;
return false;
}
return true;
}
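/**
 * First bivariate pass for two numeric attributes: accumulates the sums
 * of x and y needed for their means.
 */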
public boolean multiProcessVectorNum(MiningVector vec, double x, double y) {
if(notInGroup(vec)) return false;
count++;
xsum += x; ysum += y;
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).multiProcessVectorNum(vec,x,y)) return true;
return false;
}
return true;
}
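/**
 * Two-phase computation: the first call derives the means of x and y from
 * the sums of the first pass, the second call (after multiProcessVectorNum2)
 * derives the correlation coefficient.
 */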
public void computeMultiNum() {
if(!meanCounted)
{
xmean = xsum / (double)count;
ymean = ysum / (double)count;
meanCounted = true;
}
else
{
correlation = xysum / Math.sqrt(xsum2*ysum2);
}
if(children!=null&&children.length!=0)
for(int i=0;i<children.length;i++)
((Group)children[i]).computeMultiNum();
}
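/**
 * Second bivariate pass for two numeric attributes: accumulates the centered
 * cross products and squared deviations needed for the correlation coefficient.
 */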
public boolean multiProcessVectorNum2(MiningVector vec, double x, double y) {
if(notInGroup(vec)) return false;
double xm = x - xmean;
double ym = y - ymean;
xysum += xm*ym;
xsum2 += xm*xm; ysum2 += ym*ym;
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).multiProcessVectorNum2(vec,x,y)) return true;
return false;
}
return true;
}
public long getCount() {
return count;
}
// univariate
public double getMean() {
return mean;
}
public double getMin() {
return min;
}
public double getMax() {
return max;
}
public double getRange() {
return range;
}
public double getSum() {
return sum;
}
public double getQuart25() {
return quart25;
}
public double getQuart50() {
return quart50;
}
public double getQuart75() {
return quart75;
}
public double getVariance() {
return variance;
}
public double getStandart() {
return standart;
}
public double getSkewness() {
return skewness;
}
public double getExcess() {
return excess;
}
public double getQuartRange() {
return quartRange;
}
public double getVarCoeff() {
return varcoeff;
}
// multivariate
public double getCorrelation() {
return correlation;
}
public java.util.Enumeration getContingencyTable() {
if(contingency!=null) return contingency.keys();
return null;
}
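/**
 * Exports the statistics of this group as a PMML 2.0 ModelStats element
 * with one UnivariateStats entry for the given attribute.
 *
 * @param attribute attribute the statistics were computed for
 * @param featType  1 for continuous (numeric) statistics, otherwise categorical
 * @return the PMML model statistics
 */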
public ModelStats createPmmlObject(MiningAttribute attribute, int featType) {
ModelStats stat = new ModelStats();
UnivariateStats uni = new UnivariateStats();
stat.addUnivariateStats(uni);
uni.setField(attribute.getName());
if(featType == 1) // continuous
{
NumericInfo num = new NumericInfo();
num.setMinimum(Double.toString(min));
num.setMaximum(Double.toString(max));
num.setMean(Double.toString(mean));
num.setMedian(Double.toString(quart50));
num.setStandardDeviation(Double.toString(standart));
num.setInterQuartileRange(Double.toString(quartRange));
Quantile quant = new Quantile();
quant.setQuantileLimit("25");
quant.setQuantileValue(Double.toString(quart25));
num.addQuantile(quant);
quant = new Quantile();
quant.setQuantileLimit("75");
quant.setQuantileValue(Double.toString(quart75));
num.addQuantile(quant);
uni.setNumericInfo(num);
}
else // categorical
{
Counts counts = new Counts();
counts.setTotalFreq(Long.toString(count));
counts.setMissingFreq(Long.toString(missCount));
counts.setInvalidFreq("0");
uni.setCounts(counts);
DiscrStats discr = new DiscrStats();
int size = categories.size();
long[] counters = new long[size];
double[] vals = new double[size];
int i=0;
java.util.Enumeration em = categories.keys();
while(em.hasMoreElements())
{
Double key = (Double)em.nextElement();
vals[i] = key.doubleValue();
counters[i++] = ((Long)categories.get(key)).longValue();
}
CategoricalAttribute categorical = (CategoricalAttribute)attribute;
Category category;
long max = 0;
int maxVal = 0;
StringBuffer strBuffer = new StringBuffer();
StringBuffer numBuffer = new StringBuffer();
for(i=0;i<size;i++)
{
if(max < counters[i])
{
max = counters[i];
maxVal = i;
}
category = (Category)categorical.getCategory(vals[i]);
strBuffer.append( "\'" + category.getDisplayValue() + "\'" +" ");
numBuffer.append(Long.toString(counters[i])+" ");
}
category = (Category)categorical.getCategory(vals[maxVal]);
discr.setModalValue(category.getDisplayValue());
Array strArr = new Array(strBuffer.toString());
strArr.setN(Integer.toString(size));
strArr.setType("string");
Array numArr = new Array(numBuffer.toString());
numArr.setN(Integer.toString(size));
numArr.setType("int");
discr.setArray(strArr);
discr.setArray1(numArr);
uni.setDiscrStats(discr);
}
return stat;
}
public String toString() {
return "Group with "+(children==null ? 0 : children.length)+" children";
}
/**
* Checks group for equality just by using the group attributes.
*
* @param obj object to be compared with
* @return true if equal, otherwise false
*/
public boolean equals(Object obj) {
if(!(obj instanceof Group)) return false;
Group group = (Group)obj;
return group.groupAttr == groupAttr;
}
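/**
 * Checks whether a mining vector does not belong to this group.
 * Implemented by the concrete group types.
 *
 * @param vec vector to check
 * @return true if the vector is not in this group
 */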
protected abstract boolean notInGroup(MiningVector vec);
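/**
 * Creates a copy of this group.
 *
 * @return the copied group
 */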
public abstract Group makeCopy();
}