/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Valentine Stepanenko (valentine.stepanenko@zsoft.ru)
* @author Victor Borichev
* @version 1.0
*/
package com.prudsys.pdm.Models.Statistics;
import com.prudsys.pdm.Adapters.PmmlVersion20.Array;
import com.prudsys.pdm.Adapters.PmmlVersion20.Counts;
import com.prudsys.pdm.Adapters.PmmlVersion20.DiscrStats;
import com.prudsys.pdm.Adapters.PmmlVersion20.ModelStats;
import com.prudsys.pdm.Adapters.PmmlVersion20.NumericInfo;
import com.prudsys.pdm.Adapters.PmmlVersion20.Quantile;
import com.prudsys.pdm.Adapters.PmmlVersion20.UnivariateStats;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningTreeNode;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Models.Statistics.Algorithms.Statistics;
/**
 * Calculates statistics for one group.
 *
 * The special group types are subclasses of this class, namely:
 * AllGroup (no grouping),
 * CategoricalGroup (categorical grouping attribute),
 * NumericGroup (numeric grouping attribute),
 * TimeGroup (special numeric time attribute).
 *
 * The Group class extends MiningTreeNode to allow
 * multidimensional grouping (OLAP).
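 *
 * A minimal usage sketch of the two-pass flow implemented below; the concrete
 * subclass, the data array and the MiningVector.getValue(int) accessor are
 * assumptions for illustration, not prescribed by this class:
 * <pre>
 *   Group group = ...;                        // e.g. an AllGroup or CategoricalGroup instance
 *   MiningVector[] data = ...;                // vectors to analyze
 *   int attr = ...;                           // index of the analyzed attribute
 *   group.initSimpleStat();
 *   for (int i = 0; i < data.length; i++)
 *       group.processVector(data[i], data[i].getValue(attr), Statistics.NUMERIC);
 *   group.computeSimpleStat();                // mean, min, max, range
 *   group.initAdvancedStat();
 *   for (int i = 0; i < data.length; i++)
 *       group.advancedProcessVector(data[i], data[i].getValue(attr), Statistics.NUMERIC);
 *   group.computeAdvancedStat();              // quartiles, variance, skewness
 * </pre>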
*/
public abstract class Group extends MiningTreeNode {
public MiningDataSpecification metaData;
int groupAttr;
// parent, children and the leaf flag are inherited from MiningTreeNode
long count, missCount;
// categorical values
java.util.Hashtable categories;
// univariate statistics
double sum, min, max, mean;
double quart25,quart50,quart75;
double variance,standart,skewness,excess,range,quartRange,var2,var3,var4,varcoeff,meanmed;
// multivariate statistics
boolean meanCounted;
double xmean,ymean,xsum,ysum,xsum2,ysum2,xysum;
double correlation;
java.util.Hashtable contingency;
java.util.Vector values;
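/**
 * Creates a leaf group below the given parent for the given grouping attribute.
 *
 * @param parent    parent group
 * @param groupAttr index of the grouping attribute
 */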
public Group(Group parent, int groupAttr) {
this.parent = parent;
this.groupAttr = groupAttr;
leaf = true;
}
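/**
 * Sets the child groups, registers this group as their parent and marks
 * this group as a non-leaf node.
 *
 * @param children child groups
 */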
public void setChildren(Group[] children) {
this.children = children;
for(int i=0;i<children.length;i++)
children[i].parent = this;
leaf = false;
}/*
void setParent(Group parent) {
this.parent = parent;
}*/
public int getGroupAttribute() {
return groupAttr;
}/*
public int getChildCount() {
return children.length;
}
public Group getChildAt(int index) {
return children[index];
}
public Group getParent() {
return parent;
}
public boolean isLeaf() {
if(children==null||children.length == 0) return true;
else return false;
}*/
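/**
 * Resets the simple univariate statistics (count, missing count, sum, mean,
 * minimum, maximum, category counters) in this group and all child groups.
 */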
public void initSimpleStat() {
count = missCount = 0;
sum = 0.;
mean = Double.NaN;
min = Double.MAX_VALUE;
max = -Double.MAX_VALUE;
categories = null;
if(children!=null&&children.length!=0)
{
for(int i=0;i<children.length;i++)
((Group)children[i]).initSimpleStat();
}
}
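/**
 * Resets the advanced univariate statistics (quartiles, variance, standard
 * deviation, skewness, excess) in this group and all child groups.
 */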
public void initAdvancedStat() {
quart25 = quart50 = quart75 = Double.NaN;
variance = var2 = var3 = var4 = 0.;
skewness = excess = standart = varcoeff = meanmed = Double.NaN;
if(children!=null&&children.length!=0)
{
for(int i=0;i<children.length;i++)
((Group)children[i]).initAdvancedStat();
}
}
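/**
 * Resets the bivariate numeric statistics (means, centered sums and
 * correlation) in this group and all child groups.
 */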
public void initMultiStatNum() {
meanCounted = false;
count = 0;
xmean = ymean = Double.NaN;
xsum = ysum = xsum2 = ysum2 = xysum = 0.;
correlation = Double.NaN;
if(children!=null&&children.length!=0)
{
for(int i=0;i<children.length;i++)
((Group)children[i]).initMultiStatNum();
}
}
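/**
 * Updates the simple statistics of this group, and recursively of the child
 * group the vector falls into, with one mining vector.
 *
 * @param vec       vector to process
 * @param statValue value of the analyzed attribute in vec; NaN is counted as missing
 * @param attrType  Statistics.NUMERIC for numeric attributes, Statistics.CATEGORICAL otherwise
 * @return true if the vector was assigned to this group and, if children exist, to one of them
 */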
public boolean processVector(MiningVector vec, double statValue, int attrType) {
if(notInGroup(vec)) return false;
count++;
if(!Double.isNaN(statValue))
{
if(attrType==Statistics.NUMERIC)
{
sum += statValue;
if(statValue<min) min = statValue;
if(statValue>max) max = statValue;
}
else
{
if(categories == null) categories = new java.util.Hashtable();
Double category = new Double(statValue);
Long counter = (Long)categories.get(category);
if(counter == null) categories.put(category,new Long(1));
else categories.put(category,new Long(counter.longValue()+1));
}
}
else missCount++;
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).processVector(vec,statValue,attrType)) return true;
return false;
}
return true;
}
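/**
 * Derives mean and range from the sums and extrema accumulated by
 * processVector, in this group and all child groups.
 */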
public void computeSimpleStat() {
mean = sum/(double)count;
range = max - min;
if(children!=null&&children.length!=0)
for(int i=0;i<children.length;i++)
((Group)children[i]).computeSimpleStat();
}
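/**
 * Second statistics pass: stores the attribute value for the quartile
 * computation and accumulates the central moments around the mean of the
 * first pass.
 *
 * @return true if the vector was assigned to this group and, if children exist, to one of them
 */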
public boolean advancedProcessVector(MiningVector vec, double statValue, int attrType) {
if(notInGroup(vec)) return false;
if(values == null) values = new java.util.Vector();
values.add(new Double(statValue));
if(attrType==Statistics.NUMERIC)
{
// central moments around the mean of the first pass; they feed the
// variance and skewness computed in computeAdvancedStat()
double diff = statValue - mean;
double diff2 = diff*diff;
double diff3 = diff2*diff;
double diff4 = diff3*diff;
variance += diff;
var2 += diff2;
var3 += diff3;
var4 += diff4;
}
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).advancedProcessVector(vec,statValue,attrType)) return true;
return false;
}
return true;
}
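/**
 * Derives quartiles, interquartile range, variance, standard deviation,
 * skewness and variation coefficient from the values collected by
 * advancedProcessVector, in this group and all child groups.
 */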
public void computeAdvancedStat() {
if(values!=null)
{
int size = values.size();
double[] vals = new double[size];
for(int i=0;i<size;i++) vals[i] = ((Double)values.get(i)).doubleValue();
java.util.Arrays.sort(vals);
size--;
quart25 = vals[size/4];
quart50 = vals[size/2];
quart75 = vals[size*3/4];
quartRange = quart75 - quart25;
}
double N = (double)count;
// sample variance from the sum of squared deviations
variance = var2/(N-1.);
standart = java.lang.Math.sqrt(variance);
// skewness: third central moment normalized by the 3/2 power of the second
double sk = var2/N;
skewness = var3/(N*java.lang.Math.sqrt(sk*sk*sk));
varcoeff = standart/mean;
values = null;
if(children!=null&&children.length!=0)
for(int i=0;i<children.length;i++)
((Group)children[i]).computeAdvancedStat();
}
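/**
 * Updates the contingency table of two categorical attributes with one
 * (x, y) value pair.
 */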
public boolean multiProcessVectorCat(MiningVector vec, double x, double y) {
if(notInGroup(vec)) return false;
if(contingency == null) contingency = new java.util.Hashtable();
ContingencyEntry entry = new ContingencyEntry(x,y);
ContingencyEntry old = (ContingencyEntry)contingency.get(entry);
if(old!=null)
{
old.increment();
}
else
{
entry.increment();
contingency.put(entry,entry);
}
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).multiProcessVectorCat(vec,x,y)) return true;
return false;
}
return true;
}
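/**
 * First bivariate pass for two numeric attributes: accumulates the sums
 * of x and y needed for their means.
 */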
public boolean multiProcessVectorNum(MiningVector vec, double x, double y) {
if(notInGroup(vec)) return false;
count++;
xsum += x; ysum += y;
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).multiProcessVectorNum(vec,x,y)) return true;
return false;
}
return true;
}
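/**
 * Two-phase computation: the first call derives the means of x and y from
 * the sums of the first pass, the second call (after multiProcessVectorNum2)
 * derives the correlation coefficient.
 */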
public void computeMultiNum() {
if(!meanCounted)
{
xmean = xsum / (double)count;
ymean = ysum / (double)count;
meanCounted = true;
}
else
{
correlation = xysum / Math.sqrt(xsum2*ysum2);
}
if(children!=null&&children.length!=0)
for(int i=0;i<children.length;i++)
((Group)children[i]).computeMultiNum();
}
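/**
 * Second bivariate pass for two numeric attributes: accumulates the centered
 * cross products and squared deviations needed for the correlation coefficient.
 */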
public boolean multiProcessVectorNum2(MiningVector vec, double x, double y) {
if(notInGroup(vec)) return false;
double xm = x - xmean;
double ym = y - ymean;
xysum += xm*ym;
xsum2 += xm*xm; ysum2 += ym*ym;
if(children!=null&&children.length!=0) {
for(int i=0;i<children.length;i++)
if(((Group)children[i]).multiProcessVectorNum2(vec,x,y)) return true;
return false;
}
return true;
}
public long getCount() {
return count;
}
// univariate
public double getMean() {
return mean;
}
public double getMin() {
return min;
}
public double getMax() {
return max;
}
public double getRange() {
return range;
}
public double getSum() {
return sum;
}
public double getQuart25() {
return quart25;
}
public double getQuart50() {
return quart50;
}
public double getQuart75() {
return quart75;
}
public double getVariance() {
return variance;
}
public double getStandart() {
return standart;
}
public double getSkewness() {
return skewness;
}
public double getExcess() {
return excess;
}
public double getQuartRange() {
return quartRange;
}
public double getVarCoeff() {
return varcoeff;
}
// multivariate
public double getCorrelation() {
return correlation;
}
public java.util.Enumeration getContingencyTable() {
if(contingency!=null) return contingency.keys();
return null;
}
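/**
 * Exports the statistics of this group as a PMML 2.0 ModelStats element
 * with one UnivariateStats entry for the given attribute.
 *
 * @param attribute attribute the statistics were computed for
 * @param featType  1 for continuous (numeric) statistics, otherwise categorical
 * @return the PMML model statistics
 */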
public ModelStats createPmmlObject(MiningAttribute attribute, int featType) {
ModelStats stat = new ModelStats();
UnivariateStats uni = new UnivariateStats();
stat.addUnivariateStats(uni);
uni.setField(attribute.getName());
if(featType == 1) // continuous
{
NumericInfo num = new NumericInfo();
num.setMinimum(Double.toString(min));
num.setMaximum(Double.toString(max));
num.setMean(Double.toString(mean));
num.setMedian(Double.toString(quart50));
num.setStandardDeviation(Double.toString(standart));
num.setInterQuartileRange(Double.toString(quartRange));
Quantile quant = new Quantile();
quant.setQuantileLimit("25");
quant.setQuantileValue(Double.toString(quart25));
num.addQuantile(quant);
quant = new Quantile();
quant.setQuantileLimit("75");
quant.setQuantileValue(Double.toString(quart75));
num.addQuantile(quant);
uni.setNumericInfo(num);
}
else // categorical
{
Counts counts = new Counts();
counts.setTotalFreq(Long.toString(count));
counts.setMissingFreq(Long.toString(missCount));
counts.setInvalidFreq("0");
uni.setCounts(counts);
DiscrStats discr = new DiscrStats();
int size = categories.size();
long[] counters = new long[size];
double[] vals = new double[size];
int i=0;
java.util.Enumeration em = categories.keys();
while(em.hasMoreElements())
{
Double key = (Double)em.nextElement();
vals[i] = key.doubleValue();
counters[i++] = ((Long)categories.get(key)).longValue();
}
CategoricalAttribute categorical = (CategoricalAttribute)attribute;
Category category;
long max = 0;
int maxVal = 0;
StringBuffer strBuffer = new StringBuffer();
StringBuffer numBuffer = new StringBuffer();
for(i=0;i<size;i++)
{
if(max < counters[i])
{
max = counters[i];
maxVal = i;
}
category = (Category)categorical.getCategory(vals[i]);
strBuffer.append( "\'" + category.getDisplayValue() + "\'" +" ");
numBuffer.append(Long.toString(counters[i])+" ");
}
category = (Category)categorical.getCategory(vals[maxVal]);
discr.setModalValue(category.getDisplayValue());
Array strArr = new Array(strBuffer.toString());
strArr.setN(Integer.toString(size));
strArr.setType("string");
Array numArr = new Array(numBuffer.toString());
numArr.setN(Integer.toString(size));
numArr.setType("int");
discr.setArray(strArr);
discr.setArray1(numArr);
uni.setDiscrStats(discr);
}
return stat;
}
public String toString() {
return "Group with "+(children==null ? 0 : children.length)+" children";
}
/**
* Checks group for equality just by using the group attributes.
*
* @param obj object to be compared with
* @return true if equal, otherwise false
*/
public boolean equals(Object obj) {
if(!(obj instanceof Group)) return false;
Group group = (Group)obj;
return group.groupAttr == groupAttr;
}
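/**
 * Checks whether a mining vector does not belong to this group.
 * Implemented by the concrete group types.
 *
 * @param vec vector to check
 * @return true if the vector is not in this group
 */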
protected abstract boolean notInGroup(MiningVector vec);
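/**
 * Creates a copy of this group.
 *
 * @return the copied group
 */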
public abstract Group makeCopy();
}