⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 entropyoutlier.java

📁 查找孤立点 JAVA代码
💻 JAVA
字号:
package EntropyOutlier;

/**
 * <p>Title: An Optimization Model for Outlier Detection in Categorical Data Set</p>
 * <p>Description: Parition the data set into two parts: 1)outliers and 2) normal data
 * <p>To this goal, the normal data should have minimal "disorder", i.e., its entropy is minimized
 * <p>Hence, we use a iterative clustering-like algorithm to find feasible solution</p>
 * <p>Copyright: Copyright (c) 2005-03-28</p>
 * <p>Company: HIT</p>
 * @author Zengyou He
 * @version 1.0
 */

import java.io.*;
import java.util.*;

public class EntropyOutlier {

  private String[][] data;
  private int colCount, rowCount, k;

  private Hashtable[] density;
  private int[] outlierSet;
  private double sum;
  private int iteration;
  private int swap;

  public EntropyOutlier(int k, String path)
  {
    this.k = k;
    outlierSet = new int[k];
    ReadData read = new ReadData(path);
    data = read.readFromDataFile();
    colCount = read.getColCount();
    rowCount = read.getRowCount();
    sum = rowCount -k;                 //!!!!!!!!!
    density = new Hashtable[colCount];
    iteration=0;
    swap=0;

    for (int j = 0; j < colCount; j++) {
      density[j] = new Hashtable();
    }
  }

  public static void main(String[] args)
  {
    double time1 = (double) System.currentTimeMillis();


    String path = "lymphdt";
    EntropyOutlier eo = new EntropyOutlier(100,path);
    //eo.run();
    eo.greedyAlg();

    double time2 = (double) System.currentTimeMillis();
    System.out.println("EOutlier finished in "+ (time2-time1)+" seconds");

  }

  /*  select first k tuple as initial outliers             */
  public void init_outlier_set() {
    for (int i = 0; i < k; i++) {
      data[i][colCount] = "1"; // "1" denotes that it is an outlier
      outlierSet[i] = i;
    }
    for (int i = k; i < rowCount; i++) {
      data[i][colCount] = "0";
      for (int m = 1; m < colCount; m++) {
        Integer itTmp = (Integer) (density[m].get(data[i][m])); //
        if (itTmp != null) {
          int intTmp = ( (Integer) (density[m].get(data[i][m]))).intValue() + 1;
          density[m].put(data[i][m], new Integer(intTmp));
        }
        else {
          density[m].put(data[i][m], new Integer(1));
        }
      }
    }
  }

  public String[][] iterate() {
    boolean hasSwap = true;
    while (hasSwap) {
      hasSwap = false;
      iteration++;
      System.out.println(iteration);

      for (int i = 0; i < rowCount; i++) {
        //System.out.println(i);
        if(data[i][colCount]!="1"){
          double tmp = 1000000;
          int tmpPos = -1;
          for(int j=0;j<k;j++){
            double cur =getChangedObjectValue(outlierSet[j],i);
            if(tmp>cur){
              tmp=cur;
              tmpPos = j;
            }
          }
          if(tmp<0){
            hasSwap = true;
            swap(outlierSet[tmpPos],i,tmpPos);
            swap++;
          }
        }
      }
    }
    return data;
  }

  public double getChangedObjectValue(int outlierIndex,int nonOutlierIndex) {

    double delta = 0;
    for (int m = 1; m < colCount; m++) {

      Integer itTmpO = (Integer) (density[m].get(data[outlierIndex][m]));
      //itTmpNO must not be null!!
      Integer itTmpNO = (Integer) (density[m].get(data[nonOutlierIndex][m]));
      double v_decrease = 0;

      try{
        v_decrease = itTmpNO.intValue() - 1;
      }
      catch(Exception e){
        //System.out.println(e.getMessage());
        System.out.println("===================");
        System.out.println(data[nonOutlierIndex][m]);
        System.out.println(m);
        System.out.println(itTmpNO);
        System.out.println(outlierIndex+"!="+nonOutlierIndex);
        System.out.println(density[m]);
      }

      double v_increase = 1;
      if (itTmpO != null) {
        v_increase = itTmpO.intValue()+1;
      }

      if(v_increase==1){
        delta = delta +(-(v_increase/sum)*Math.log(v_increase/sum));
      }
      else{
        delta = delta +(-(v_increase/sum)*Math.log(v_increase/sum))
            -(-((v_increase-1)/sum)*Math.log((v_increase-1)/sum));
      }

      if(v_decrease==0){
        delta = delta -(-((v_decrease+1)/sum)*Math.log((v_decrease+1)/sum));
      }
      else{
        delta = delta +(-(v_decrease/sum)*Math.log(v_decrease/sum))
            -(-((v_decrease+1)/sum)*Math.log((v_decrease+1)/sum));
      }
    }
    return delta;
  }

  public void swap(int outlierIndex,int nonOutlierIndex,int oSetIndex) {
    for (int m = 1; m < colCount; m++) {
      Integer itTmpO = (Integer) (density[m].get(data[outlierIndex][m]));
      Integer itTmpNO = (Integer) (density[m].get(data[nonOutlierIndex][m]));
      if (itTmpO != null) {
        density[m].put(data[outlierIndex][m], new Integer(itTmpO.intValue() + 1));
      }
      else {
        density[m].put(data[outlierIndex][m], new Integer(1));
      }

      int decreased_value = itTmpNO.intValue()-1;
      //if(decreased_value==0){
      //  density[m].remove(data[nonOutlierIndex][m]);
      //}
      //else{
        density[m].put(data[nonOutlierIndex][m],new Integer(decreased_value));
      //}
    }
     data[nonOutlierIndex][colCount] = "1";
     data[outlierIndex][colCount] = "0";
     outlierSet[oSetIndex] = nonOutlierIndex;
  }

  public void run() {
    //System.out.println(data[0][9]);
    init_outlier_set();
    iterate();
    System.out.println("Number of iterations:"+iteration);
    System.out.println("Number of swaps:"+swap);
    int rareNum = 0;
    for(int i=0;i<k;i++){
      System.out.println(data[outlierSet[i]][0]);
      if(data[outlierSet[i]][0].startsWith("1") || data[outlierSet[i]][0].startsWith("4")){
        rareNum++;
      }
      /*
      if(data[outlierSet[i]][0].startsWith("3") || data[outlierSet[i]][0].startsWith("4") || data[outlierSet[i]][0].startsWith("5") ||
         data[outlierSet[i]][0].startsWith("7") || data[outlierSet[i]][0].startsWith("8") || data[outlierSet[i]][0].startsWith("9") ||
         data[outlierSet[i]][0].startsWith("14") || data[outlierSet[i]][0].startsWith("15")){
        rareNum++;
      }*/


    }
    System.out.println("Number of identified outliers:"+rareNum +" among "+ k +" objects");
  }


  //general grredy algorithm
  public void greedyAlg() {

    //Step 1: Intialization
    for (int i = 0; i < rowCount; i++) {
      data[i][colCount] = "0";
      for (int m = 1; m < colCount; m++) {
        Integer itTmp = (Integer) (density[m].get(data[i][m])); //
        if (itTmp != null) {
          int intTmp = ( (Integer) (density[m].get(data[i][m]))).intValue() + 1;
          density[m].put(data[i][m], new Integer(intTmp));
        }
        else {
          density[m].put(data[i][m], new Integer(1));
        }
      }
    }

    //Step 2: Greedy Procedure
    int counter = 0;
    while (counter<k) {
      counter++;
      int remained = rowCount-counter;
      System.out.println(counter);
      double tmp = 1000000;
      int tmpPos = -1;
      for (int i = 0; i < rowCount; i++) {
        if(data[i][colCount]!="1"){
        //computing the decreased entropy value
        double delta =0;
          for (int m = 1; m < colCount; m++) {
            //itTmpNO must not be null!!
            Integer itTmpNO = (Integer) (density[m].get(data[i][m]));
            double v_decrease = itTmpNO.intValue() - 1;
            if(v_decrease==0){
              delta = delta -(-((v_decrease+1)/remained)*Math.log((v_decrease+1)/remained));
            }
            else{
              delta = delta +(-(v_decrease/remained)*Math.log(v_decrease/remained))
                  -(-((v_decrease+1)/remained)*Math.log((v_decrease+1)/remained));
            }
          }
          if(tmp>delta){
            tmp=delta;
            tmpPos = i;
          }
        }
      }
      data[tmpPos][colCount] = "1";
    }

    //Step 3: Output top-k outliers
    int rareNum = 0;
    for(int i=0;i<rowCount;i++){
      if(data[i][colCount]=="1"){
        System.out.println(data[i][0]);
        if(data[i][0].startsWith("2") || data[i][0].startsWith("2")){
          rareNum++;
        }

         /*if(data[i][0].startsWith("3") || data[i][0].startsWith("4") || data[i][0].startsWith("5") ||
            data[i][0].startsWith("7") || data[i][0].startsWith("8") || data[i][0].startsWith("9") ||
            data[i][0].startsWith("14") || data[i][0].startsWith("15")){
           rareNum++;
         }*/
      }
    }
    System.out.println("Number of identified outliers:"+rareNum +" among "+ k +" objects");

  }


  


}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -