📄 entropyoutlier.java
字号:
package EntropyOutlier;
/**
* <p>Title: An Optimization Model for Outlier Detection in Categorical Data Set</p>
* <p>Description: Parition the data set into two parts: 1)outliers and 2) normal data
* <p>To this goal, the normal data should have minimal "disorder", i.e., its entropy is minimized
* <p>Hence, we use a iterative clustering-like algorithm to find feasible solution</p>
* <p>Copyright: Copyright (c) 2005-03-28</p>
* <p>Company: HIT</p>
* @author Zengyou He
* @version 1.0
*/
import java.io.*;
import java.util.*;
public class EntropyOutlier {
private String[][] data;
private int colCount, rowCount, k;
private Hashtable[] density;
private int[] outlierSet;
private double sum;
private int iteration;
private int swap;
public EntropyOutlier(int k, String path)
{
this.k = k;
outlierSet = new int[k];
ReadData read = new ReadData(path);
data = read.readFromDataFile();
colCount = read.getColCount();
rowCount = read.getRowCount();
sum = rowCount -k; //!!!!!!!!!
density = new Hashtable[colCount];
iteration=0;
swap=0;
for (int j = 0; j < colCount; j++) {
density[j] = new Hashtable();
}
}
public static void main(String[] args)
{
double time1 = (double) System.currentTimeMillis();
String path = "lymphdt";
EntropyOutlier eo = new EntropyOutlier(100,path);
//eo.run();
eo.greedyAlg();
double time2 = (double) System.currentTimeMillis();
System.out.println("EOutlier finished in "+ (time2-time1)+" seconds");
}
/* select first k tuple as initial outliers */
public void init_outlier_set() {
for (int i = 0; i < k; i++) {
data[i][colCount] = "1"; // "1" denotes that it is an outlier
outlierSet[i] = i;
}
for (int i = k; i < rowCount; i++) {
data[i][colCount] = "0";
for (int m = 1; m < colCount; m++) {
Integer itTmp = (Integer) (density[m].get(data[i][m])); //
if (itTmp != null) {
int intTmp = ( (Integer) (density[m].get(data[i][m]))).intValue() + 1;
density[m].put(data[i][m], new Integer(intTmp));
}
else {
density[m].put(data[i][m], new Integer(1));
}
}
}
}
public String[][] iterate() {
boolean hasSwap = true;
while (hasSwap) {
hasSwap = false;
iteration++;
System.out.println(iteration);
for (int i = 0; i < rowCount; i++) {
//System.out.println(i);
if(data[i][colCount]!="1"){
double tmp = 1000000;
int tmpPos = -1;
for(int j=0;j<k;j++){
double cur =getChangedObjectValue(outlierSet[j],i);
if(tmp>cur){
tmp=cur;
tmpPos = j;
}
}
if(tmp<0){
hasSwap = true;
swap(outlierSet[tmpPos],i,tmpPos);
swap++;
}
}
}
}
return data;
}
public double getChangedObjectValue(int outlierIndex,int nonOutlierIndex) {
double delta = 0;
for (int m = 1; m < colCount; m++) {
Integer itTmpO = (Integer) (density[m].get(data[outlierIndex][m]));
//itTmpNO must not be null!!
Integer itTmpNO = (Integer) (density[m].get(data[nonOutlierIndex][m]));
double v_decrease = 0;
try{
v_decrease = itTmpNO.intValue() - 1;
}
catch(Exception e){
//System.out.println(e.getMessage());
System.out.println("===================");
System.out.println(data[nonOutlierIndex][m]);
System.out.println(m);
System.out.println(itTmpNO);
System.out.println(outlierIndex+"!="+nonOutlierIndex);
System.out.println(density[m]);
}
double v_increase = 1;
if (itTmpO != null) {
v_increase = itTmpO.intValue()+1;
}
if(v_increase==1){
delta = delta +(-(v_increase/sum)*Math.log(v_increase/sum));
}
else{
delta = delta +(-(v_increase/sum)*Math.log(v_increase/sum))
-(-((v_increase-1)/sum)*Math.log((v_increase-1)/sum));
}
if(v_decrease==0){
delta = delta -(-((v_decrease+1)/sum)*Math.log((v_decrease+1)/sum));
}
else{
delta = delta +(-(v_decrease/sum)*Math.log(v_decrease/sum))
-(-((v_decrease+1)/sum)*Math.log((v_decrease+1)/sum));
}
}
return delta;
}
public void swap(int outlierIndex,int nonOutlierIndex,int oSetIndex) {
for (int m = 1; m < colCount; m++) {
Integer itTmpO = (Integer) (density[m].get(data[outlierIndex][m]));
Integer itTmpNO = (Integer) (density[m].get(data[nonOutlierIndex][m]));
if (itTmpO != null) {
density[m].put(data[outlierIndex][m], new Integer(itTmpO.intValue() + 1));
}
else {
density[m].put(data[outlierIndex][m], new Integer(1));
}
int decreased_value = itTmpNO.intValue()-1;
//if(decreased_value==0){
// density[m].remove(data[nonOutlierIndex][m]);
//}
//else{
density[m].put(data[nonOutlierIndex][m],new Integer(decreased_value));
//}
}
data[nonOutlierIndex][colCount] = "1";
data[outlierIndex][colCount] = "0";
outlierSet[oSetIndex] = nonOutlierIndex;
}
public void run() {
//System.out.println(data[0][9]);
init_outlier_set();
iterate();
System.out.println("Number of iterations:"+iteration);
System.out.println("Number of swaps:"+swap);
int rareNum = 0;
for(int i=0;i<k;i++){
System.out.println(data[outlierSet[i]][0]);
if(data[outlierSet[i]][0].startsWith("1") || data[outlierSet[i]][0].startsWith("4")){
rareNum++;
}
/*
if(data[outlierSet[i]][0].startsWith("3") || data[outlierSet[i]][0].startsWith("4") || data[outlierSet[i]][0].startsWith("5") ||
data[outlierSet[i]][0].startsWith("7") || data[outlierSet[i]][0].startsWith("8") || data[outlierSet[i]][0].startsWith("9") ||
data[outlierSet[i]][0].startsWith("14") || data[outlierSet[i]][0].startsWith("15")){
rareNum++;
}*/
}
System.out.println("Number of identified outliers:"+rareNum +" among "+ k +" objects");
}
//general grredy algorithm
public void greedyAlg() {
//Step 1: Intialization
for (int i = 0; i < rowCount; i++) {
data[i][colCount] = "0";
for (int m = 1; m < colCount; m++) {
Integer itTmp = (Integer) (density[m].get(data[i][m])); //
if (itTmp != null) {
int intTmp = ( (Integer) (density[m].get(data[i][m]))).intValue() + 1;
density[m].put(data[i][m], new Integer(intTmp));
}
else {
density[m].put(data[i][m], new Integer(1));
}
}
}
//Step 2: Greedy Procedure
int counter = 0;
while (counter<k) {
counter++;
int remained = rowCount-counter;
System.out.println(counter);
double tmp = 1000000;
int tmpPos = -1;
for (int i = 0; i < rowCount; i++) {
if(data[i][colCount]!="1"){
//computing the decreased entropy value
double delta =0;
for (int m = 1; m < colCount; m++) {
//itTmpNO must not be null!!
Integer itTmpNO = (Integer) (density[m].get(data[i][m]));
double v_decrease = itTmpNO.intValue() - 1;
if(v_decrease==0){
delta = delta -(-((v_decrease+1)/remained)*Math.log((v_decrease+1)/remained));
}
else{
delta = delta +(-(v_decrease/remained)*Math.log(v_decrease/remained))
-(-((v_decrease+1)/remained)*Math.log((v_decrease+1)/remained));
}
}
if(tmp>delta){
tmp=delta;
tmpPos = i;
}
}
}
data[tmpPos][colCount] = "1";
}
//Step 3: Output top-k outliers
int rareNum = 0;
for(int i=0;i<rowCount;i++){
if(data[i][colCount]=="1"){
System.out.println(data[i][0]);
if(data[i][0].startsWith("2") || data[i][0].startsWith("2")){
rareNum++;
}
/*if(data[i][0].startsWith("3") || data[i][0].startsWith("4") || data[i][0].startsWith("5") ||
data[i][0].startsWith("7") || data[i][0].startsWith("8") || data[i][0].startsWith("9") ||
data[i][0].startsWith("14") || data[i][0].startsWith("15")){
rareNum++;
}*/
}
}
System.out.println("Number of identified outliers:"+rareNum +" among "+ k +" objects");
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -