📄 standarlization.java
字号:
/* created at 2005-12-20 */
package com.clustering.core;
import java.io.CharArrayWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import com.clustering.data.Record;
import com.clustering.data.SimpleSource;
import com.clustering.data.Source;
import com.clustering.data.SourceInfo;
/**
 * Converts a data source into a standardized data source, i.e. maps every attribute value
 * into the range [0, 1].
 *
 * <p>The conversion is done per column in two steps: each value is first turned into a
 * z-score {@code (x - mean) / stdDev} (population standard deviation), and the z-scores of a
 * column are then min-max rescaled with {@code (z - min) / (max - min)}.
 *
 * <p>The result is computed lazily on the first call to
 * {@link #getSimpleStandarlizationSource()} and cached until {@link #setSource(Source)} is
 * called with a different source.
 *
 * @author Avon
 * @version 0.9
 * @since 0.9
 */
public class Standarlization {
    /** The original (non-standardized) data source; may be set after construction. */
    private Source source;

    /** Cached result of {@link #getSimpleStandarlizationSource()}, or {@code null} if not built yet. */
    private Source standardSource;

    public Standarlization() {
    }

    /**
     * @return the original data source, or {@code null} if none has been set
     */
    public Source getSource() {
        return source;
    }

    /**
     * Sets the original data source.
     *
     * <p>Replacing the source discards any previously cached standardized source so that the
     * next call to {@link #getSimpleStandarlizationSource()} is computed from the new data.
     *
     * @param source the data source to standardize
     */
    public void setSource(Source source) {
        if (this.source != source) {
            // The cache was derived from the old source and would be stale otherwise.
            standardSource = null;
        }
        this.source = source;
    }

    /**
     * Returns the standardized view of the configured source, building it on first use.
     *
     * <p>The source is not required at construction time, which is why validation happens
     * here rather than in the constructor. (Original author's TODO: consider turning
     * Standarlization into a factory / making this static.)
     *
     * @return the standardized data source
     * @throws IllegalArgumentException if no source is set, or the source has no records or
     *         no columns
     * @throws UnsupportedClassVersionError if the source is not a {@link SimpleSource}
     *         (kept for backward compatibility with existing callers)
     * @throws RuntimeException if a column is constant but non-zero ("problem 5.3.3.1")
     */
    public Source getSimpleStandarlizationSource() {
        if (null != standardSource) {
            return standardSource;
        }
        if (null == source) {
            throw new IllegalArgumentException("Property source is not specified.");
        }
        if (0 == source.getSourceInfo().getRecordNum()) {
            throw new IllegalArgumentException("No record in data source.");
        }
        if (0 == source.getSourceInfo().getColumnNum()) {
            throw new IllegalArgumentException("Record which contains no attributes can not be computed.");
        }
        /*
         * Original author's note: the only SimpleSource implementation at the time was
         * SimpleFileSystemSourceImpl, an inner class of FileSystemSourceFactory. This check is
         * not considered stable; a new kind of source requires revisiting this method.
         */
        if (!(source instanceof SimpleSource)) {
            throw new UnsupportedClassVersionError(
                    "Current version only support FileSystemSourceFactory$SimpleFileSystemSourceImpl. "
                            + "If u have implemented a new SimpleSource, modify method "
                            + "Standarlization#getSimpleStandarlizationSource().");
        }
        // Original author's note: after the check above the long-typed row/column counts are
        // simply narrowed to int by the helpers below.
        double[] means = getMeans();
        double[] diff = getDiff(means);
        standardSource = new StandarlizationSource(means, diff);
        return standardSource;
    }

    /**
     * Opens an iterator over the records of the current source.
     *
     * @throws RuntimeException if the source hands back no iterator
     */
    private Iterator<Record> recordIterator() {
        Iterator<Record> iter = source.iterator();
        if (null == iter) {
            throw new RuntimeException("Current release放松:) does only support FileSystemSource.");
        }
        return iter;
    }

    /**
     * Computes the arithmetic mean of every column of the source.
     *
     * @return one mean per column
     */
    private double[] getMeans() {
        int colNum = (int) source.getSourceInfo().getColumnNum();
        double[] sums = new double[colNum];
        Iterator<Record> iter = recordIterator();
        while (iter.hasNext()) {
            Record record = iter.next();
            for (int i = 0; i < colNum; i++) {
                sums[i] += record.getItem(i);
            }
        }
        int recordNum = (int) source.getSourceInfo().getRecordNum();
        for (int i = 0; i < sums.length; i++) {
            sums[i] /= recordNum;
        }
        return sums;
    }

    /**
     * Computes the population standard deviation of every column of the source.
     *
     * @param means the per-column means as returned by {@link #getMeans()}
     * @return one standard deviation per column; 0 for a column whose values are all equal
     */
    private double[] getDiff(double[] means) {
        int colNum = (int) source.getSourceInfo().getColumnNum();
        double[] squares = new double[colNum];
        Iterator<Record> iter = recordIterator();
        while (iter.hasNext()) {
            Record record = iter.next();
            for (int i = 0; i < colNum; i++) {
                double delta = record.getItem(i) - means[i];
                squares[i] += delta * delta;
            }
        }
        // Hoisted out of the loop; the record count does not change while we iterate.
        final double recordNum = source.getSourceInfo().getRecordNum();
        for (int i = 0; i < squares.length; i++) {
            squares[i] = Math.sqrt(squares[i] / recordNum);
        }
        return squares;
    }

    /**
     * Standardized, sparsely stored copy of the source it was built from.
     *
     * <p>Only cells whose original value is non-zero are stored (as z-scores); every other
     * cell is answered with the per-column default value {@code dValue}. The final min-max
     * rescaling of stored cells is deferred to {@code RecordImpl#getItem(long)}.
     */
    class StandarlizationSource implements SimpleSource {
        /** The source this view was built from, captured so later setSource calls cannot affect it. */
        private final Source origin = source;

        private ArrayList<Record> records;

        /**
         * Per-column default value: the standardized value of an original 0, which is the
         * value that occurs most often in a sparse matrix.
         */
        private double[] dValue;

        /*
         * Per-column extremes of the z-scores. The standardized matrix is only an intermediate
         * product, so the min-max rescaling that uses these is deferred to
         * RecordImpl#getItem(long).
         */
        private double[] max;
        private double[] min;

        /**
         * Builds the standardized source from per-column statistics.
         *
         * @param means per-column means of the original source
         * @param diff per-column standard deviations of the original source
         * @throws RuntimeException if a column is constant but non-zero ("problem 5.3.3.1")
         */
        StandarlizationSource(double[] means, double[] diff) {
            records = new ArrayList<Record>((int) origin.getSourceInfo().getRecordNum());
            dValue = new double[(int) origin.getSourceInfo().getColumnNum()];
            max = new double[dValue.length];
            min = new double[dValue.length];
            for (int i = 0; i < dValue.length; i++) {
                /*
                 * problem 5.3.3.1: a zero deviation means every value in the column is equal.
                 * An all-zero column is standardized to all zeros; a constant non-zero column
                 * cannot be handled yet and is reported as an error.
                 */
                if (0 == diff[i]) {
                    if (means[i] == 0) {
                        // Nothing is stored for this column; getItem falls back to dValue,
                        // which doit(...) sets to 0 for such columns.
                        max[i] = 0;
                        continue;
                    }
                    throw new RuntimeException(describeConstantColumn(i));
                }
                // z-score of an original 0; also the starting point for the min/max search.
                max[i] = min[i] = dValue[i] = -means[i] / diff[i];
            }
            doit(means, diff);
        }

        /** Builds the diagnostic message for a constant, non-zero column. */
        private String describeConstantColumn(int column) {
            StringBuilder buffer = new StringBuilder();
            buffer.append("problem 5.3.3.1 occurs.\n\t transposable: ");
            buffer.append(origin.isTransposable());
            buffer.append("\n\tcolumn number: ");
            buffer.append(column);
            buffer.append("\n\t");
            Iterator<Record> iterator = origin.iterator();
            while (iterator.hasNext()) {
                buffer.append(iterator.next().getItem(column));
                buffer.append(" ");
            }
            buffer.append("\n");
            return buffer.toString();
        }

        /**
         * Fills {@code records} with the z-scores of all non-zero cells, tracks the per-column
         * min/max, and finally rescales the default values into [0, 1].
         */
        private void doit(double[] means, double[] diff) {
            int colNum = (int) origin.getSourceInfo().getColumnNum();
            Iterator<Record> iter = origin.iterator();
            while (iter.hasNext()) {
                Record original = iter.next();
                RecordImpl standard = new RecordImpl();
                for (int i = 0; i < colNum; i++) {
                    double value = original.getItem(i);
                    // 0 is the default value of the original source: keep it implicit.
                    // A zero deviation marks an all-zero column (see the constructor).
                    if (value == 0 || 0 == diff[i]) {
                        continue;
                    }
                    // Standardization formula for raw data.
                    double d = (value - means[i]) / diff[i];
                    if (max[i] < d) {
                        max[i] = d;
                    }
                    if (min[i] > d) {
                        min[i] = d;
                    }
                    standard.items.put(i, Double.valueOf(d));
                }
                records.add(standard);
            }
            // Rescale the defaults once here; thanks to sparsity this avoids repeated work.
            for (int i = 0; i < dValue.length; i++) {
                if (0 == diff[i]) {
                    // All-zero column: there is no range to rescale over.
                    dValue[i] = 0;
                } else {
                    dValue[i] = (dValue[i] - min[i]) / (max[i] - min[i]);
                }
            }
        }

        public boolean isTransposable() {
            return origin.isTransposable();
        }

        public void destroy() {
            // do nothing
        }

        public Record getRecord(int index) {
            return records.get(index);
        }

        public SourceInfo getSourceInfo() {
            return origin.getSourceInfo();
        }

        public Iterator<Record> iterator() {
            return records.iterator();
        }

        /**
         * Renders the record/column counts followed by every record, one value per column
         * with three decimals.
         */
        @Override
        public String toString() {
            int rowNum = (int) origin.getSourceInfo().getRecordNum();
            int colNum = (int) origin.getSourceInfo().getColumnNum();
            CharArrayWriter cWriter = new CharArrayWriter();
            PrintWriter out = new PrintWriter(cWriter);
            out.print("There are ");
            out.print(rowNum);
            out.print(" records in standard source. Each record has ");
            out.print(colNum);
            out.println(" attributes.");
            int count = 0;
            for (Record record : records) {
                out.println();
                out.print("Record ");
                out.print(count++);
                out.println(" :");
                for (int i = 0; i < colNum; i++) {
                    out.printf("%.3f\t", record.getItem(i));
                }
                out.println();
            }
            out.close();
            return cWriter.toString();
        }

        /**
         * Sparse record: holds z-scores for the non-default cells only and rescales them into
         * [0, 1] on access.
         */
        private class RecordImpl implements Record {
            private static final long serialVersionUID = 1L;

            /** Column index to z-score, for cells whose original value was non-zero. */
            HashMap<Integer, Double> items;

            RecordImpl() {
                // Same sizing as FileSystemSource$RecordImpl, per the original author.
                items = new HashMap<Integer, Double>(8, 0.9F);
            }

            /**
             * @return every cell of this record, rescaled exactly as {@link #getItem(long)}
             *         would return it
             */
            @Deprecated
            public double[] getAsArray() {
                double[] values = new double[(int) origin.getSourceInfo().getColumnNum()];
                for (int i = 0; i < values.length; i++) {
                    // Go through getItem so stored z-scores get the same min-max rescaling
                    // as the default values instead of being copied raw.
                    values[i] = getItem(i);
                }
                return values;
            }

            /**
             * @param column zero-based column index
             * @return the standardized value of the cell, or the column default if the cell
             *         is not stored
             * @throws RuntimeException if {@code column} is negative or not less than the
             *         column count
             */
            public double getItem(long column) {
                long columnNum = origin.getSourceInfo().getColumnNum();
                if (column < 0 || column >= columnNum) {
                    throw new RuntimeException("There are only " + columnNum
                            + " columns. The column " + column + " u access does not exist.");
                }
                int col = (int) column;
                Double stored = items.get(col);
                if (null == stored) {
                    return dValue[col];
                }
                // Deferred min-max rescaling of the stored z-score.
                return (stored.doubleValue() - min[col]) / (max[col] - min[col]);
            }
        }
    }

    /**
     * Placeholder for standardizing complex sources.
     *
     * @throws UnsupportedClassVersionError always; not supported in this release
     */
    public Source getComplexStandarlizationSource() {
        throw new UnsupportedClassVersionError("Current release does not support complex file system source.");
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -