⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 standarlization.java

📁 Data mining (DM) data-preparation process. In this case we use Euclidean distance.
💻 JAVA
字号:
/* created at 2005-12-20 */
package com.clustering.core;

import java.io.CharArrayWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;

import com.clustering.data.Record;
import com.clustering.data.SimpleSource;
import com.clustering.data.Source;
import com.clustering.data.SourceInfo;

/**
 * Converts a data source into a "standard" data source, i.e. one whose
 * attribute values are scaled into [0, 1].
 * <p>
 * Each column is first standardized with its mean and (population) standard
 * deviation, and the standardized values are then min-max scaled per column.
 * The source is treated as sparse: a raw value of 0 is never stored, it is
 * represented by a per-column default value instead.
 *
 * @author Avon
 * @version 0.9
 * @since 0.9
 */
public class Standarlization {
	/** Raw data source to standardize; stays null until {@link #setSource(Source)} is called. */
	private Source source;

	/** Lazily built result for the current {@link #source}; null until first requested. */
	private Source standardSource;

	public Standarlization() {

	}

	/**
	 * @return the raw data source currently configured, possibly null
	 */
	public Source getSource() {
		return source;
	}

	/**
	 * Sets the raw data source. When a different source instance is supplied the
	 * cached standardized source is dropped, so the next call to
	 * {@link #getSimpleStandarlizationSource()} is computed from the new source
	 * instead of returning a stale result.
	 *
	 * @param source the raw data source to standardize
	 */
	public void setSource(Source source) {
		if (this.source != source) {
			standardSource = null;
		}
		this.source = source;
	}

	/**
	 * Returns the standardized data source derived from the configured source.
	 * The result is built on first use and cached afterwards. The source is not
	 * required at construction time, which is why validation happens here.
	 * <p>
	 * TODO (from the original author): this could probably become a static
	 * factory method.
	 *
	 * @return the standardized data source
	 * @throws IllegalArgumentException     if no source is set, or the source reports
	 *                                      zero records or zero columns
	 * @throws UnsupportedClassVersionError if the source is not a {@link SimpleSource};
	 *                                      the error type is kept for backward compatibility
	 */
	public Source getSimpleStandarlizationSource() {
		if (null != standardSource) {
			return standardSource;
		}

		if (null == source) {
			throw new IllegalArgumentException("Property source is not specified.");
		}
		SourceInfo info = source.getSourceInfo();
		if (0 == info.getRecordNum()) {
			throw new IllegalArgumentException("No record in data source.");
		}
		if (0 == info.getColumnNum()) {
			throw new IllegalArgumentException("Record which contains no attributes can not be computed.");
		}

		/*
		 * Only SimpleSource implementations are supported. According to the
		 * original author, once this check has passed the record and column
		 * counts can be narrowed to int.
		 */
		if (!(source instanceof SimpleSource)) {
			throw new UnsupportedClassVersionError(
					"Current version only support FileSystemSourceFactory$SimpleFileSystemSourceImpl. "
							+ "If u have implemented a new SimpleSource, modify method "
							+ "Standarlization#getSimpleStandarlizationSource().");
		}

		// means: per-column mean; diff: per-column standard deviation
		double[] means = getMeans();
		double[] diff = getDiff(means);
		standardSource = new StandarlizationSource(means, diff);
		return standardSource;
	}

	/**
	 * Opens an iterator over the records of the given source.
	 *
	 * @param src the source to iterate
	 * @return a non-null record iterator
	 * @throws RuntimeException if the source returns a null iterator
	 */
	private static Iterator<Record> openIterator(Source src) {
		Iterator<Record> iter = src.iterator();
		if (null == iter) {
			throw new RuntimeException("Current release放松:) does only support FileSystemSource.");
		}
		return iter;
	}

	/**
	 * Computes the mean of every column of the configured source.
	 *
	 * @return an array holding one mean per column
	 */
	private double[] getMeans() {
		SourceInfo info = source.getSourceInfo();
		int colNum = (int) info.getColumnNum();
		int recordNum = (int) info.getRecordNum();

		double[] sums = new double[colNum];
		Iterator<Record> iter = openIterator(source);
		while (iter.hasNext()) {
			Record record = iter.next();
			for (int i = 0; i < colNum; i++) {
				sums[i] += record.getItem(i);
			}
		}
		for (int i = 0; i < sums.length; i++) {
			sums[i] /= recordNum;
		}
		return sums;
	}

	/**
	 * Computes the population standard deviation of every column.
	 *
	 * @param means the per-column means, as returned by {@link #getMeans()}
	 * @return an array holding one standard deviation per column; an entry is 0
	 *         when every value in that column is identical
	 */
	private double[] getDiff(double[] means) {
		SourceInfo info = source.getSourceInfo();
		int colNum = (int) info.getColumnNum();
		// Read once; the division below is carried out in double arithmetic.
		double recordNum = info.getRecordNum();

		double[] squares = new double[colNum];
		Iterator<Record> iter = openIterator(source);
		while (iter.hasNext()) {
			Record record = iter.next();
			for (int i = 0; i < colNum; i++) {
				double delta = record.getItem(i) - means[i];
				squares[i] += delta * delta;
			}
		}

		for (int i = 0; i < squares.length; i++) {
			// Math.sqrt(0) is simply 0, so constant columns need no special case here.
			squares[i] = Math.sqrt(squares[i] / recordNum);
		}
		return squares;
	}

	/**
	 * Standardized view of the source that was configured on the enclosing
	 * instance at construction time. The source and its column count are captured
	 * once, so a later {@code setSource} call on the enclosing instance cannot
	 * make this object inconsistent.
	 */
	class StandarlizationSource implements SimpleSource {

		/** The raw source this standardized view was built from. */
		private final Source origin;

		/** Number of columns of {@link #origin}, captured at construction. */
		private final int columnCount;

		private final ArrayList<Record> records;

		/*
		 * Default value per column: the value a raw 0 maps to after
		 * standardization, i.e. the value that occurs massively in the (sparse)
		 * standardized matrix.
		 */
		private final double[] dValue;

		/*
		 * The standardized matrix is only an intermediate step that is read once
		 * when the similarity matrix is built, so the final min-max scaling of
		 * stored values is deferred to RecordImpl#getItem(long). max/min hold the
		 * per-column extremes of the standardized values needed for that scaling.
		 */
		private final double[] max;

		private final double[] min;

		/**
		 * Builds the standardized records from the enclosing instance's source.
		 *
		 * @param means per-column means of the source
		 * @param diff  per-column standard deviations of the source
		 * @throws RuntimeException if a column is constant but non-zero
		 *                          ("problem 5.3.3.1"), which cannot be standardized
		 */
		StandarlizationSource(double[] means, double[] diff) {
			origin = source;
			SourceInfo info = origin.getSourceInfo();
			columnCount = (int) info.getColumnNum();

			// simple (in-memory) data source
			records = new ArrayList<Record>((int) info.getRecordNum());

			dValue = new double[columnCount];
			max = new double[columnCount];
			min = new double[columnCount];
			for (int i = 0; i < columnCount; i++) {
				/*
				 * problem 5.3.3.1: diff[i] == 0 means every element of this column
				 * is identical. An all-zero column is standardized to all zeros;
				 * a constant non-zero column has no handling yet and is reported
				 * as an error.
				 */
				if (0 == diff[i]) {
					if (means[i] == 0) {
						// Nothing is stored for this column; getItem(long) will
						// return dValue, which doit() sets to 0.
						max[i] = 0;
						continue;
					}
					throw new RuntimeException(describeConstantColumn(i));
				}
				// problem 5.3.3.1 end

				// Seed the extremes with the standardized value of a raw 0.
				max[i] = min[i] = dValue[i] = -means[i] / diff[i];
			}
			doit(means, diff);
		}

		/*
		 * Builds the diagnostic message for a constant, non-zero column.
		 */
		private String describeConstantColumn(int column) {
			StringBuilder buffer = new StringBuilder();
			buffer.append("problem 5.3.3.1 occurs.\n\t transposable: ");
			buffer.append(origin.isTransposable());
			buffer.append("\n\tcolumn number: ");
			buffer.append(column);
			buffer.append("\n\t");
			Iterator<Record> iterator = openIterator(origin);
			while (iterator.hasNext()) {
				buffer.append(iterator.next().getItem(column));
				buffer.append(" ");
			}
			buffer.append("\n");
			return buffer.toString();
		}

		/*
		 * Builds the standardized records from the raw source and finally
		 * min-max scales the per-column default values.
		 */
		private void doit(double[] means, double[] diff) {
			Iterator<Record> iter = openIterator(origin);
			while (iter.hasNext()) {
				Record original = iter.next();
				RecordImpl standard = new RecordImpl();
				for (int i = 0; i < columnCount; i++) {
					double raw = original.getItem(i);
					// 0 is the default value of the raw source: store nothing.
					// A constant column (diff == 0, problem 5.3.3.1) was already
					// handled in the constructor: store nothing either, so that
					// getItem(long) answers with dValue.
					if (raw == 0 || 0 == diff[i]) {
						continue;
					}

					// standardization formula for the raw value
					double d = (raw - means[i]) / diff[i];
					if (max[i] < d) {
						max[i] = d;
					}
					if (min[i] > d) {
						min[i] = d;
					}
					standard.items.put(i, Double.valueOf(d));
				}
				records.add(standard);
			}

			// Scale dValue once per column; thanks to sparsity this avoids
			// repeating the computation for every default cell.
			for (int i = 0; i < dValue.length; i++) {
				if (max[i] == min[i]) {
					// Zero range (constant column): avoid a 0/0 division.
					dValue[i] = 0;
				} else {
					dValue[i] = (dValue[i] - min[i]) / (max[i] - min[i]);
				}
			}
		}

		public boolean isTransposable() {
			return origin.isTransposable();
		}

		public void destroy() {
			// do nothing
		}

		public Record getRecord(int index) {
			return records.get(index);
		}

		public SourceInfo getSourceInfo() {
			return origin.getSourceInfo();
		}

		public Iterator<Record> iterator() {
			return records.iterator();
		}

		/**
		 * Renders the record and column counts followed by every record's scaled
		 * values, each printed with three decimals.
		 */
		@Override
		public String toString() {
			int rowNum = (int) origin.getSourceInfo().getRecordNum();
			CharArrayWriter cWriter = new CharArrayWriter();
			PrintWriter out = new PrintWriter(cWriter);
			out.print("There are ");
			out.print(rowNum);
			out.print(" records in standard source. Each record has ");
			out.print(columnCount);
			out.println(" attributes.");
			int count = 0;
			Iterator<Record> iter = records.iterator();
			while (iter.hasNext()) {
				Record record = iter.next();
				out.println();
				out.print("Record ");
				out.print(count++);
				out.println(" :");
				for (int i = 0; i < columnCount; i++) {
					out.printf("%.3f\t", record.getItem(i));
				}
				out.println();
			}
			// Closing the PrintWriter flushes everything into cWriter.
			out.close();
			return cWriter.toString();
		}

		/**
		 * Sparse standardized record: only columns whose raw value was non-zero
		 * are stored; every other column answers with the column default.
		 */
		private class RecordImpl implements Record {
			private static final long serialVersionUID = 1L;

			/** Column index mapped to the standardized (not yet min-max scaled) value. */
			final HashMap<Integer, Double> items;

			RecordImpl() {
				// same sizing as FileSystemSource$RecordImpl
				items = new HashMap<Integer, Double>(8, 0.9F);
			}

			/**
			 * Returns one value per column: the column default, overwritten by
			 * the stored standardized value where one exists. Stored values are
			 * returned as stored, without the min-max scaling that
			 * {@link #getItem(long)} applies.
			 */
			@Deprecated
			public double[] getAsArray() {
				double[] doubles = new double[columnCount];
				for (int i = 0; i < doubles.length; i++) {
					doubles[i] = dValue[i];
				}
				for (Integer col : items.keySet()) {
					doubles[col] = items.get(col);
				}
				return doubles;
			}

			/**
			 * Returns the min-max scaled value of the given column.
			 *
			 * @param column zero-based column index
			 * @return the scaled value, or the column default when nothing is stored
			 * @throws RuntimeException if the column is negative or not below the
			 *                          column count
			 */
			public double getItem(long column) {
				// Valid indexes are 0 .. columnCount - 1.
				if (column < 0 || column >= columnCount) {
					throw new RuntimeException("There are only " + columnCount
							+ " columns. The column " + column + " u access does not exist.");
				}
				int col = (int) column;
				Double stored = items.get(col);
				if (stored == null) {
					return dValue[col];
				}
				// The standardized matrix is read only once, so scaling on
				// access is considered an acceptable optimization.
				return (stored.doubleValue() - min[col]) / (max[col] - min[col]);
			}
		}
	}

	/**
	 * Not supported in the current release.
	 *
	 * @return never returns normally
	 * @throws UnsupportedClassVersionError always
	 */
	public Source getComplexStandarlizationSource() {
		throw new UnsupportedClassVersionError("Current release does not support complex file system source.");
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -