📄 partition.java
字号:
if (fitnessMeasure == Global.FM_COS)
{
double sum_square_weights = 0.0;
for (int i = 0; i < NPART; i++)
sum_square_weights += Math.pow(weight[i], 2);
fitness /= Math.sqrt(sumSquareEntropies)*Math.sqrt(sum_square_weights);
}
}
/** Computes the fitness of this Partition with respect to the
 * partitions in collection c, according to the given entropy
 * measure and fitness (distance) measure; builds the intersection
 * tables and delegates to the intersection-based overload. */
public void computeFitness(Partition[] c, int NPART,
                           int entropyMeasure, int fitnessMeasure,
                           double[] wTargetCondOnDB,
                           double[] wDBCondOnTarget,
                           double[] weight)
{
    // intersections[i] maps each class C_s of this partition to a
    // map <B_t, |B_t x C_s|> holding, for every class B_t of c[i],
    // the number of rowids the two classes share
    Hashtable[] intersections = new Hashtable[NPART];
    for (int part = 0; part < NPART; part++)
        intersections[part] = new Hashtable();
    // fill the intersection counts for every partition in c
    computeIntersections(this, c, NPART, intersections);
    // evaluate the fitness from the precomputed intersections
    computeFitness(intersections, c, NPART, entropyMeasure, fitnessMeasure,
                   wTargetCondOnDB, wDBCondOnTarget, weight);
}
/** Estimates the degree in which the other partitions in
 * <code>db</code> influence the target partition; the target
 * partition should be the last partition in <code>db</code>;
 * <code>sampleDBPct</code> specifies the percent of the database to
 * be sampled; sets <code>wTargetCondOnDB[i]</code> to
 * H(db[i]|db[target]) and sets the <code>wDBCondOnTarget[i]</code>
 * to H(db[target]|db[i]); sets <code>weight[i]</code> to
 * wTargetCondOnDB[i]+wDBCondOnTarget[i]
 @return the entropy of the target partition sampled from the database */
static public double estimateWeights(Partition[] db, int NPART,
                                     double sampleDBPct,
                                     int entropyMeasure,
                                     double[] wTargetCondOnDB,
                                     double[] wDBCondOnTarget,
                                     double[] weight, Random rand)
{
    NumberFormat nf = NumberFormat.getInstance();
    nf.setMaximumFractionDigits(3);
    double targetEntr = 0;
    int SIZE = db[0].getSize();
    // size of the separate sample database;
    // consider at least a sample db of size 2
    int sampleDBSize = (int)(SIZE*sampleDBPct);
    if (sampleDBSize < 2)
        sampleDBSize = 2;
    // rowids of the original database that will belong to the sample
    int[] sampleRowids = new int[sampleDBSize];
    SelectionMethods.selectUnif(SIZE, sampleRowids, sampleDBSize, rand);
    Partition[] sampleDB = new Partition[NPART];
    for (int i = 0; i < NPART; i++)
        sampleDB[i] = new Partition(sampleDBSize,
                                    db[0].getMaxNoClasses(),
                                    db[0].getClassFitnessType());
    if (Global.VERBOSE == 1)
    {
        for (int i = 0; i < sampleDBSize; i++)
            System.out.print(sampleRowids[i] + " ");
        System.out.println();
    }
    // extract the sample database from the original database
    // (the former second counter 'index' always equaled i and was removed)
    for (int i = 0; i < sampleDBSize; i++)
        for (int j = 0; j < NPART; j++)
            sampleDB[j].set(i, db[j].get(sampleRowids[i]));
    if (Global.VERBOSE == 1)
        for (int i = 0; i < NPART; i++)
            sampleDB[i].print();
    // the reference (target) partition is the last column of the sample
    Partition sampleRefPart = sampleDB[NPART - 1];
    // compute weights of all attributes with respect to the reference
    // partition using the sampleDB; the fitness of sampleDB[i]
    // represents the weight:
    // weight[i] = H(sampleDB[i]|ref_part) + H(ref_part|sampleDB[i])
    Partition[] c = new Partition[1];
    c[0] = sampleRefPart;
    for (int i = 0; i < NPART - 1; i++)
    {
        sampleDB[i].computeFitness(c, 1, entropyMeasure, Global.FM_P_PA,
                                   null, null, null);
        wDBCondOnTarget[i] = sampleDB[i].getFitness();
        sampleDB[i].computeFitness(c, 1, entropyMeasure, Global.FM_PA_P,
                                   null, null, null);
        wTargetCondOnDB[i] = sampleDB[i].getFitness();
        weight[i] = wTargetCondOnDB[i] + wDBCondOnTarget[i];
    }
    targetEntr = sampleRefPart.entropy(entropyMeasure);
    if (Global.VERBOSE == 1)
    {
        System.out.println("attr_no weight_target_cond_on_db" +
                           " weight_db_cond_on_target sum");
        for (int i = 0; i < NPART - 1; i++)
            System.out.println(i + " " + nf.format(wTargetCondOnDB[i]) + " "
                               + nf.format(wDBCondOnTarget[i])
                               + " " + nf.format(weight[i]));
        System.out.println("H(target) = " + nf.format(targetEntr));
        System.out.println("H(target)-H(target|db) H(db)-H(db|target)");
        double avgDBInfluence = 0.0;
        for (int i = 0; i < NPART - 1; i++)
        {
            // the deprecated new Double(...) wrappers were replaced by
            // the primitive overload NumberFormat.format(double)
            System.out.println(i + " "
                               + nf.format(targetEntr - wTargetCondOnDB[i])
                               + " " +
                               nf.format(sampleDB[i].entropy(entropyMeasure)
                                         - wDBCondOnTarget[i]));
            avgDBInfluence += targetEntr - wTargetCondOnDB[i];
        }
        System.out.println("avg H(target)-H(target|db) = "
                           + nf.format(avgDBInfluence/(double)(NPART-1)));
    }
    return targetEntr;
}
/** @return the symmetrical difference between the classes of the
 * current partition and the one received as argument; computed as
 * (sum of squared class sizes of both partitions minus twice the
 * squared intersection counts) / 2 */
public double distSymDiff(Partition c)
{
    // get the unique representation into classes of this partition
    int noClasses1 = getNoClasses();
    Hashtable[] am1 = new Hashtable[noClasses1];
    for (int i = 0; i < noClasses1; i++)
        am1[i] = new Hashtable();
    getAM(am1);
    // get the unique representation into classes of the argument
    int noClasses2 = c.getNoClasses();
    Hashtable[] am2 = new Hashtable[noClasses2];
    for (int i = 0; i < noClasses2; i++)
        am2[i] = new Hashtable();
    c.getAM(am2);
    long symDiff = 0;
    // cast to long before multiplying: the int x int products of
    // class sizes overflowed before widening to the long accumulator
    for (int k2 = 0; k2 < noClasses2; k2++)
        symDiff += ((long)am2[k2].size())*am2[k2].size();
    for (int k1 = 0; k1 < noClasses1; k1++)
    {
        symDiff += ((long)am1[k1].size())*am1[k1].size();
        for (int k2 = 0; k2 < noClasses2; k2++)
        {
            // count rowids common to classes am1[k1] and am2[k2]
            long common = 0;
            for (Enumeration keys1 = am1[k1].keys();
                 keys1.hasMoreElements(); )
                if (am2[k2].containsKey((Integer)keys1.nextElement()))
                    common++;
            symDiff -= (2*common*common);
        }
    }
    return ((double)symDiff)/2;
}
/** @return the sum of distSymDiff(p) over all NPART partitions p in
 * the collection c */
public double sumDistSymDiff(Partition[] c, int NPART)
{
    double total = 0.0;
    for (int part = 0; part < NPART; part++)
        total += distSymDiff(c[part]);
    return total;
}
/** @return the average Hamming distance between the row
 * <code>rowid</code> and all other rows from the class
 * <code>c</code>, according to the values of the <code>NPART</code>
 * partitions in <code>db</code> on these rows */
private double avgHD(int rowid, Hashtable c, Partition[] db, int NPART)
{
    long totalHD = 0;
    for (Enumeration rowKeys = c.keys(); rowKeys.hasMoreElements(); )
    {
        int otherRowid = ((Integer)rowKeys.nextElement()).intValue();
        // a row contributes no distance to itself
        if (otherRowid == rowid)
            continue;
        // Hamming distance: number of partitions on which the rows differ
        long hd = 0;
        for (int part = 0; part < NPART; part++)
            if (db[part].v[rowid] != db[part].v[otherRowid])
                hd++;
        totalHD += hd;
    }
    // NOTE(review): the divisor is c.size() even when rowid belongs to
    // c and is skipped above, so the within-class average is divided by
    // |C| rather than |C|-1 — confirm this is the intended convention
    return ((double)totalHD)/((double)c.size());
}
/** Fills the array <code>silhouettes</code> with the silhouette
 * coefficient of every row, computed with respect to the classes in
 * this Partition and to the values of the <code>NPART</code>
 * partitions in collection <code>c</code>; this Partition should
 * have at least 2 classes; <code>neighbors[rowid]</code> receives
 * the class number of the closest other class. */
public void computeSilhouettes(double[] silhouettes,
                               int[] neighbors,
                               Partition[] c, int NPART)
{
    // silhouettes are undefined for fewer than 2 classes
    int noClasses = getNoClasses();
    if (Global.DEBUG == 1)
        if (noClasses < 2)
        {
            System.out.println("ERROR! Partition.computeSilhouettes():"
                               + "noClasses < 2");
            System.exit(1);
        }
    // am[k] holds the rowids belonging to class k
    Hashtable[] am = new Hashtable[noClasses];
    for (int k = 0; k < noClasses; k++)
        am[k] = new Hashtable();
    getAM(am);
    for (int rowid = 0; rowid < size; rowid++)
    {
        // locate the class that contains rowid
        int ownClass;
        for (ownClass = 0; ownClass < noClasses; ownClass++)
            if (am[ownClass].containsKey(new Integer(rowid)))
                break;
        // a = average distance from rowid to its own class
        double a = avgHD(rowid, am[ownClass], c, NPART);
        // average distance from rowid to each of the other classes,
        // stored as <class number, average Hamming distance>
        Hashtable otherAvgHD = new Hashtable();
        for (int k = 0; k < noClasses; k++)
            if (k != ownClass)
                otherAvgHD.put(new Integer(k),
                               new Double(avgHD(rowid, am[k], c, NPART)));
        // b = the smallest of those averages; at least one other class
        // exists, so the enumeration below is non-empty
        Enumeration seed = otherAvgHD.keys();
        Integer firstClass = (Integer)seed.nextElement();
        double b = ((Double)otherAvgHD.get(firstClass)).doubleValue();
        neighbors[rowid] = firstClass.intValue();
        for (Enumeration cand = otherAvgHD.keys();
             cand.hasMoreElements(); )
        {
            Integer candClass = (Integer)cand.nextElement();
            double candValue =
                ((Double)otherAvgHD.get(candClass)).doubleValue();
            if (candValue < b)
            {
                b = candValue;
                neighbors[rowid] = candClass.intValue();
            }
        }
        // singleton classes get silhouette 0 by convention;
        // otherwise the standard formula (b-a)/max(a,b)
        if (am[ownClass].size() == 1)
            silhouettes[rowid] = 0;
        else if (a < b)
            silhouettes[rowid] = 1 - a/b;
        else if (a == b)
            silhouettes[rowid] = 0;
        else
            silhouettes[rowid] = b/a - 1;
    }
}
private class SilhInfo implements Comparable
{
public int rowid;
public double silhouette;
public int neighbor;
SilhInfo(int rowid, double silhouette, int neighbor)
{
this.rowid = rowid;
this.silhouette = silhouette;
this.neighbor = neighbor;
}
public int compareTo(Object o)
{
if (!(o instanceof SilhInfo))
throw new ClassCastException();
if (silhouette < ((SilhInfo)o).silhouette)
return -1;
else
if (silhouette > ((SilhInfo)o).silhouette)
return 1;
else
return 0;
}
}
/** plots a graphical representation of the silhouettes; classes are
 * plotted in order and the elements in the classes are plotted in
 * decreasing order of their silhouette */
public void plotSilhouettes(double[] silhouettes, int[] neighbors)
{
NumberFormat nf = NumberFormat.getInstance();
nf.setMaximumFractionDigits(3);
// am[k] holds the rowids belonging to class k
int noClasses = getNoClasses();
Hashtable[] am = new Hashtable[noClasses];
for (int i = 0; i < noClasses; i++)
am[i] = new Hashtable();
getAM(am);
// print the average silhouette over all rows
double avgSilh = 0;
for (int rowid = 0; rowid < size; rowid++)
avgSilh += silhouettes[rowid];
System.out.println("Average silhouette " + nf.format(avgSilh/(double)size));
for (int classNo = 0; classNo < noClasses; classNo++)
{
// heap used to emit the members of this class ordered by silhouette
// NOTE(review): the heap capacity is noClasses, but up to
// am[classNo].size() elements are inserted below — confirm that
// Heap grows on demand (or that the argument is only a hint),
// otherwise classes larger than noClasses could overflow it
Heap silhToPrint = new Heap(noClasses);
for (Enumeration rowids = am[classNo].keys();
rowids.hasMoreElements();)
{
int currRowid = ((Integer)rowids.nextElement()).intValue();
silhToPrint.insert(new SilhInfo(currRowid, silhouettes[currRowid],
neighbors[currRowid]));
}
// drain the heap, printing one line per row of the class
while (silhToPrint.size() != 0)
{
SilhInfo s = null;
try
{
s = (SilhInfo)silhToPrint.remove();
}
catch(EmptyHeapException e)
{
// should be unreachable: the loop guard checks size() != 0
System.err.println("Internal Error: Partition.plotSilhouettes(): empty heap");
System.exit(1);
}
double currSilh = s.silhouette;
// line header: class number (1-based), neighbor class (1-based), rowid
System.out.print("c " + (classNo + 1)
+ " n " + (s.neighbor + 1)
+ " r " + s.rowid + "\t");
// bar of '-' (negative) or '+' (non-negative) characters,
// scaled to 30 characters per unit of silhouette
if (currSilh < 0)
{
int i = 0;
while(i++ < -30*currSilh)
System.out.print("-");
}
else
{
int i = 0;
while(i++ < 30*currSilh)
System.out.print("+");
}
System.out.println(" (" + currSilh + ")");
}
}
}
/** appends to <code>output</code> a graphical representation of the
* silhouettes; classes are plotted in order and the elements in the
* classes are plotted in decreasing order of their silhouette */
public void plotSilhouettes(double[] silhouettes, int[] neighbors,
StringBuffer output)
{
int noClasses = getNoClasses();
Hashtable[] am = new Hashtable[noClasses];
for (int i = 0; i < noClasses; i++)
am[i] = new Hashtable();
getAM(am);
double avgSilh = 0;
for (int rowid = 0; rowid < size; rowid++)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -