📄 hashforest.c
字号:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <time.h>
#include <sys/resource.h>
#include "IntList.h"
#include "Param.h"
#include "NPoint.h"
#include "NArray.h"
#include "HashTree.h"
#include "DensityMap.h"
#include "HashForest.h"
// Constructor
//
// Reads the run parameters from the text file `p` and creates two HashTree
// objects (one in `nonsig`, one in `sig`) for every dimension.
//
// The parameter file is read in this order:
//   dim                          (int)
//   min_val, max_val, int_size   (float)
//   brFactor, data_row           (int)
//   cluster_file                 (whitespace-free string)
//
// p  - path of the parameter file
// es - thresholds, copied into `sta`
//
// The process is terminated with a failure status if the file cannot be
// opened, if any field cannot be parsed, or if dim is smaller than 1.
HashForest::HashForest(char* p, EntStat es)
:sta(es), dataset(NULL)
{
    FILE* fp;
    int i;
    bool parsed;

    // Read the parameters from parameter file
    fp = fopen(p,"rt");
    if (fp==NULL) {
        printf("Cannot open parameter file '%s'\n", p);
        exit(EXIT_FAILURE);
    }
    // Every fscanf must convert all of its fields; otherwise the members
    // below would be used uninitialised (prm.dim sizes the allocations).
    // NOTE(review): "%s" has no field width, so an over-long file name can
    // overrun cluster_file; its capacity is declared outside this file --
    // add a matching width once confirmed.
    parsed = fscanf(fp,"%d\n", &prm.dim) == 1
          && fscanf(fp,"%f\n%f\n%f\n", &prm.min_val, &prm.max_val, &prm.int_size) == 3
          && fscanf(fp,"%d\n%d\n", &brFactor,&data_row) == 2
          && fscanf(fp,"%s\n",cluster_file) == 1;
    fclose(fp);
    if (!parsed) {
        printf("HashForest::HashForest(): Malformed parameter file '%s'\n", p);
        exit(EXIT_FAILURE);
    }
    if (prm.dim < 1) {
        printf("HashForest::HashForest(): Invalid dimensionality %d\n", prm.dim);
        exit(EXIT_FAILURE);
    }

    // Create prm.dim HashTree pointers.
    // (Standard array-new syntax; "new (HashTree*)[n]" is not valid C++.)
    nonsig = new HashTree*[prm.dim];
    sig = new HashTree*[prm.dim];

    // Instantiate prm.dim HashTree; the second argument is the 1-based index
    for (i=0; i<prm.dim; i++) {
        nonsig[i] = new HashTree(brFactor,i+1);
        sig[i] = new HashTree(brFactor,i+1);
    }
}
// Destructor
//
// Frees the HashTree objects held in `nonsig` and `sig`, the two pointer
// arrays themselves, and -- when it was allocated -- the in-memory dataset.
HashForest::~HashForest()
{
    // One tree per dimension in each of the two arrays
    for (int d = 0; d < prm.dim; ++d) {
        delete nonsig[d];
        delete sig[d];
    }
    delete[] nonsig;
    delete[] sig;

    // dataset stays NULL unless it has been allocated; nothing more to do then
    if (dataset == NULL) {
        return;
    }

    // Release each row, then the array of row pointers
    for (int row = 0; row < data_row; ++row) {
        delete[] dataset[row];
    }
    delete[] dataset;
}
// CPU time (user + system) consumed by this process so far, in seconds.
static double hashforest_cpu_seconds()
{
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return double(ru.ru_utime.tv_sec + ru.ru_stime.tv_sec) +
           double(ru.ru_utime.tv_usec + ru.ru_stime.tv_usec) / 1e6;
}

// The Mining Task
//
// Runs the pass-by-pass search. Pass k (1-based) works on the candidate
// subspaces stored in nonsig[k-1]:
//   1. build a DensityMap over the candidates (density count / entropy),
//   2. apply the thresholds in `sta` via applyThreshold(), passing sig[k-1],
//   3. print both trees and the clock() time used by the pass,
//   4. call group() to fill nonsig[k] for the next pass.
// The loop stops when a pass has no candidates or after prm.dim passes.
// Finally the total CPU time (getrusage) spent in the passes is printed.
void HashForest::mine()
{
    IntList il;
    int cur_dim = 0;
    int i;

    // Report the parameters first
    printf("The dimensionality is %d.\n", prm.dim);
    printf("The thresholds are %f, %f and %f.\n",
           sta.entropy, sta.interest, sta.interest_gain);

    if (READ_ONCE) readfile(); // Read the file into memory

    // Mark the time
    clock_t last_clock = clock();
    const double start_time = hashforest_cpu_seconds();

    // Let nonsig[0] be all the individual dimensions
    for (i=0; i<prm.dim; i++) {
        il.clear();
        il.insertSorted(i);
        nonsig[0]->insert(il);
    }

    // Process pass by pass
    while (cur_dim<prm.dim) {
        // No candidates left: nothing further can be generated
        if (nonsig[cur_dim]->get_no_ss()==0) break;

        // Count density and calculate entropy
        DensityMap* dm = new DensityMap(prm,*nonsig[cur_dim]);
        dm->build_grid(dataset, *this, *nonsig[cur_dim]);
        delete dm;

        // Show dimensions and number of candidate subspaces
        printf("\nPass %d: no of candidate subspaces = %d\n", cur_dim+1, nonsig[cur_dim]->get_no_ss());

        // Apply the thresholds
        nonsig[cur_dim]->applyThreshold(sta,*sig[cur_dim]);
#if 1
        // Show large sets
        printf("Non-significant subspaces (%d):\n",nonsig[cur_dim]->get_no_ss());
        nonsig[cur_dim]->show();
        printf("Significant subspaces (%d):\n",sig[cur_dim]->get_no_ss());
        sig[cur_dim]->show();
        printf("\n");
#endif
        // Report time
        printf("Pass %d: %f secs used\n", cur_dim+1,float(clock()-last_clock)/CLOCKS_PER_SEC);
        last_clock = clock();

        // Generate next pass. nonsig has exactly prm.dim entries, so the
        // last pass has no successor tree to group into.
        if (cur_dim+1 < prm.dim) {
            nonsig[cur_dim]->group(*nonsig[cur_dim+1]);
        }
        cur_dim++;
    }

    // Report time
    printf("The rusage = %f secs\n\n", hashforest_cpu_seconds() - start_time);
}
// Find the entropy of an individual dimension
float HashForest::entropy_dim(int d)
{
IntList is;
is.prepend(d);
return nonsig[0]->findStat(is).entropy;
}
// Find the statistics of a given subspace
//
// A subspace of length n is looked up in nonsig[n-1].
//
// ss - the subspace to look up; its length must lie in 1..prm.dim
//
// Returns whatever the selected tree's findStat() reports for `ss`.
// A length outside 1..prm.dim would index outside the nonsig array, so the
// process is terminated with an error message instead of reading past it.
EntStat HashForest::findStat(const IntList& ss)
{
    const int len = ss.get_len();

    if (len < 1 || len > prm.dim) {
        printf("HashForest::findStat(): Error: len of subspace (%d) outside 1..%d\n",
               len, prm.dim);
        exit(EXIT_FAILURE);
    }
    return nonsig[len-1]->findStat(ss);
}
// Read the data file into the variable dataset
//
// Allocates `dataset` as data_row rows of prm.dim floats and fills it from
// the whitespace-separated numbers in `cluster_file`, prm.dim values per row.
//
// - Values outside [prm.min_val, prm.max_val] produce a warning.
// - Values above prm.max_val-1e-5 are pulled down to prm.max_val-1e-5.
// - Rows beyond data_row are counted but not stored, so the array is never
//   overrun; a mismatch between the counted rows and data_row is reported.
// - Reading stops at end of file or at the first token that is not a number.
//
// Terminates the process with status 3 if the sizes are unusable or the
// file cannot be opened.
void HashForest::readfile()
{
    FILE* fp;
    int i;
    int count = 0;      // complete rows seen so far
    int col = 0;        // column of the next value within the current row
    int rc;             // result of the last fscanf
    float f = 0.0f;

    printf("The predefined number of rows is %d.\n", data_row);

    // The allocations and the row/column bookkeeping below need sane sizes
    if (prm.dim < 1 || data_row < 0) {
        printf("HashForest::readfile(): Invalid dimensionality (%d) or number of rows (%d)\n",
               prm.dim, data_row);
        exit(3);
    }

    // Allocate a data_row*prm.dim array
    // (Standard array-new syntax; "new (float*)[n]" is not valid C++.)
    dataset = new float*[data_row];
    for (i=0; i<data_row; i++) {
        dataset[i] = new float[prm.dim];
    }

    // Open file
    fp = fopen(cluster_file,"rt");
    if (fp==NULL) {
        printf("HashForest::readfile(): Cannot open '%s'\n",cluster_file);
        exit(3);
    }

    // Read value by value; the loop ends as soon as fscanf cannot convert a
    // number, so `f` is never used without having been assigned.
    while ((rc = fscanf(fp,"%f ",&f)) == 1) {
        // Valid the value
        if (f < prm.min_val || f > prm.max_val) {
            printf("HashForest::readfile(): Warning: Invalid value %f\n",f);
        }
        if (f > prm.max_val-1e-5) f = prm.max_val-1e-5; // ensure not exceed the max

        // Only store rows that fit into the allocated array
        if (count < data_row) {
            dataset[count][col] = f;
        }

        // Advance to the next column, wrapping to a new row when full
        col++;
        if (col == prm.dim) {
            col = 0;
            count++;
        }
    }

    // rc == EOF means the input simply ended; anything else is a bad token
    if (rc != EOF) {
        printf("HashForest::readfile(): Warning: non-numeric data after row %d, reading stopped.\n", count);
    }
    if (col != 0) {
        printf("HashForest::readfile(): Warning: last row is incomplete (%d of %d values).\n",
               col, prm.dim);
    }

    // Check if number of row is correct
    if (count!=data_row) {
        printf("HashForest::readfile(): Warning: data_row incorrectly set.\n");
        printf(" actual number of rows = %d\n", count);
    }
    fclose(fp); // Close file
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -