📄 vfkm.c
字号:
while(!done) {
e = ExampleRead(data, es);
DebugError(e == 0, "Unable to get enough unique initial centroids");
/* make sure this isn't too close to one in the list */
used = 0;
for(i = 0 ; i < VALLength(centroids) && !used ; i++) {
if(ExampleDistance(e, VALIndex(centroids, i)) <= minDistance) {
used = 1;
}
}
/* if it's ok then use it */
if(!used) {
done = 1;
} else {
ExampleFree(e);
}
}
return e;
}
static void _PickInitialCentroids(ExampleSpecPtr es, VoidAListPtr centroids,
FILE *data) {
/* pick the first unique 'gNumClusters' points from the dataset to
be centroids HERE should I add some randomness to this? */
int j;
ExamplePtr e;
char fileNames[255];
FILE *centersIn;
/* burn some examples to make the selection be more random */
for(j = RandomRange(0, 10 * gNumClusters) ; j > 0 ; j--) {
ExampleFree(ExampleRead(data, es));
}
/* if instructed, read in the initial centroids */
if(gLoadCenters) {
sprintf(fileNames, "%s.centers", gFileStem);
centersIn = fopen(fileNames, "r");
if(centersIn) {
if(gMessageLevel > 0) {
printf("Loading inital centers from %s\n", fileNames);
}
e = ExampleRead(centersIn, es);
while(e != 0) {
VALAppend(centroids, e);
e = ExampleRead(centersIn, es);
}
fclose(centersIn);
}
}
while(VALLength(centroids) < gNumClusters) {
VALAppend(centroids, _PickInitalCentroid(es, centroids, data));
}
}
//static void _OutputGoodCentroids(float bound) {
// int i;
// char fileNames[255];
// FILE *centersOut;
// IterationStatsPtr thisIs, firstIs;
// float totalError, errorThreshold;
/* write out the centroids, if we didn't get a bound only write out the
good ones, where lastBound < (100 / gNumClusers)% of the total
lastBound error */
// thisIs = VALIndex(gStatsList, VALLength(gStatsList) - 1);
// firstIs = VALIndex(gStatsList, 0);
// sprintf(fileNames, "%s.centers", gFileStem);
// centersOut = fopen(fileNames, "w");
// totalError = 0;
// for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {
// totalError += thisIs->lastBound[i];
// }
// errorThreshold = totalError / (float)gNumClusters;
//IterationStatsWrite(thisIs, stdout);
// if(gMessageLevel > 0) {
// printf("output good centers, thresh: %f\n", errorThreshold);
// }
// for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {
/* if we finished with a bound or if the centroid's error was ok */
// if((thisIs->guarenteeIDConverge && thisIs->foundBound &&
// bound <= gThisErrorTarget) ||
// (thisIs->lastBound[i] < errorThreshold)) {
// ExampleWrite(VALIndex(firstIs->centroids, i), centersOut);
// }
// }
// fclose(centersOut);
//}
static void _OutputAllCentroids(float bound) {
int i;
char fileNames[255];
FILE *centersOut;
IterationStatsPtr thisIs;
thisIs = VALIndex(gStatsList, VALLength(gStatsList) - 1);
sprintf(fileNames, "%s.centers", gFileStem);
centersOut = fopen(fileNames, "w");
for(i = 0 ; i < VALLength(thisIs->centroids) ; i++) {
ExampleWrite(VALIndex(thisIs->centroids, i), centersOut);
}
fclose(centersOut);
}
static IterationStatsPtr _FindLastIsWithBound(void) {
int i;
IterationStatsPtr is;
for(i = VALLength(gStatsList) - 1 ; i >= 0 ; i--) {
is = VALIndex(gStatsList, i);
if(is->foundBound) {
return is;
}
}
DebugWarn(1, "_FindLastIsWithBound didn't find a bound\n");
return 0;
}
static float _FindMedian(float *array, int len) {
float *errorArray = MNewPtr(sizeof(float) * len);
float tmp, median;
int i, j;
/* create the sorted error array which we'll need to find the median */
for(i = 0 ; i < len ; i++) {
errorArray[i] = array[i];
}
for(i = 0 ; i < len ; i++) {
for(j = 0 ; j < len - (i + 1) ; j++) {
if(errorArray[j] > errorArray[j + 1]) {
tmp = errorArray[j + 1];
errorArray[j + 1] = errorArray[j];
errorArray[j] = tmp;
}
}
}
if(len % 2 == 0) {
i = (len / 2) - 1;
median = (errorArray[i] + errorArray[i + 1]) / 2.0;
} else {
i = ((len + 1) / 2) - 1;
median = errorArray[i];
}
MFreePtr(errorArray);
return median;
}
VoidAListPtr _GetCentroidsForNextRound(FILE *data, ExampleSpecPtr es) {
IterationStatsPtr is = _FindLastIsWithBound();
// IterationStatsPtr is = VALIndex(gStatsList, VALLength(gStatsList) - 1);
float median = _FindMedian(is->lastBound, VALLength(is->centroids));
IterationStatsPtr iIs = VALIndex(gStatsList, 0);
VoidAListPtr newCentroids = VALNew();
int i;
//IterationStatsWrite(is, stdout);
for(i = 0 ; i < VALLength(is->centroids) ; i++) {
if(is->lastBound[i] <= median * 5) {
VALAppend(newCentroids, ExampleClone(VALIndex(iIs->centroids, i)));
//VALAppend(newCentroids, ExampleClone(VALIndex(is->centroids, i)));
} else if(gMessageLevel > 1) {
printf(" Reassigning centroid %d bound %f median %f\n",
i, is->lastBound[i], median);
fflush(stdout);
}
}
while(VALLength(newCentroids) < VALLength(is->centroids)) {
VALAppend(newCentroids, _PickInitalCentroid(es, newCentroids, data));
}
return newCentroids;
}
/* this should be static */
void CalculateExamplesPerIteration(VoidAListPtr last, float **nextNiOut, int *num);
int main(int argc, char *argv[]) {
char fileNames[255];
FILE *exampleIn;
ExampleSpecPtr es;
ExamplePtr e;
VoidListPtr centers, newCenters = 0;
float iterationNSum;
long learnTime;
int i;
int fileDone;
long nIncrement;
float lastDelta, bound;
struct tms starttime;
struct tms endtime;
IterationStatsPtr thisIs, lastIs;
int breakOut;
_processArgs(argc, argv);
if(gStdin) {
/* This is a hack because when I pipe clusterdata to vfkm
vfkm tries to read the spec before clusterdata can write it */
sleep(5);
}
sprintf(fileNames, "%s/%s.names", gSourceDirectory, gFileStem);
es = ExampleSpecRead(fileNames);
DebugError(es == 0, "Unable to open the .names file");
RandomInit();
/* seed for the concept */
if(gSeed != -1) {
RandomSeed(gSeed);
} else {
gSeed = RandomRange(1, 30000);
RandomSeed(gSeed);
}
if(gMessageLevel > 0) {
printf("running with seed %d\n", gSeed);
}
/* initialize some globals */
gStatsList = VALNew();
gD = ExampleSpecGetNumAttributes(es);
if(gR == 0) {
gR = sqrt(gD);
}
if(!gAllowBadConverge) {
/* use a tighter bound so we get a better convergence behavior */
gThisErrorTarget = min(gErrorTarget, gConvergeDelta / 3.0);
} else {
gThisErrorTarget = min(gErrorTarget, gConvergeDelta);
}
gNeededDelta = 1.0 - pow(1.0 - gDelta, 1.0 /
(float)(gD * gNumClusters * gEstimatedNumIterations));
gTargetEkd = sqrt(gThisErrorTarget / ((float)gNumClusters * (float)gD));
gN = (gNumClusters / 2.0) * pow(1.0/gTargetEkd, 2) *
log(2.0/gNeededDelta) * 1.1;
//if(gUseNormalApprox) {
// /* hueristic to correct for normal being tighter */
// gN /= 5;
//}
if(gMessageLevel > 1) {
printf("Target Ekd: %.3lf\n", gTargetEkd);
}
if(gStdin) {
exampleIn = stdin;
} else {
/* Pick the initial centroids from the dataset */
/* in stream mode we never close & reopen the file */
sprintf(fileNames, "%s/%s.data", gSourceDirectory, gFileStem);
exampleIn = fopen(fileNames, "r");
DebugError(exampleIn == 0, "Unable to open the .data file");
}
centers = VALNew();
/* try get the gInitNumber-ith batch of centroids */
for(i = 0 ; i < gInitNumber ; i++) {
while(VALLength(centers) > 0) {
ExampleFree(VALRemove(centers, VALLength(centers) - 1));
}
_PickInitialCentroids(es, centers, exampleIn);
}
if(!gStdin) {
/* in stream mode we never close & reopen the file */
fclose(exampleIn);
}
/* create the initial stats structure from the initial centroids */
VALAppend(gStatsList, IterationStatsInitial(centers));
if(gMessageLevel >= 1) {
printf("allocation %ld\n", MGetTotalAllocation());
}
learnTime = 0;
gIteration = 0;
gRound = 0;
times(&starttime);
do {
gRound++;
/* if we aren't on the first round free the cruft from the prvious one */
if(gRound > 1) {
if(gReassignCentersEachRound) {
if(!gStdin) {
/* in stream mode we never close & reopen the file */
sprintf(fileNames, "%s/%s.data", gSourceDirectory, gFileStem);
exampleIn = fopen(fileNames, "r");
DebugError(exampleIn == 0, "Unable to open the .data file");
}
/* burn some examples to make the selection be more random */
for(i = RandomRange(0, 10 * gNumClusters) ; i > 0 ; i--) {
ExampleFree(ExampleRead(exampleIn, es));
}
/* HERE MEM this newCenters may leak memory */
newCenters = _GetCentroidsForNextRound(exampleIn, es);
}
for(i = VALLength(gStatsList) - 1 ; i >= 0 ; i--) {
IterationStatsFree(VALRemove(gStatsList, i));
}
if(gReassignCentersEachRound) {
VALAppend(gStatsList, IterationStatsInitial(newCenters));
} else {
/* use the inital centroids again */
VALAppend(gStatsList, IterationStatsInitial(centers));
}
}
gIteration = 1;
if(!gStdin) {
/* in stream mode we never close & reopen the file */
sprintf(fileNames, "%s/%s.data", gSourceDirectory, gFileStem);
exampleIn = fopen(fileNames, "r");
DebugError(exampleIn == 0, "Unable to open the .data file");
}
if(gMessageLevel > 0) {
printf("========================\n");
printf("round %d n %ld\n", gRound, gN);
}
while(!_DoClusterIterationDidConverge(exampleIn, es)) {
times(&endtime);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -