/* File: vfkm.c */
float loss;
float bound;
int foundBound;
long tested = 0;
foundBound = 0;
if(((IterationStatsPtr)VALIndex(gStatsList,
VALLength(gStatsList) - 1))->foundBound) {
if(((IterationStatsPtr)VALIndex(gStatsList,
VALLength(gStatsList) - 1))->guarenteeIDConverge) {
foundBound = 1;
} else if(((IterationStatsPtr)VALIndex(gStatsList,
VALLength(gStatsList) - 1))->wouldKMConverge &&
gAllowBadConverge) {
if(gMessageLevel >= 1) {
printf("found a bad converge bound.\n");
}
foundBound = 1;
}
}
if(foundBound) {
bound = _CalculateErrorBound();
} else {
bound = -1;
}
if(!gTestOnTrain) {
/* just output the distance between matched centers */
/* load the test centers */
testCenters = VALNew();
sprintf(fileNames, "%s/%s.test", gSourceDirectory, gFileStem);
testCentersIn = fopen(fileNames, "r");
DebugError(testCentersIn == 0, "Unable to open the .test file");
if(gMessageLevel >= 1) {
printf("reading the test centers file...\n");
}
tc = ExampleRead(testCentersIn, es);
while(tc != 0) {
VALAppend(testCenters, tc);
tc = ExampleRead(testCentersIn, es);
}
fclose(testCentersIn);
/* Match learned centers with the test centers */
loss = _MatchCentersGetDistanceSquare(learnedCenters, testCenters);
/* free the test centers */
for(i = 0 ; i < VALLength(testCenters) ; i++) {
ExampleFree(VALIndex(testCenters, i));
}
VALFree(testCenters);
} else { /* Sum Square distance of example to assigned cluster */
loss = 0;
sprintf(fileNames, "%s/%s.data", gSourceDirectory, gFileStem);
exampleIn = fopen(fileNames, "r");
DebugError(exampleIn == 0, "Unable to open the .data file");
if(gMessageLevel >= 1) {
printf("opened test file, starting scan...\n");
}
e = ExampleRead(exampleIn, es);
while(e != 0 && tested < gMaxExamplesPerIteration) {
tested++;
lc = _FindClosestCenter(e, learnedCenters);
loss += pow(ExampleDistance(e, lc), 2);
ExampleFree(e);
e = ExampleRead(exampleIn, es);
}
if(e != 0) {
ExampleFree(e);
}
fclose(exampleIn);
}
if(finalOutput) {
printf("%.4f\t0\n", loss);
} else {
if(foundBound) {
printf("%d\t%ld\t%d\t%.6f\t%.6f\t%.2lf\n",
gRound, learnCount, gTotalExamplesSeen,
bound, loss, ((double)learnTime) / 100);
} else {
if(gMessageLevel > 1) {
printf(" No bound, Current bound estimate is %f guarenteed converge %d\n",
_CalculateErrorBound(),
((IterationStatsPtr)VALIndex(gStatsList,
VALLength(gStatsList) - 1))->guarenteeIDConverge);
}
printf("%d\t%ld\t%d\t***\t%.6f\t%.2lf\n",
gRound, learnCount, gTotalExamplesSeen,
loss, ((double)learnTime) / 100);
}
}
fflush(stdout);
if(0) {//gOutputCenters) {
sprintf(fileNames, "%s-%lu.centers", gFileStem, learnCount);
centersOut = fopen(fileNames, "w");
for(i = 0 ; i < VALLength(learnedCenters) ; i++) {
ExampleWrite(VALIndex(learnedCenters, i), centersOut);
// ExampleWrite(VALIndex(learnedCenters, i), stdout);
}
// printf("------------------\n");
fclose(centersOut);
}
}
/*
 * Measure how far every centroid moved between two consecutive iteration
 * records and update the convergence flags stored in `current`.
 *
 * For each centroid i and attribute j the absolute difference between the
 * value in `last` and the value in `current` is squared and summed three
 * ways:
 *   - moved     : sum of squared differences,
 *   - movedLow  : the same sum with each difference first reduced by
 *                 last->errorBound[i][j] + current->errorBound[i][j]
 *                 (clamped at zero),
 *   - movedHigh : the same sum with each difference first enlarged by
 *                 that combined error.
 *
 * Flags set on `current` (never cleared here):
 *   convergeVFKM        when moved     <= gConvergeDelta / 3.0
 *   possibleIDConverge  when movedLow  <= gConvergeDelta
 *   wouldKMConverge     when additionally moved     <= gConvergeDelta
 *   guarenteeIDConverge when additionally movedHigh <= gConvergeDelta
 *
 * Returns current->wouldKMConverge when gBatch or gAllowBadConverge is set;
 * otherwise returns non-zero when guarenteeIDConverge is set, or when both
 * `current` and `last` have convergeVFKM set.
 */
static int _CheckConverganceUpdateStats(IterationStatsPtr last,
                                        IterationStatsPtr current) {
   ExamplePtr prevCenter, curCenter;
   float step;
   float slack;
   float moved, movedLow, movedHigh, movedThisCluster;
   double stepSquared;
   int c, a;

   moved = 0;
   movedLow = 0;
   movedHigh = 0;

   for(c = 0 ; c < VALLength(last->centroids) ; c++) {
      prevCenter = VALIndex(last->centroids, c);
      curCenter = VALIndex(current->centroids, c);
      movedThisCluster = 0;

      for(a = 0 ; a < ExampleGetNumAttributes(curCenter) ; a++) {
         /* HERE fix for discrete */
         step = ExampleGetContinuousAttributeValue(prevCenter, a) -
                ExampleGetContinuousAttributeValue(curCenter, a);
         if(step < 0) {
            step = -step;
         }

         /* combined uncertainty of the two centroid estimates */
         slack = last->errorBound[c][a] + current->errorBound[c][a];

         stepSquared = pow(step, 2);
         moved += stepSquared;
         movedThisCluster += stepSquared;
         movedLow += pow(max(step - slack, 0), 2);
         movedHigh += pow(step + slack, 2);

         if(gMessageLevel > 2) {
            printf("e: %f bnd %f cbnd %f lbnd %f ubnd %f\n",
                   slack, moved, movedThisCluster, movedLow, movedHigh);
         }
      }

      if(gMessageLevel > 0) {
         printf("   cluster %d moved by %f\n", c, movedThisCluster);
      }
   }

   if(gMessageLevel > 0) {
      printf("   clusters moved [ %f - %f - %f ]\n",
             movedLow, moved, movedHigh);
   }

   if(moved <= gConvergeDelta / 3.0) {
      current->convergeVFKM = 1;
   }

   /* the remaining flags only make sense when even the optimistic
      (lower) estimate of the movement is within the threshold */
   if(movedLow <= gConvergeDelta) {
      current->possibleIDConverge = 1;

      if(moved <= gConvergeDelta) {
         current->wouldKMConverge = 1;
      }

      if(movedHigh <= gConvergeDelta) {
         current->guarenteeIDConverge = 1;
      } else if(gMessageLevel > 0) {
         printf("   IDKM may have or may not have converged.\n");
      }
   }

   if(gBatch || gAllowBadConverge) {
      if(gMessageLevel > 0 && gAllowBadConverge && current->wouldKMConverge) {
         printf("   found a potentially bad converge.\n");
      }
      return current->wouldKMConverge;
   }

   return current->guarenteeIDConverge ||
          (current->convergeVFKM && last->convergeVFKM);
}
/*
 * Run one pass of the clustering loop over examples read from `data`,
 * accumulating statistics into the most recent record of gStatsList, then
 * build the next record and report whether the centroids have converged.
 *
 * For every example read:
 *   1. its distance to each current centroid is computed and the nearest
 *      centroid (lowest index on ties) is chosen as the winner;
 *   2. the winner's nHat, wonSum, xMaxSquareSum and xMinSum are updated
 *      from the example's known continuous attributes;
 *   3. every other centroid whose distance is below the winner's distance
 *      plus the two centroids' lastBound values is treated as a possible
 *      alternative winner: its nMinus, xMaxSquareSum, deltaPlus and
 *      deltaMinus are updated;
 *   4. if any such alternative existed, the winner's nPlus, xMinSum,
 *      deltaPlus and deltaMinus are adjusted as well.
 * Unknown and discrete attributes are skipped throughout.
 *
 * Reading stops at end of input or when the per-iteration example limit
 * (gIterationNs / gN / gMaxExamplesPerIteration, depending on gBatch and
 * gFancyStop) is reached.
 *
 * Returns the result of _CheckConverganceUpdateStats() on the old and new
 * records, or 0 when IterationStatsNext() yields no record.
 */
static int _DoClusterIterationDidConverge(FILE *data, ExampleSpecPtr es) {
   IterationStatsPtr stats, nextStats;
   ExamplePtr example, center;
   float *distances;
   float *plus, *minus;
   double *winnerSums;
   float diff;
   long numSeen = 0;
   int finished;
   int contested;
   int winner;
   int c, a;

   stats = VALIndex(gStatsList, VALLength(gStatsList) - 1);
   distances = MNewPtr(sizeof(float) * gNumClusters);

   if(gMessageLevel > 1) {
      printf("enter iteration %d seen %d\n", gIteration, gTotalExamplesSeen);
      fflush(stdout);
   }

   finished = 0;
   /* a new example is only fetched while the stop test has not fired */
   while(!finished && (example = ExampleRead(data, es)) != 0) {
      numSeen++;
      stats->n++;
      gTotalExamplesSeen++;

      /* distance to every centroid, tracking the nearest as we go */
      winner = 0;
      for(c = 0 ; c < VALLength(stats->centroids) ; c++) {
         distances[c] = ExampleDistance(example, VLIndex(stats->centroids, c));
         if(c > 0 && distances[c] < distances[winner]) {
            winner = c;
         }
      }

      /* credit the example to the winning centroid */
      (stats->nHat[winner])++;
      winnerSums = stats->wonSum[winner];
      for(a = 0 ; a < ExampleSpecGetNumAttributes(es) ; a++) {
         if(ExampleIsAttributeUnknown(example, a)) {
            continue;
         }
         if(!ExampleIsAttributeContinuous(example, a)) {
            /* HERE what to do about discrete attributes */
            continue;
         }
         stats->xMaxSquareSum[winner][a] +=
            pow(ExampleGetContinuousAttributeValue(example, a), 2);
         stats->xMinSum[winner][a] +=
            ExampleGetContinuousAttributeValue(example, a);
         winnerSums[a] += ExampleGetContinuousAttributeValue(example, a);
      }

      /* any centroid close enough (within the combined lastBound slack)
         might have been the true winner; record that on its side */
      contested = 0;
      for(c = 0 ; c < VALLength(stats->centroids) ; c++) {
         if(c == winner) {
            continue;
         }
         if(!(distances[c] < distances[winner] +
                    (stats->lastBound[c] + stats->lastBound[winner]))) {
            continue;
         }

         contested = 1;
         center = VALIndex(stats->centroids, c);
         plus = stats->deltaPlus[c];
         minus = stats->deltaMinus[c];
         (stats->nMinus[c])++;

         for(a = 0 ; a < ExampleSpecGetNumAttributes(es) ; a++) {
            if(ExampleIsAttributeUnknown(example, a)) {
               continue;
            }
            if(!ExampleIsAttributeContinuous(example, a)) {
               /* HERE what to do about discrete attributes */
               continue;
            }
            stats->xMaxSquareSum[c][a] +=
               pow(ExampleGetContinuousAttributeValue(example, a), 2);
            diff = ExampleGetContinuousAttributeValue(example, a) -
                   ExampleGetContinuousAttributeValue(center, a);
            if(diff > 0) {
               plus[a] += diff;
            } else {
               minus[a] += -diff;
            }
         }
      }

      /* the winner may not really own this example: update its nPlus
         side (note the sign test is mirrored relative to the rivals) */
      if(contested) {
         center = VALIndex(stats->centroids, winner);
         plus = stats->deltaPlus[winner];
         minus = stats->deltaMinus[winner];
         (stats->nPlus[winner])++;

         for(a = 0 ; a < ExampleSpecGetNumAttributes(es) ; a++) {
            if(ExampleIsAttributeUnknown(example, a)) {
               continue;
            }
            if(!ExampleIsAttributeContinuous(example, a)) {
               /* HERE what to do about discrete attributes */
               continue;
            }
            stats->xMinSum[winner][a] -=
               ExampleGetContinuousAttributeValue(example, a);
            diff = ExampleGetContinuousAttributeValue(example, a) -
                   ExampleGetContinuousAttributeValue(center, a);
            if(diff < 0) {
               plus[a] += -diff;
            } else {
               minus[a] += diff;
            }
         }
      }

      ExampleFree(example);

      /* decide whether this iteration has consumed enough examples */
      if(!gBatch && gFancyStop && gIteration <= gNumIterationNs) {
         /* per-iteration schedule takes precedence when available */
         finished = (numSeen >= gIterationNs[gIteration - 1]);
      } else {
         /* NOTE(review): the cap below uses '>' while the other limits use
            '>=', so one extra example is processed -- confirm intended */
         finished = (!gBatch && numSeen >= gN) ||
                    (numSeen > gMaxExamplesPerIteration);
      }
   }

   MFreePtr(distances);

   if(gMessageLevel > 1) {
      IterationStatsWrite(stats, stdout);
   }

   nextStats = IterationStatsNext(stats, gNeededDelta, 1.0,
                                  gAssignErrorScale, es);
   /* NOTE(review): the record is appended before it is checked for null;
      confirm that later readers of gStatsList tolerate that */
   VALAppend(gStatsList, nextStats);

   if(gMessageLevel > 1) {
      printf("exit iteration %d seen %d\n", gIteration, gTotalExamplesSeen);
      fflush(stdout);
   }

   if(!nextStats) {
      /* we didn't converge, but this round will be stopped by
         the foundBound of 0 */
      return 0;
   }

   return _CheckConverganceUpdateStats(stats, nextStats);
}
static ExamplePtr _PickInitalCentroid(ExampleSpecPtr es, VoidAListPtr centroids, FILE *data) {
float minDistance;
int done = 0;
ExamplePtr e;
int used;
int i;
//minDistance = gR / ((float)gNumClusters * 4);
minDistance = gR / ((float)gNumClusters * 2);
//minDistance = 0.00001;
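/*
 * Code-viewer keyboard shortcuts (page residue, not part of vfkm.c):
 *   Copy code         Ctrl + C
 *   Search code       Ctrl + F
 *   Full-screen mode  F11
 *   Toggle theme      Ctrl + Shift + D
 *   Show shortcuts    ?
 *   Increase font     Ctrl + =
 *   Decrease font     Ctrl + -
 */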