📄 vfkm.c
字号:
#include "vfml.h"
#include "vfkm-engine.h"
#include <stdio.h>
#include <string.h>
#include <sys/times.h>
#include <time.h>
#include <math.h>
char *gFileStem = "DF";
char *gSourceDirectory = ".";
int gDoTests = 0;
int gMessageLevel = 0;
int gNumClusters = -1;
float gDelta = 0.05;
float gErrorTarget = 0.05;
float gThisErrorTarget = 0.05;
int gEstimatedNumIterations = 5;
int gBatch = 0;
long gMaxExamplesPerIteration = 1000000000;
int gInitNumber = 1;
int gFancyStop = 1;
float gConvergeDelta = 0.001;
int gSeed = -1;
float gR = 0;
float gAssignErrorScale = 1.0;
int gAllowBadConverge = 0;
int gUseNormalApprox = 0;
long gDBSize = 0;
int gStdin = 0;
int gIterativeResults = 0;
int gTestOnTrain = 0; /* default test total center-matching distance */
/* HERE add parameters for these */
int gDoEarlyStop = 0;
int gLoadCenters = 0;
int gReassignCentersEachRound = 0;
/* HERE some hackish globals & stuff*/
VoidAListPtr gStatsList;
int gD;
int gIteration = 0;
int gRound = 0;
int gTotalExamplesSeen = 0;
float gNeededDelta;
float gTargetEkd;
long gN;
float *gIterationNs = 0;
int gNumIterationNs;
static void _printUsage(char *name) {
printf("%s : Very Fast k-means clustering\n", name);
printf("-f <filestem>\t Set the name of the dataset (default DF)\n");
printf("-source <dir>\t Set the source data directory (default '.')\n");
printf("-clusters <numClusters>\t Set the num clusters to find (REQUIRED)\n");
printf("-dbSize <examples>\t How many examples are in the datafile (REQUIRED)\n");
printf("-assignErrorScale <scale>\t Loosen bound by scaling the assignment part\n\t\t of it by the scale factor (default 1.0)\n");
printf("-delta <delta>\t Find final solution with confidence (1 - <delta>) (default 5%%)\n");
printf("-epsilon <epsilon>\t The maximum distance between a learned centroid and\n\t\t the coresponding infinite data centroid (default .05)\n");
printf("-converge <distance>\t Stop when distance centroids move between iterations\n\t\t squared is less than <distance> (default .001)\n");
printf("-batch\t Do a traditional kmeans run on the whole input file. Doesn't make\n\t\t sense to combine with -stdin (default off)\n");
printf("-init <num>\t use the <num>th valid assignment of initial centroids\n\t\t (default 1)\n");
printf("-maxPerIteration <num> \t use no more than num examples per iteration\n\t\t (default 1,000,000,000)\n");
printf("-l <number>\t The estimated # of iterations to converge if you guess\n\t\t wrong and aren't using batch VFKM will fix it for you\n\t\t and do an extra round (default 10)\n");
printf("-seed <s>\t Seed to use picking initial centers (default random)\n");
printf("-progressive\t Don't use our fancy Lagrange based optimization pick the\n\t\t # of samples at each iteration of the next \n\t\tround (default use the optimization)\n");
printf("-allowBadConverge\t Allows VFKM to converge at a time when km would\n\t\tnot (default off).\n");
printf("-normalApprox\t use a normal approximation instead of the hoeffding bound\n\t\t (default hoeffding).\n");
printf("-r <range>\t The maximum range between pairs of examples (default\n\t\t assume the range of each dimension is 0 - 1.0\n\t\t and calculate it from that)\n");
printf("-stdin \t\t Reads training examples as a stream from stdin instead of from \n\t\t <stem>.data (default off)\n");
printf("-testOnTrain \t loss is the sum of the square of the distances \n\t\t from all training examples to their assigned centroid (default\n\t\t compare learned centroids to centroids in <stem>.test)\n");
printf("-loadCenters \t load initial centroids from <stem>.centers (default use random based on -init and -seed)\n");
printf("-u\t\t Output results after each iteration\n");
printf("-v\t\t Can be used multiple times to increase the debugging output\n");
}
static void _processArgs(int argc, char *argv[]) {
int i;
/* HERE on the ones that use the next arg make sure it is there */
for(i = 1 ; i < argc ; i++) {
if(!strcmp(argv[i], "-f")) {
gFileStem = argv[i+1];
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-source")) {
gSourceDirectory = argv[i+1];
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-u")) {
gDoTests = 1;
} else if(!strcmp(argv[i], "-testOnTrain")) {
gTestOnTrain = 1;
} else if(!strcmp(argv[i], "-clusters")) {
sscanf(argv[i+1], "%d", &gNumClusters);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-dbSize")) {
sscanf(argv[i+1], "%ld", &gDBSize);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-delta")) {
sscanf(argv[i+1], "%f", &gDelta);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-converge")) {
sscanf(argv[i+1], "%f", &gConvergeDelta);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-r")) {
sscanf(argv[i+1], "%f", &gR);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-assignErrorScale")) {
sscanf(argv[i+1], "%f", &gAssignErrorScale);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-epsilon")) {
sscanf(argv[i+1], "%f", &gErrorTarget);
gThisErrorTarget = gErrorTarget;
/* ignore the next argument */
i++;
// } else if(!strcmp(argv[i], "-n")) {
// sscanf(argv[i+1], "%ld", &gN);
// /* ignore the next argument */
// i++;
} else if(!strcmp(argv[i], "-maxPerIteration")) {
sscanf(argv[i+1], "%ld", &gMaxExamplesPerIteration);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-loadCenters")) {
gLoadCenters = 1;
} else if(!strcmp(argv[i], "-batch")) {
gBatch = 1;
} else if(!strcmp(argv[i], "-allowBadConverge")) {
gAllowBadConverge = 1;
} else if(!strcmp(argv[i], "-normalApprox")) {
gUseNormalApprox = 1;
} else if(!strcmp(argv[i], "-progressive")) {
gFancyStop = 0;
} else if(!strcmp(argv[i], "-init")) {
sscanf(argv[i+1], "%d", &gInitNumber);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-l")) {
sscanf(argv[i+1], "%d", &gEstimatedNumIterations);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-seed")) {
sscanf(argv[i+1], "%d", &gSeed);
/* ignore the next argument */
i++;
} else if(!strcmp(argv[i], "-v")) {
gMessageLevel++;
} else if(!strcmp(argv[i], "-h")) {
_printUsage(argv[0]);
exit(0);
} else if(!strcmp(argv[i], "-stdin")) {
gStdin = 1;
} else {
printf("Unknown argument: %s. use -h for help\n", argv[i]);
exit(0);
}
}
if(gNumClusters < 0) {
printf("Didn't supply the required '-clusters' argument\n");
exit(0);
}
if(gDBSize <= 0) {
printf("Didn't supply the required '-dbSize' argument\n");
exit(0);
}
if(gMessageLevel >= 1) {
printf("Stem: %s\n", gFileStem);
printf("Source: %s\n", gSourceDirectory);
if(gStdin) {
printf("Reading examples from standard in.\n");
}
if(gDoTests) {
printf("Running tests\n");
}
printf("DB Size %ld\n", gDBSize);
}
}
static float _MatchCentersGetDistanceSquare(VoidAListPtr learnedCenters, VoidAListPtr testCenters) {
/* perfom a 1 to 1 matching between the learned and test centers.
change the class lables of the learnedCenters to correspond to
the closest testCenters. Do a greedy assignment that tries to
minimize the total distance between assigned pairs */
int i, j, init;
double bestDistance, thisDistance;
int lBestIndex, tBestIndex;
VoidAListPtr unLearned = VALNew();
VoidAListPtr unTest = VALNew();
float totalDistance = 0.0;
for(i = 0 ; i < VALLength(learnedCenters) ; i++) {
VALAppend(unLearned, VALIndex(learnedCenters, i));
}
for(i = 0 ; i < VALLength(testCenters) ; i++) {
VALAppend(unTest, VALIndex(testCenters, i));
}
bestDistance = lBestIndex = tBestIndex = 0;
while(VALLength(unLearned) > 0) {
init = 0;
for(i = 0 ; i < VALLength(unLearned) ; i++) {
for(j = 0 ; j < VALLength(unTest) ; j++) {
thisDistance = ExampleDistance(VALIndex(unLearned, i),
VALIndex(unTest, j));
if(thisDistance < bestDistance || !init) {
init = 1;
bestDistance = thisDistance;
lBestIndex = i;
tBestIndex = j;
}
}
}
if(gMessageLevel > 2) {
printf("\tMatched cluster %d distance %lf\n", ExampleGetClass(VALIndex(unTest, tBestIndex)), ExampleDistance(VALIndex(unLearned, lBestIndex), VALIndex(unTest, tBestIndex)));
}
totalDistance += bestDistance * bestDistance;
ExampleSetClass(VALRemove(unLearned, lBestIndex),
ExampleGetClass(VALRemove(unTest, tBestIndex)));
}
VALFree(unLearned);
VALFree(unTest);
return totalDistance;
}
static int _FindClosestCenterIndex(ExamplePtr e, VoidAListPtr centers) {
int i;
double bestDistance, thisDistance;
int bestIndex;
bestIndex = 0;
bestDistance = ExampleDistance(e, VLIndex(centers, 0));
for(i = 1 ; i < VLLength(centers) ; i++) {
thisDistance = ExampleDistance(e, VLIndex(centers, i));
if(thisDistance < bestDistance) {
bestDistance = thisDistance;
bestIndex = i;
}
}
return bestIndex;
}
static ExamplePtr _FindClosestCenter(ExamplePtr e, VoidAListPtr centers) {
return VALIndex(centers, _FindClosestCenterIndex(e, centers));
}
static float _CalculateIDKMEarlyBound(void) {
int i, j, k;
float maxError, thisError, bound, cError;
IterationStatsPtr isL, isI;
ExamplePtr eL, eI;
isL = VALIndex(gStatsList, VALLength(gStatsList) - 1);
maxError = 0;
for(i = 0 ; i < VALLength(gStatsList) ; i++) {
isI = VALIndex(gStatsList, i);
if(isI->possibleIDConverge) {
thisError = 0;
for(j = 0 ; j < VALLength(isI->centroids) ; j++) {
eL = VALIndex(isL->centroids, j);
eI = VALIndex(isI->centroids, j);
cError = 0;
for(k = 0 ; k < ExampleGetNumAttributes(eL) ; k++) {
/* HERE fix for discrete */
bound = ExampleGetContinuousAttributeValue(eL, k) -
ExampleGetContinuousAttributeValue(eI, k) +
isI->errorBound[j][k];
cError += pow(bound, 2);
}
thisError += cError;
//printf("i%d c%d cError %f totalError %f\n",
// i, j, cError, thisError);
}
if(thisError > maxError) {
maxError = thisError;
}
}
}
return maxError;
}
static float _CalculateErrorBound(void) {
int i;
float maxError = 0;
float earlyError;
IterationStatsPtr isL;
isL = VALIndex(gStatsList, VALLength(gStatsList) - 1);
/* find the error from the final iteration */
for(i = 0 ; i < VALLength(isL->centroids) ; i++) {
if(gMessageLevel > 2) {
printf("max ekd %.4f max error %.4f error%d %.4f error%d^2 %.4f\n",
isL->maxEkd, maxError, i, isL->lastBound[i], i,
pow(isL->lastBound[i], 2));
}
maxError += pow(isL->lastBound[i], 2);
}
if(!gAllowBadConverge) {
earlyError = _CalculateIDKMEarlyBound();
if(earlyError > maxError) {
if(gMessageLevel > 1) {
printf("early stop error (%.5f) greater than El (%.5f)\n",
earlyError, maxError);
}
maxError = earlyError;
}
}
return maxError;
}
static int _WhenKMConverged() {
int i;
IterationStatsPtr is;
for(i = 0 ; i < VLLength(gStatsList) ; i++) {
is = VLIndex(gStatsList, i);
if(is->wouldKMConverge) {
return i;
}
}
return VLLength(gStatsList) - 1;
}
static void _doTests(ExampleSpecPtr es, VoidAListPtr learnedCenters, long learnCount, long learnTime, int finalOutput) {
char fileNames[255];
FILE *exampleIn, *testCentersIn, *centersOut;
ExamplePtr e, tc, lc;
VoidAListPtr testCenters;
long i;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -